In [1]:
# Import dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 

In [2]:
# Read the csv and perform basic data cleaning
bean_df = pd.read_csv("Resources/Dry_Bean_Dataset.csv")
# Drop the null columns where all values are null
bean_df = bean_df.dropna(axis='columns', how='all')
# Drop the null rows
bean_df = bean_df.dropna()
bean_df

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172,191.272751,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.998430,SEKER
2,29380,624.110,212.826130,175.931143,1.209713,0.562727,29690,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,SEKER
3,30008,645.884,210.557999,182.516516,1.153638,0.498616,30724,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,SEKER
4,30140,620.134,201.847882,190.279279,1.060798,0.333680,30417,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.941900,0.999166,SEKER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13606,42097,759.696,288.721612,185.944705,1.552728,0.765002,42508,231.515799,0.714574,0.990331,0.916603,0.801865,0.006858,0.001749,0.642988,0.998385,DERMASON
13607,42101,757.499,281.576392,190.713136,1.476439,0.735702,42494,231.526798,0.799943,0.990752,0.922015,0.822252,0.006688,0.001886,0.676099,0.998219,DERMASON
13608,42139,759.321,281.539928,191.187979,1.472582,0.734065,42569,231.631261,0.729932,0.989899,0.918424,0.822730,0.006681,0.001888,0.676884,0.996767,DERMASON
13609,42147,763.779,283.382636,190.275731,1.489326,0.741055,42667,231.653248,0.705389,0.987813,0.907906,0.817457,0.006724,0.001852,0.668237,0.995222,DERMASON


In [3]:
# Identify labels (y-value)
y = bean_df["Class"]
# y

In [4]:
# Drop the y-label to use for the test
X = bean_df.drop("Class", axis=1)
feature_names = X.columns
# X.head()
print(X.shape, y.shape)

(13611, 16) (13611,)


In [5]:
# Create a train-test-split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [6]:
# Create a Decision Tree Classifier
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
print(f"Decision Tree Training Data Score: {clf.score(X_train, y_train)}")
print(f"Decision Tree Testing Data Score: {clf.score(X_test, y_test)}")

Decision Tree Training Data Score: 1.0
Decision Tree Testing Data Score: 0.8971495739053776


In [8]:
# Calculate feature importances for Decision Tree
importances_dt = clf.feature_importances_
importances_dt

array([0.01230326, 0.15874964, 0.18435676, 0.07370809, 0.00454505,
       0.00608872, 0.00915947, 0.00488419, 0.0149879 , 0.01952707,
       0.0626548 , 0.21426094, 0.18042758, 0.00568701, 0.01714479,
       0.03151474])

In [9]:
# Sort the features by their importance
sorted(zip(clf.feature_importances_, feature_names), reverse=True)

[(0.21426094171857188, 'Compactness'),
 (0.18435676217521973, 'MajorAxisLength'),
 (0.18042757547329477, 'ShapeFactor1'),
 (0.15874963965866176, 'Perimeter'),
 (0.07370809031411647, 'MinorAxisLength'),
 (0.0626547971719066, 'roundness'),
 (0.03151474493157733, 'ShapeFactor4'),
 (0.019527069110310026, 'Solidity'),
 (0.017144785004604316, 'ShapeFactor3'),
 (0.014987898144235113, 'Extent'),
 (0.012303262891550716, 'Area'),
 (0.009159471103753234, 'ConvexArea'),
 (0.00608872154062706, 'Eccentricity'),
 (0.005687006818579612, 'ShapeFactor2'),
 (0.004884188003577489, 'EquivDiameter'),
 (0.004545045939413969, 'AspectRation')]

In [10]:
# Create a Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
print(f"Training Data Score: {rf.score(X_train, y_train)}")
print(f"Testing Data Score: {rf.score(X_test, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.9215398178078167


In [11]:
# Make predictions with Random Forest Classifier
predictions = rf.predict(X_test)
print(f"First 10 Predictions: {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions: ['SEKER' 'BARBUNYA' 'SEKER' 'SEKER' 'DERMASON' 'SEKER' 'CALI' 'SEKER'
 'BOMBAY' 'DERMASON']
First 10 Actual labels: ['SEKER', 'BARBUNYA', 'SEKER', 'SEKER', 'DERMASON', 'SEKER', 'CALI', 'SEKER', 'BOMBAY', 'DERMASON']


In [12]:
# Show dataframe
final_bean_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
final_bean_df.head(20)

Unnamed: 0,Prediction,Actual
0,SEKER,SEKER
1,BARBUNYA,BARBUNYA
2,SEKER,SEKER
3,SEKER,SEKER
4,DERMASON,DERMASON
5,SEKER,SEKER
6,CALI,CALI
7,SEKER,SEKER
8,BOMBAY,BOMBAY
9,DERMASON,DERMASON


In [13]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=["seker", "barbunya", "dermason", "cali", "bombay", "sira", "horoz"]))

              precision    recall  f1-score   support

       seker       0.93      0.89      0.91       344
    barbunya       1.00      1.00      1.00       142
    dermason       0.92      0.94      0.93       402
        cali       0.90      0.93      0.91       851
      bombay       0.97      0.96      0.96       486
        sira       0.94      0.94      0.94       513
       horoz       0.88      0.86      0.87       665

    accuracy                           0.92      3403
   macro avg       0.93      0.93      0.93      3403
weighted avg       0.92      0.92      0.92      3403



In [14]:
# Calculate feature importance
importances = rf.feature_importances_
importances

array([0.0633998 , 0.10345877, 0.07481604, 0.09283114, 0.06145069,
       0.0683995 , 0.0696753 , 0.05804528, 0.0113787 , 0.01751397,
       0.05341949, 0.08995063, 0.07801882, 0.04040014, 0.0870535 ,
       0.03018825])

In [15]:
# Sort the features by their importance
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.1034587718784712, 'Perimeter'),
 (0.09283113679255182, 'MinorAxisLength'),
 (0.08995063069777127, 'Compactness'),
 (0.0870535039054199, 'ShapeFactor3'),
 (0.07801881567456453, 'ShapeFactor1'),
 (0.07481603964692894, 'MajorAxisLength'),
 (0.06967530347645484, 'ConvexArea'),
 (0.06839949556151931, 'Eccentricity'),
 (0.06339979683959877, 'Area'),
 (0.06145068812780765, 'AspectRation'),
 (0.058045276891591176, 'EquivDiameter'),
 (0.05341949147599749, 'roundness'),
 (0.04040013796745037, 'ShapeFactor2'),
 (0.030188245735295532, 'ShapeFactor4'),
 (0.01751396980838185, 'Solidity'),
 (0.011378695520195421, 'Extent')]