In [1]:
# Import dependencies
import pandas as pd

In [2]:
# Read the csv and perform basic data cleaning
bean_df = pd.read_csv("Resources/Dry_Bean_Dataset.csv")
# Drop the null columns where all values are null
bean_df = bean_df.dropna(axis='columns', how='all')
# Drop the null rows
bean_df = bean_df.dropna()
bean_df

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172,191.272751,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.998430,SEKER
2,29380,624.110,212.826130,175.931143,1.209713,0.562727,29690,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,SEKER
3,30008,645.884,210.557999,182.516516,1.153638,0.498616,30724,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,SEKER
4,30140,620.134,201.847882,190.279279,1.060798,0.333680,30417,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.941900,0.999166,SEKER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13606,42097,759.696,288.721612,185.944705,1.552728,0.765002,42508,231.515799,0.714574,0.990331,0.916603,0.801865,0.006858,0.001749,0.642988,0.998385,DERMASON
13607,42101,757.499,281.576392,190.713136,1.476439,0.735702,42494,231.526798,0.799943,0.990752,0.922015,0.822252,0.006688,0.001886,0.676099,0.998219,DERMASON
13608,42139,759.321,281.539928,191.187979,1.472582,0.734065,42569,231.631261,0.729932,0.989899,0.918424,0.822730,0.006681,0.001888,0.676884,0.996767,DERMASON
13609,42147,763.779,283.382636,190.275731,1.489326,0.741055,42667,231.653248,0.705389,0.987813,0.907906,0.817457,0.006724,0.001852,0.668237,0.995222,DERMASON


In [3]:
# Identify labels (y-value)
y = bean_df["Class"]
# y

In [4]:
# Drop the y-label to use for the test
X = bean_df.drop("Class", axis=1)
feature_names = X.columns
# data.head()
print(X.shape, y.shape)

(13611, 16) (13611,)


In [5]:
# Create a train-test-split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [6]:
# Create a Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
print(f"Training Data Score: {rf.score(X_train, y_train)}")
print(f"Testing Data Score: {rf.score(X_test, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.9212459594475463


In [7]:
# Make predictions
predictions = rf.predict(X_test)
print(f"First 10 Predictions: {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions: ['SEKER' 'BARBUNYA' 'SEKER' 'SEKER' 'DERMASON' 'SEKER' 'CALI' 'SEKER'
 'BOMBAY' 'DERMASON']
First 10 Actual labels: ['SEKER', 'BARBUNYA', 'SEKER', 'SEKER', 'DERMASON', 'SEKER', 'CALI', 'SEKER', 'BOMBAY', 'DERMASON']


In [8]:
# Show dataframe
final_bean_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
final_bean_df.head()

Unnamed: 0,Prediction,Actual
0,SEKER,SEKER
1,BARBUNYA,BARBUNYA
2,SEKER,SEKER
3,SEKER,SEKER
4,DERMASON,DERMASON


In [9]:
# Calculate feature importance
importances = rf.feature_importances_
importances

array([0.06484564, 0.09215949, 0.07603069, 0.09063226, 0.06170369,
       0.05434532, 0.06865415, 0.05901753, 0.01157556, 0.01745516,
       0.05616848, 0.09237766, 0.08022837, 0.04977492, 0.0951067 ,
       0.02992437])

In [10]:
# Sort the features by their importance
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.09510669633425918, 'ShapeFactor3'),
 (0.09237765839506995, 'Compactness'),
 (0.09215948788677406, 'Perimeter'),
 (0.0906322622538386, 'MinorAxisLength'),
 (0.08022836703339033, 'ShapeFactor1'),
 (0.07603069083744825, 'MajorAxisLength'),
 (0.06865415211738095, 'ConvexArea'),
 (0.06484563570944307, 'Area'),
 (0.0617036894877331, 'AspectRation'),
 (0.059017532501438694, 'EquivDiameter'),
 (0.056168484546275384, 'roundness'),
 (0.054345318544251266, 'Eccentricity'),
 (0.04977492390601329, 'ShapeFactor2'),
 (0.029924374675582996, 'ShapeFactor4'),
 (0.017455163415411326, 'Solidity'),
 (0.011575562355689595, 'Extent')]

In [11]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=["seker", "barbunya", "horoz", "sira", "demason", "bombay", "cali"]))

              precision    recall  f1-score   support

       seker       0.93      0.90      0.92       344
    barbunya       1.00      1.00      1.00       142
       horoz       0.93      0.93      0.93       402
        sira       0.90      0.92      0.91       851
     demason       0.97      0.96      0.96       486
      bombay       0.95      0.94      0.94       513
        cali       0.87      0.87      0.87       665

    accuracy                           0.92      3403
   macro avg       0.93      0.93      0.93      3403
weighted avg       0.92      0.92      0.92      3403

