<a href="https://colab.research.google.com/github/pemba007/dry-bean-ml/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [115]:
# Installing required packages if required
!pip install scikit-plot
!pip install hmmlearn



In [116]:
# Ignoring warnings
import warnings
warnings.filterwarnings(action='ignore')

In [117]:
# Importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [118]:
# Reading the dataset
df = pd.read_excel('./Dry_Bean_Dataset.xlsx')

In [119]:
df.shape

(13611, 17)

In [120]:
# Checking the dataset
df.describe()

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4
count,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0
mean,53048.284549,855.283459,320.141867,202.270714,1.583242,0.750895,53768.200206,253.06422,0.749733,0.987143,0.873282,0.799864,0.006564,0.001716,0.64359,0.995063
std,29324.095717,214.289696,85.694186,44.970091,0.246678,0.092002,29774.915817,59.17712,0.049086,0.00466,0.05952,0.061713,0.001128,0.000596,0.098996,0.004366
min,20420.0,524.736,183.601165,122.512653,1.024868,0.218951,20684.0,161.243764,0.555315,0.919246,0.489618,0.640577,0.002778,0.000564,0.410339,0.947687
25%,36328.0,703.5235,253.303633,175.84817,1.432307,0.715928,36714.5,215.068003,0.718634,0.98567,0.832096,0.762469,0.0059,0.001154,0.581359,0.993703
50%,44652.0,794.941,296.883367,192.431733,1.551124,0.764441,45178.0,238.438026,0.759859,0.988283,0.883157,0.801277,0.006645,0.001694,0.642044,0.996386
75%,61332.0,977.213,376.495012,217.031741,1.707109,0.810466,62294.0,279.446467,0.786851,0.990013,0.916869,0.83427,0.007271,0.00217,0.696006,0.997883
max,254616.0,1985.37,738.860153,460.198497,2.430306,0.911423,263261.0,569.374358,0.866195,0.994677,0.990685,0.987303,0.010451,0.003665,0.974767,0.999733


In [121]:
y = df['Class']
df.drop(columns=['Class'], inplace=True)

# Preprocessing

## Normalization

In [122]:
# Columns for normalization
from sklearn.preprocessing import Normalizer

normalizeFeatures = ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength', 'AspectRation', 'ConvexArea', 'EquivDiameter']

## Skewness

In [123]:
df.skew()

Area               2.952931
Perimeter          1.626124
MajorAxisLength    1.357815
MinorAxisLength    2.238211
AspectRation       0.582573
Eccentricity      -1.062824
ConvexArea         2.941821
EquivDiameter      1.948958
Extent            -0.895348
Solidity          -2.550093
roundness         -0.635749
Compactness        0.037115
ShapeFactor1      -0.534141
ShapeFactor2       0.301226
ShapeFactor3       0.242481
ShapeFactor4      -2.759483
dtype: float64

In [124]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

ct = ColumnTransformer([("norm", Normalizer(norm='l1'), normalizeFeatures)])

pipe = Pipeline([('normalization', ct), ('pca', PCA())])

In [125]:
X = pipe.fit_transform(df)

In [126]:
type(X)

numpy.ndarray

In [127]:
# Label Encoding the output variable
from sklearn.preprocessing import LabelEncoder

y = LabelEncoder().fit_transform(y)

In [128]:
X.shape

(13611, 7)

In [129]:
y.shape

(13611,)

# Training Model

In [130]:
# Defining function to get the data of particular class

def getClassData(cls, dataframe, y):
  # Getting the indexes for specific class
  indexes = np.where(y == cls)
  return dataframe[indexes], y[indexes]

In [131]:
# Getting the predicted values

def getPredictions(predictions):
  output_classes = list()

  # Looping for features
  # Calculating probailities

  probabilities = list()
  for x in range(len(predictions[0])):
    all_proba = list()
    # Adding all probabilities for each classes
    all_proba.append(predictions[0][x])
    all_proba.append(predictions[1][x])
    all_proba.append(predictions[2][x])
    all_proba.append(predictions[3][x])
    all_proba.append(predictions[4][x])
    all_proba.append(predictions[5][x])
    all_proba.append(predictions[6][x])
    max_value = max(all_proba)
    max_index = all_proba.index(max_value)
    output_classes.append(max_index)
  return output_classes

In [132]:
# Get probabilities for the class
def getProbabilities(prob, classIndex):
  info = {
      0: 0,
      1: 1,
      2: 1,
      3: 1,
      4: 1,
      5: 0,
      6: 1
  }
  actual_prob = list()
  for x in prob:
    actual_prob.append(x[info[classIndex]])
  return actual_prob

In [133]:
import scikitplot as skplt
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc, roc_auc_score, multilabel_confusion_matrix, accuracy_score
from hmmlearn import hmm

skf = StratifiedKFold(n_splits=5)

fold = 0

actual_classes = list()
predicted_classes = list()

for train_index, test_index in skf.split(X, y):
  fold += 1
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]

  print(f"Fold {fold}")

  probabilities = list()

  # Handling classifiers for each classes
  for classIndex in range(7):
    print("ClassIndex", classIndex)
    classValues_train, y_values = getClassData(classIndex, X_train, y_train)
    model = hmm.GaussianHMM(n_components = 2, algorithm="viterbi", covariance_type = "diag", n_iter = 500)
    model.fit(classValues_train)
    proba = model.predict_proba(X_test)    
    probs = getProbabilities(proba, classIndex)
    probabilities.append(probs)

  final_predictions = getPredictions(probabilities)
  final_predictions = np.array(final_predictions)
  print(f"Accuracy for fold {fold} - ", accuracy_score(y_test, final_predictions) * 100, "%")

Fold 1
ClassIndex 0
ClassIndex 1
ClassIndex 2
ClassIndex 3
ClassIndex 4
ClassIndex 5
ClassIndex 6
Accuracy for fold 1 -  3.8560411311053984 %
Fold 2
ClassIndex 0
ClassIndex 1
ClassIndex 2
ClassIndex 3
ClassIndex 4
ClassIndex 5
ClassIndex 6
Accuracy for fold 2 -  3.8207200587803087 %
Fold 3
ClassIndex 0
ClassIndex 1
ClassIndex 2
ClassIndex 3
ClassIndex 4
ClassIndex 5
ClassIndex 6
Accuracy for fold 3 -  9.73548861131521 %
Fold 4
ClassIndex 0
ClassIndex 1
ClassIndex 2
ClassIndex 3
ClassIndex 4
ClassIndex 5
ClassIndex 6
Accuracy for fold 4 -  9.73548861131521 %
Fold 5
ClassIndex 0
ClassIndex 1
ClassIndex 2
ClassIndex 3
ClassIndex 4
ClassIndex 5
ClassIndex 6
Accuracy for fold 5 -  9.698750918442322 %
