<a href="https://colab.research.google.com/github/pemba007/dry-bean-ml/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [525]:
# Installing required packages if required
# !pip install scikit-plot
# !pip install hmmlearn
# !pip install sequentia

In [526]:
# Ignoring warnings
import warnings
warnings.filterwarnings(action='ignore')

In [527]:
# Importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [528]:
# Reading the dataset
df = pd.read_excel('./Dry_Bean_Dataset.xlsx')

In [529]:
df.shape

(13611, 17)

In [530]:
# Checking the dataset
df.describe()

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4
count,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0
mean,53048.284549,855.283459,320.141867,202.270714,1.583242,0.750895,53768.200206,253.06422,0.749733,0.987143,0.873282,0.799864,0.006564,0.001716,0.64359,0.995063
std,29324.095717,214.289696,85.694186,44.970091,0.246678,0.092002,29774.915817,59.17712,0.049086,0.00466,0.05952,0.061713,0.001128,0.000596,0.098996,0.004366
min,20420.0,524.736,183.601165,122.512653,1.024868,0.218951,20684.0,161.243764,0.555315,0.919246,0.489618,0.640577,0.002778,0.000564,0.410339,0.947687
25%,36328.0,703.5235,253.303633,175.84817,1.432307,0.715928,36714.5,215.068003,0.718634,0.98567,0.832096,0.762469,0.0059,0.001154,0.581359,0.993703
50%,44652.0,794.941,296.883367,192.431733,1.551124,0.764441,45178.0,238.438026,0.759859,0.988283,0.883157,0.801277,0.006645,0.001694,0.642044,0.996386
75%,61332.0,977.213,376.495012,217.031741,1.707109,0.810466,62294.0,279.446467,0.786851,0.990013,0.916869,0.83427,0.007271,0.00217,0.696006,0.997883
max,254616.0,1985.37,738.860153,460.198497,2.430306,0.911423,263261.0,569.374358,0.866195,0.994677,0.990685,0.987303,0.010451,0.003665,0.974767,0.999733


In [531]:
y = df['Class']
df.drop(columns=['Class'], inplace=True)

# Preprocessing

## Normalization

In [532]:
# Columns for normalization
from sklearn.preprocessing import Normalizer

normalizeFeatures = ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength', 'AspectRation', 'ConvexArea', 'EquivDiameter']

## Skewness

In [533]:
df.skew()

Area               2.952931
Perimeter          1.626124
MajorAxisLength    1.357815
MinorAxisLength    2.238211
AspectRation       0.582573
Eccentricity      -1.062824
ConvexArea         2.941821
EquivDiameter      1.948958
Extent            -0.895348
Solidity          -2.550093
roundness         -0.635749
Compactness        0.037115
ShapeFactor1      -0.534141
ShapeFactor2       0.301226
ShapeFactor3       0.242481
ShapeFactor4      -2.759483
dtype: float64

In [534]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

ct = ColumnTransformer([("norm", Normalizer(norm='l1'), normalizeFeatures)])

pipe = Pipeline([('normalization', ct), ('pca', PCA())])

In [535]:
X = pipe.fit_transform(df)

In [536]:
# Label Encoding the output variable
from sklearn.preprocessing import LabelEncoder

y = LabelEncoder().fit_transform(y)

In [537]:
X.shape

(13611, 7)

# Training Model

In [538]:
# Defining function to get the data of particular class

def getClassData(cls, dataframe, y):
  # print('Given class', dataframe)
  # Getting the indexes for specific class
  indexes = np.where(y == cls)
  # print("Indexes", indexes)
  return dataframe[indexes]

In [539]:
import scikitplot as skplt
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc, roc_auc_score, multilabel_confusion_matrix, accuracy_score
from hmmlearn import hmm

from itertools import cycle

# Sequentia
from sequentia.classifiers import GMMHMM, HMMClassifier

skf = StratifiedKFold(n_splits=2)

fold = 0

actual_classes = list()
predicted_classes = list()

for train_index, test_index in skf.split(X, y):
  fold += 1
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]

  # Handling classifiers for each classes
  hmms = list()
  for classIndex in range(7):
    classValues_train = getClassData(classIndex, X_train, y_train)
    # print("classValues", classValues_train)
    model = hmm.GaussianHMM(n_components=3, covariance_type="full")
    model.fit(classValues_train)
    proba = model.predict(X_test)
    print("Xtrain", X_test[0])
    print("probability is", proba)

Xtrain [-3.53112448e-03  1.03124442e-04  4.72785555e-04 -1.62106582e-05
  1.66227273e-05  3.82456331e-07 -4.48186806e-17]
probability is [2 2 2 ... 2 2 2]
Xtrain [-3.53112448e-03  1.03124442e-04  4.72785555e-04 -1.62106582e-05
  1.66227273e-05  3.82456331e-07 -4.48186806e-17]
probability is [1 1 1 ... 1 1 1]
Xtrain [-3.53112448e-03  1.03124442e-04  4.72785555e-04 -1.62106582e-05
  1.66227273e-05  3.82456331e-07 -4.48186806e-17]
probability is [1 1 1 ... 1 1 1]
Xtrain [-3.53112448e-03  1.03124442e-04  4.72785555e-04 -1.62106582e-05
  1.66227273e-05  3.82456331e-07 -4.48186806e-17]
probability is [1 1 1 ... 0 0 0]
Xtrain [-3.53112448e-03  1.03124442e-04  4.72785555e-04 -1.62106582e-05
  1.66227273e-05  3.82456331e-07 -4.48186806e-17]
probability is [1 1 1 ... 1 1 1]
Xtrain [-3.53112448e-03  1.03124442e-04  4.72785555e-04 -1.62106582e-05
  1.66227273e-05  3.82456331e-07 -4.48186806e-17]
probability is [2 1 1 ... 1 1 1]
Xtrain [-3.53112448e-03  1.03124442e-04  4.72785555e-04 -1.62106582e-0