* Train a Logistic regression model for multi-class classification on the accompanied dataset (Dry bean Dataset)
* The zipped file contains both the dataset as an excel file and the dataset description 
* Notes:
    * Use "pd.read_excel()" method => to read an excel file into a dataframe object
    * Split the training data into 67% and the testing data into 33%
    * Train the logistic regression model for 250 iterations on the training data
    * Show the trained model accuracy on the testing data
    * Achieve an Accuray >= 90%

# Imports

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

# Logistic Regression model training  

In [3]:
data = pd.read_excel("Dry_Bean_Dataset.xlsx",header = None)
data.columns = ['Area(A)', 'Perimeter(P)', 'Major axis length(L)', 'Minor axis length(l)', 'Aspect ratio(K)','Eccentricity(Ec)','Convex area(C)','Equivalent diameter(Ed)','Extent(Ex)','Solidity(S)','Roundness(R)','Compactness(CO)','Shape Factor1 (SF1)','Shape Factor2 (SF2)','Shape Factor3 (SF3)','Shape Factor4 (SF4)','Class']
data['Class'] = data['Class'].map({'SEKER': 0, 'BARBUNYA': 1, 'BOMBAY':2,'CALI':3,'DERMASON':4,'HOROZ':5,'SIRA':6})
data.head()


Unnamed: 0,Area(A),Perimeter(P),Major axis length(L),Minor axis length(l),Aspect ratio(K),Eccentricity(Ec),Convex area(C),Equivalent diameter(Ed),Extent(Ex),Solidity(S),Roundness(R),Compactness(CO),Shape Factor1 (SF1),Shape Factor2 (SF2),Shape Factor3 (SF3),Shape Factor4 (SF4),Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,0
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172,191.27275,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843,0
2,29380,624.11,212.82613,175.931143,1.209713,0.562727,29690,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,0
3,30008,645.884,210.557999,182.516516,1.153638,0.498616,30724,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,0
4,30140,620.134,201.847882,190.279279,1.060798,0.33368,30417,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166,0


In [4]:
inputs = data.drop('Class', axis=1).values
inputs[:5]

array([[2.83950000e+04, 6.10291000e+02, 2.08178117e+02, 1.73888747e+02,
        1.19719142e+00, 5.49812187e-01, 2.87150000e+04, 1.90141097e+02,
        7.63922518e-01, 9.88855999e-01, 9.58027126e-01, 9.13357755e-01,
        7.33150614e-03, 3.14728917e-03, 8.34222388e-01, 9.98723889e-01],
       [2.87340000e+04, 6.38018000e+02, 2.00524796e+02, 1.82734419e+02,
        1.09735646e+00, 4.11785251e-01, 2.91720000e+04, 1.91272750e+02,
        7.83968133e-01, 9.84985603e-01, 8.87033637e-01, 9.53860842e-01,
        6.97865928e-03, 3.56362371e-03, 9.09850506e-01, 9.98430331e-01],
       [2.93800000e+04, 6.24110000e+02, 2.12826130e+02, 1.75931143e+02,
        1.20971266e+00, 5.62727317e-01, 2.96900000e+04, 1.93410904e+02,
        7.78113248e-01, 9.89558774e-01, 9.47849473e-01, 9.08774239e-01,
        7.24391184e-03, 3.04773322e-03, 8.25870617e-01, 9.99066137e-01],
       [3.00080000e+04, 6.45884000e+02, 2.10557999e+02, 1.82516516e+02,
        1.15363806e+00, 4.98615976e-01, 3.07240000e+04, 1.954

In [7]:
outputs = data['Class'].values
outputs[:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(inputs, outputs, test_size = 0.33, random_state = 0)


In [9]:
scaler_X_train = preprocessing.StandardScaler().fit(X_train)
scaler_X_test = preprocessing.StandardScaler().fit(X_test)

X_train_scaled = scaler_X_train.transform(X_train)
X_test_scaled = scaler_X_test.transform(X_test)

X_train[0] # before scaling

array([4.71790000e+04, 8.09861000e+02, 2.99122128e+02, 2.01352883e+02,
       1.48556169e+00, 7.39509552e-01, 4.77030000e+04, 2.45092163e+02,
       7.10869696e-01, 9.89015366e-01, 9.03935879e-01, 8.19371555e-01,
       6.34015405e-03, 1.76280026e-03, 6.71369745e-01, 9.97361170e-01])

In [10]:
X_train_scaled[0] #data is now scaled 

array([-0.20657498, -0.21610567, -0.24731967, -0.03270894, -0.38595417,
       -0.11107076, -0.20995841, -0.14156826, -0.80466111,  0.39180142,
        0.50424185,  0.30405632, -0.18530273,  0.07352   ,  0.26814732,
        0.51921631])

In [11]:
X_test_scaled

array([[-0.53981215, -0.68385855, -0.65333573, ...,  0.50610738,
         0.36346211,  0.43285771],
       [-0.83710924, -1.02792638, -0.95260002, ...,  0.65811807,
        -0.00909789,  0.58808013],
       [-0.50367994, -0.6371212 , -0.58542834, ...,  0.38335931,
         0.25725705,  0.82773561],
       ...,
       [-0.63961266, -0.77849504, -0.7635852 , ...,  0.60263574,
         0.31223935,  0.81079197],
       [-0.08175705, -0.07817247, -0.06776312, ..., -0.14418301,
         0.07788761,  0.32046453],
       [ 3.82312525,  3.16883149,  2.60866872, ..., -1.15143367,
         0.66003524,  0.46093265]])

In [12]:
model = LogisticRegression(multi_class='multinomial', max_iter=250)

In [13]:
y_train = y_train.flatten() #requires 1d array 
model.fit(X_train_scaled, y_train) #training model

LogisticRegression(max_iter=250, multi_class='multinomial')

# Logistic Regression model Evaluation on the testing dataset

In [14]:
y_preds = model.predict(X_test_scaled)

In [15]:
y_preds

array([4, 4, 4, ..., 4, 6, 2], dtype=int64)

In [16]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       625
           1       0.92      0.90      0.91       433
           2       1.00      1.00      1.00       146
           3       0.91      0.95      0.93       555
           4       0.92      0.93      0.93      1167
           5       0.96      0.95      0.95       659
           6       0.88      0.86      0.87       907

    accuracy                           0.92      4492
   macro avg       0.93      0.93      0.93      4492
weighted avg       0.92      0.92      0.92      4492

