In [1]:
import pandas as pd 
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [2]:
data = pd.read_csv("toy_ds.csv")
print(data.head(), '\n \n ', data.describe())

  Sex  Length  Diameter  Height  Whole weight  Shucked weight  Viscera weight  \
0   M   0.455     0.365   0.095        0.5140          0.2245          0.1010   
1   M   0.350     0.265   0.090        0.2255          0.0995          0.0485   
2   F   0.530     0.420   0.135        0.6770          0.2565          0.1415   
3   M   0.440     0.365   0.125        0.5160          0.2155          0.1140   
4   I   0.330     0.255   0.080        0.2050          0.0895          0.0395   

   Shell weight  Rings  
0         0.150     15  
1         0.070      7  
2         0.210      9  
3         0.155     10  
4         0.055      7   
 
              Length     Diameter       Height  Whole weight  Shucked weight  \
count  4177.000000  4177.000000  4177.000000   4177.000000     4177.000000   
mean      0.523992     0.407881     0.139516      0.828742        0.359367   
std       0.120093     0.099240     0.041827      0.490389        0.221963   
min       0.075000     0.055000     0.000000  

In [3]:
print(data["Sex"].unique())

['M' 'F' 'I']


In [4]:
data.loc[data["Sex"]=='M' , "Sex" ] = 0
data.loc[data["Sex"]=='F' , "Sex" ] = 1
data.loc[data["Sex"]=='I' , "Sex" ] = 2

In [5]:
data["AgeCategory"] = 0
data.loc[data['Rings'] <9, "AgeCategory"] = 1
data.loc[(data['Rings'] >= 9) & (data['Rings'] <= 10), "AgeCategory"] = 2
data.loc[data['Rings'] >10, "AgeCategory" ] = 3
print(data.groupby('AgeCategory').size(), '\n',
sum(data.groupby('AgeCategory').size()))

AgeCategory
1    1407
2    1323
3    1447
dtype: int64 
 4177


In [6]:
data.groupby('AgeCategory').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Diameter,Height,Length,Rings,Shell weight,Shucked weight,Viscera weight,Whole weight
AgeCategory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,count,1407.0,1407.0,1407.0,1407.0,1407.0,1407.0,1407.0,1407.0
1,mean,0.321276,0.106596,0.420991,6.884151,0.121394,0.198199,0.093357,0.432374
1,std,0.090292,0.04183,0.111375,1.216914,0.080965,0.147031,0.06809,0.306007
1,min,0.055,0.0,0.075,1.0,0.0015,0.001,0.0005,0.002
1,25%,0.26,0.085,0.35,6.0,0.06,0.08225,0.04,0.196
1,50%,0.325,0.105,0.43,7.0,0.1065,0.165,0.0785,0.3685
1,75%,0.385,0.125,0.5,8.0,0.16975,0.28225,0.12925,0.597
1,max,0.565,1.13,0.72,8.0,0.47,0.8255,0.3855,1.71
2,count,1323.0,1323.0,1323.0,1323.0,1323.0,1323.0,1323.0,1323.0
2,mean,0.436754,0.148171,0.56017,9.479214,0.258777,0.416345,0.204731,0.927122


In [7]:
data_use = data.drop('Rings',1)

In [8]:
X = data_use.drop('AgeCategory', 1)
Y = data_use["AgeCategory"].astype('category')

In [9]:
logistic = LogisticRegression()
KNN = KNeighborsClassifier()
RF = RandomForestClassifier()
LDA = LinearDiscriminantAnalysis()
ensembleHard = VotingClassifier(estimators = [('lr', logistic), ('rf', RF), ('knn', KNN), ('lda', LDA) ], voting = 'hard')
ensembleSoft = VotingClassifier(estimators = [('lr', logistic), ('rf', RF), ('knn', KNN), ('lda', LDA) ], voting = 'soft')

algorithms = [logistic, KNN, RF, LDA, ensembleHard, ensembleSoft]
names = ["Logistic Regression" , "K- Nearest Neighbors", "Random Forest", 
         "Linear Discriminant Analysis", "Hard Vote Ensemble", "Soft Vote Ensemble"]


In [10]:
for i in range(len(algorithms)):
    algorithms[i].fit(X,Y)
    temp_predict = algorithms[i].predict(X)
    print(names[i], "\n" , metrics.classification_report(Y,temp_predict), 
          "\n \n", metrics.confusion_matrix(Y, temp_predict))
    temp_cv = cross_val_score(algorithms[i], X, Y, cv = 8)
    print("\n Cross Validation score is %0.3f \n \n" % (temp_cv.mean()))

Logistic Regression 
              precision    recall  f1-score   support

          1       0.67      0.79      0.73      1407
          2       0.51      0.37      0.43      1323
          3       0.66      0.71      0.68      1447

avg / total       0.62      0.63      0.62      4177
 
 
 [[1115  231   61]
 [ 361  486  476]
 [ 186  231 1030]]

 Cross Validation score is 0.622 
 

K- Nearest Neighbors 
              precision    recall  f1-score   support

          1       0.78      0.85      0.81      1407
          2       0.64      0.66      0.65      1323
          3       0.80      0.71      0.76      1447

avg / total       0.74      0.74      0.74      4177
 
 
 [[1197  180   30]
 [ 226  872  225]
 [ 111  303 1033]]

 Cross Validation score is 0.610 
 

Random Forest 
              precision    recall  f1-score   support

          1       0.98      1.00      0.99      1407
          2       0.98      0.97      0.98      1323
          3       0.99      0.98      0.98      1