<h2>Machine Learning Assignment for Yeast Classification</h2>

In [5]:
#importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import *
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score  

In [6]:
#importing the dataset
df = pd.read_csv("yeast.csv")

In [7]:
df.head()

Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc,name
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT


In [8]:
df.tail()

Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc,name
1479,0.81,0.62,0.43,0.17,0.5,0.0,0.53,0.22,ME2
1480,0.47,0.43,0.61,0.4,0.5,0.0,0.48,0.47,NUC
1481,0.67,0.57,0.36,0.19,0.5,0.0,0.56,0.22,ME2
1482,0.43,0.4,0.6,0.16,0.5,0.0,0.53,0.39,NUC
1483,0.65,0.54,0.54,0.13,0.5,0.0,0.53,0.22,CYT


df.columns

In [9]:
df.shape

(1484, 9)

In [10]:
#checking for null values in any column
df.isnull().sum()

mcg     0
gvh     0
alm     0
mit     0
erl     0
pox     0
vac     0
nuc     0
name    0
dtype: int64

In [11]:
df.describe()

Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc
count,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0
mean,0.500121,0.499933,0.500034,0.261186,0.504717,0.0075,0.499885,0.276199
std,0.137299,0.123924,0.08667,0.137098,0.048351,0.075683,0.057797,0.106491
min,0.11,0.13,0.21,0.0,0.5,0.0,0.0,0.0
25%,0.41,0.42,0.46,0.17,0.5,0.0,0.48,0.22
50%,0.49,0.49,0.51,0.22,0.5,0.0,0.51,0.22
75%,0.58,0.57,0.55,0.32,0.5,0.0,0.53,0.3
max,1.0,1.0,1.0,1.0,1.0,0.83,0.73,1.0


In [12]:
df['name'].unique()

array(['MIT', 'NUC', 'CYT', 'ME1', 'EXC', 'ME2', 'ME3', 'VAC', 'POX',
       'ERL'], dtype=object)

In [13]:
df['name'].value_counts()

CYT    463
NUC    429
MIT    244
ME3    163
ME2     51
ME1     44
EXC     35
VAC     30
POX     20
ERL      5
Name: name, dtype: int64

In [14]:
#commented as it is taking too much time
#sns.pairplot(df,hue = "name")
#for plotting the outliers for the column
#sns.boxplot(df['nuc'])

In [15]:
# Convert the "name" column into numerical labels using LabelEncoder
le = LabelEncoder()

In [16]:
df['name'] = le.fit_transform(df['name'])
df.head()

Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc,name
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,6
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,6
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,6
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,7
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,6


In [17]:
df['name'].value_counts()

0    463
7    429
6    244
5    163
4     51
3     44
2     35
9     30
8     20
1      5
Name: name, dtype: int64

In [18]:
X = df.drop(columns = ['name'])
y = df['name']
X[:8]

Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22
5,0.51,0.4,0.56,0.17,0.5,0.5,0.49,0.22
6,0.5,0.54,0.48,0.65,0.5,0.0,0.53,0.22
7,0.48,0.45,0.59,0.2,0.5,0.0,0.58,0.34


In [19]:
y[8:]

8       6
9       0
10      7
11      7
12      0
       ..
1479    4
1480    7
1481    4
1482    7
1483    0
Name: name, Length: 1476, dtype: int32

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2,random_state=200)
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)

(1187, 8) (297, 8)
(1187,) (297,)


In [21]:
model = LogisticRegression(multi_class='ovr')
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
accuracy_score(y_test, y_predict)

0.5050505050505051

In [22]:
#X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.1,random_state=200)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.1)
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)

(1335, 8) (149, 8)
(1335,) (149,)


In [23]:
model = LogisticRegression(multi_class='ovr')
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
accuracy_score(y_test, y_predict)

0.5436241610738255

In [24]:
#With SVC Model accuracy Score
#from sklearn.svm import SVC
#model = SVC()
#model.fit(X_train, y_train)
#predict = model.predict(X_test)
#accuracy_score(y_test, predict)

In [25]:
#from sklearn.neighbors import KNeighborsClassifier
#model = KNeighborsClassifier()
#model.fit(X_train, y_train)
#predict = model.predict(X_test)
#accuracy_score(y_test, predict)

In [26]:
#from sklearn.naive_bayes import GaussianNB
#model = GaussianNB()
#model.fit(X_train, y_train)
#predict = model.predict(X_test)
#accuracy_score(y_test, predict)

In [27]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

In [28]:
#case1
#X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)
#case2
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.1)

pipeline = make_pipeline(StandardScaler(), LogisticRegression(multi_class='ovr'))
strtfdKFold = StratifiedKFold(n_splits=3)
kfold = strtfdKFold.split(X_train, y_train)
scores = []
for k, (train, test) in enumerate(kfold):
    pipeline.fit(X_train.iloc[train, :], y_train.iloc[train])
    score = pipeline.score(X_train.iloc[test, :], y_train.iloc[test])
    scores.append(score)
    print('Fold: %2d, Training/Test Split Distribution: %s, Accuracy: %.3f' % (k+1, np.bincount(y_train.iloc[train]), score))
    print('\nCross-Validation accuracy: %.3f +/- %.3f \n' %(np.mean(scores), np.std(scores)))    

Fold:  1, Training/Test Split Distribution: [278   3  21  28  33  95 149 254  11  18], Accuracy: 0.600

Cross-Validation accuracy: 0.600 +/- 0.000 

Fold:  2, Training/Test Split Distribution: [278   4  20  29  32  96 149 253  12  17], Accuracy: 0.598

Cross-Validation accuracy: 0.599 +/- 0.001 

Fold:  3, Training/Test Split Distribution: [278   3  21  29  33  95 150 253  11  17], Accuracy: 0.546

Cross-Validation accuracy: 0.581 +/- 0.025 

