# Assignment 6
1. Use yeast dataset from UCI http://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data
2. Remove the first column and use the last column as the target
3. Only leave CYT and VAC classes
4. Replace [0.3, 0.5, 0.7] in feature 2 to null
5. Replace [0.26, 0.36, 0.64] in feature 3 to null
6. Split the data

7. Impute the data (or not, it's your call)
8. Build a outlier detection model to classify VAC from CYT, i.e. 0 from 1
9. Build a classifer using sample augmentation techniques to flassify VAC from CYT, i.e. 0 from 1
10. Try different methods and hyper paramters

11. Report perfromance using F-1 score

### Use yeast dataset from UCI http://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')


In [2]:
yeast_df = pd.read_csv("yeast.data", delim_whitespace=True, header=None, index_col=False, names=['Seq_Name', 'mcg', 'gvh', 'alm', 'mit', 'erl', 'pox', 'vac', 'nuc', 'label'])


In [3]:
yeast_df.head()

Unnamed: 0,Seq_Name,mcg,gvh,alm,mit,erl,pox,vac,nuc,label
0,ADT1_YEAST,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,ADT2_YEAST,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,ADT3_YEAST,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
3,AAR2_YEAST,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
4,AATM_YEAST,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT


### Remove the first column and use the last column as the target

In [4]:
yeast_df = yeast_df.drop(labels="Seq_Name", axis=1)

In [5]:
yeast_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1484 entries, 0 to 1483
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mcg     1484 non-null   float64
 1   gvh     1484 non-null   float64
 2   alm     1484 non-null   float64
 3   mit     1484 non-null   float64
 4   erl     1484 non-null   float64
 5   pox     1484 non-null   float64
 6   vac     1484 non-null   float64
 7   nuc     1484 non-null   float64
 8   label   1484 non-null   object 
dtypes: float64(8), object(1)
memory usage: 104.5+ KB


In [6]:
# descriptive analysis

yeast_df.describe().T 

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mcg,1484.0,0.500121,0.137299,0.11,0.41,0.49,0.58,1.0
gvh,1484.0,0.499933,0.123924,0.13,0.42,0.49,0.57,1.0
alm,1484.0,0.500034,0.08667,0.21,0.46,0.51,0.55,1.0
mit,1484.0,0.261186,0.137098,0.0,0.17,0.22,0.32,1.0
erl,1484.0,0.504717,0.048351,0.5,0.5,0.5,0.5,1.0
pox,1484.0,0.0075,0.075683,0.0,0.0,0.0,0.0,0.83
vac,1484.0,0.499885,0.057797,0.0,0.48,0.51,0.53,0.73
nuc,1484.0,0.276199,0.106491,0.0,0.22,0.22,0.3,1.0


### Only leave CYT and VAC classes 

In [7]:
# Filtering out only CYT and VAC classes in the label

yeastCYT = yeast_df[yeast_df['label'] == 'CYT']
yeastVAC = yeast_df[yeast_df['label'] == 'VAC']

yeast = pd.concat([yeastCYT, yeastVAC]).reset_index(drop=True)

In [8]:
#yeast[yeast.gvh == 0.3].count()

yeast.head(3)

Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc,label
0,0.51,0.4,0.56,0.17,0.5,0.5,0.49,0.22,CYT
1,0.4,0.39,0.6,0.15,0.5,0.0,0.58,0.3,CYT
2,0.4,0.42,0.57,0.35,0.5,0.0,0.53,0.25,CYT


### Replace [0.3, 0.5, 0.7] in feature 2 (mcg) to null or NaN

In [9]:
# mcg is feature 2 before we dropped feature 1
yeast['mcg'] = yeast['mcg'].replace(0.3, np.nan)
yeast['mcg'] = yeast['mcg'].replace(0.5, np.nan)
yeast['mcg'] = yeast['mcg'].replace(0.7, np.nan)

### Replace [0.26, 0.36, 0.64] in feature 3 (gvh) to null or NaN

In [10]:
# gvh is feature 3 before we dropped feature 1
yeast['gvh'] = yeast['gvh'].replace(0.26, np.nan)
yeast['gvh'] = yeast['gvh'].replace(0.36, np.nan)
yeast['gvh'] = yeast['gvh'].replace(0.64, np.nan)

### Split the data

In [11]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Lets also make our label column binary
targetReplace = {"label": {"CYT": 0, "VAC": 1}}
yeast.replace(targetReplace, inplace=True)



# we will have error during training if we do not impute data by eliminating the NaNs
# Imputing data
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
iyeast=pd.DataFrame(imp.fit_transform(yeast))


# Lets now split data
X = iyeast.drop(iyeast.iloc[:, [8]], axis=1)
y = iyeast.iloc[:, [8]]


# split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42)


# Scale data 
from sklearn.preprocessing import StandardScaler    
scaler = StandardScaler()
scaler.fit(X_train)
    
X_train = scaler.transform(X_train)                        
X_test = scaler.transform(X_test)


### Build a outlier detection model to classify VAC from CYT, i.e. 0 from 1

In [12]:
from sklearn.ensemble import IsolationForest

# identify outliers in the training dataset
iso = IsolationForest(n_estimators=100, contamination=.05) #n_estimators=100, contamination=.05
yhat = iso.fit_predict(X_train)

# select all rows that are not outliers
mask = yhat != -1
X_trainO, y_trainO = X_train[mask,:], y_train[mask]

# summarize the shape of the updated training dataset
print('shapes with outliers')
print(X_train.shape, y_train.shape)
print('')
print('shapes without outliers')
print(X_trainO.shape, y_trainO.shape)
print('')
print (str(X_train.shape[0]-X_trainO.shape[0]) + " Outliers have been removed")
print(" ")

# Training a model to predit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

model = LogisticRegression(random_state=0)
model.fit(X_trainO, y_trainO)

# evaluate the model
yhat = model.predict(X_test)

#calculate F1 score
print("The f1_score of the none urgmented data is: ")
f1_score(y_test, yhat)


shapes with outliers
(345, 8) (345, 1)

shapes without outliers
(327, 8) (327, 1)

18 Outliers have been removed
 
The f1_score of the none urgmented data is: 


0.375

### - Build a classifer using sample augmentation techniques to flassify VAC from CYT, i.e. 0 from 1

In [13]:
# The data was so imbalance
# See it:
print(yeastCYT.label.value_counts())
print("")
print(yeastVAC.label.value_counts())

CYT    463
Name: label, dtype: int64

VAC    30
Name: label, dtype: int64


In [17]:
# The data above is grossly imbalance
# Lets upsample the VAC class, it is the minority. 
from sklearn.utils import resample

yeastVAC_upsampled = resample(yeastVAC, 
                                 replace=True,     # sample with replacement
                                 n_samples=463,    # to match majority class ( CYT )
                                 random_state=123) # reproducible results


# Combine majority class with upsampled minority class
yeast_resampled = pd.concat([yeastCYT, yeastVAC_upsampled]).reset_index(drop=True)

print(yeast_resampled.label.value_counts()) 

CYT    463
VAC    463
Name: label, dtype: int64


In [18]:
# introducing the noise or NaN again

# introduce NaN
# mcg is feature 2 before we dropped feature 1
yeast_resampled['mcg'] = yeast_resampled['mcg'].replace(0.3, np.nan)
yeast_resampled['mcg'] = yeast_resampled['mcg'].replace(0.5, np.nan)
yeast_resampled['mcg'] = yeast_resampled['mcg'].replace(0.7, np.nan)


# gvh is feature 3 before we dropped feature 1
yeast_resampled['gvh'] = yeast_resampled['gvh'].replace(0.26, np.nan)
yeast_resampled['gvh'] = yeast_resampled['gvh'].replace(0.36, np.nan)
yeast_resampled['gvh'] = yeast_resampled['gvh'].replace(0.64, np.nan)



##### --------------- Process, imput and plit data -----------############

# Lets also make our label column binary
targetReplace = {"label": {"CYT": 0, "VAC": 1}}
yeast_resampled.replace(targetReplace, inplace=True)



# we will have error during training if we do not impute data by eliminating the NaNs
# Imputing data
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
iyeast=pd.DataFrame(imp.fit_transform(yeast_resampled))


# Lets now split data
X = iyeast.drop(iyeast.iloc[:, [8]], axis=1)
y = iyeast.iloc[:, [8]]


# split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42)


# Scale data    
scaler = StandardScaler()
scaler.fit(X_train)
    
X_train = scaler.transform(X_train)                        
X_test = scaler.transform(X_test)


In [19]:
# identify outliers in the training dataset
iso = IsolationForest(n_estimators=100, contamination=.05) #n_estimators=100, contamination=.05
yhat = iso.fit_predict(X_train)

# select all rows that are not outliers
mask = yhat != -1
X_trainO, y_trainO = X_train[mask,:], y_train[mask]

# summarize the shape of the updated training dataset
print('shapes with outliers')
print(X_train.shape, y_train.shape)
print('')
print('shapes without outliers')
print(X_trainO.shape, y_trainO.shape)
print('')
print (str(X_train.shape[0]-X_trainO.shape[0]) + " Outliers have been removed")
print(" ")

# Training a model to predit
model = LogisticRegression(random_state=0)
model.fit(X_trainO, y_trainO)

# evaluate the model
yhat = model.predict(X_test)

#calculate F1 score
print("The f1_score of the urgmented data is: ")
f1_score(y_test, yhat)


shapes with outliers
(648, 8) (648, 1)

shapes without outliers
(615, 8) (615, 1)

33 Outliers have been removed
 
The f1_score of the urgmented data is: 


0.7676056338028169

### - Try different methods and hyper paramters and Report perfromance using F-1 score again

Starting from the resampled and target column converted data
we will use a different method of imputing called KNNImputer


In [20]:
from sklearn.impute import KNNImputer

imp = KNNImputer(n_neighbors=2, weights="uniform")
iyeast=pd.DataFrame(imp.fit_transform(yeast_resampled))


# Lets now split data
X = iyeast.drop(iyeast.iloc[:, [8]], axis=1)
y = iyeast.iloc[:, [8]]


# split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42)


# Scale data    
scaler = StandardScaler()
scaler.fit(X_train)
    
X_train = scaler.transform(X_train)                        
X_test = scaler.transform(X_test)


In [21]:
# identify outliers in the training dataset
iso = IsolationForest(n_estimators=100, contamination=.05) #n_estimators=100, contamination=.05
yhat = iso.fit_predict(X_train)

# select all rows that are not outliers
mask = yhat != -1
X_trainO, y_trainO = X_train[mask,:], y_train[mask]

# summarize the shape of the updated training dataset
print('shapes with outliers')
print(X_train.shape, y_train.shape)
print('')
print('shapes without outliers')
print(X_trainO.shape, y_trainO.shape)
print('')
print (str(X_train.shape[0]-X_trainO.shape[0]) + " Outliers have been removed")
print(" ")


shapes with outliers
(648, 8) (648, 1)

shapes without outliers
(615, 8) (615, 1)

33 Outliers have been removed
 


In [22]:


##############################
##############################

# Training and optimizing model hyper-parameters a model to predit
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV



tuned_parameters = {'C': [0.1, 0.5, 1, 5, 10, 50, 100], 'penalty': ['l1', 'l2']}
clf = GridSearchCV(LogisticRegression(solver='liblinear',n_jobs=-1,random_state=0), tuned_parameters, cv=3, scoring="accuracy")
clf.fit(X_trainO, y_trainO)


# evaluate the model
yhat = clf.predict(X_test)

#calculate F1 score
print("The f1_score of the urgmented and hyper-parameter tuned data is: ")
f1_score(y_test, yhat)


The f1_score of the urgmented and hyper-parameter tuned data is: 


0.7758007117437722

### OBSERVATION

1. Without data urgmentation even though outliers were taken care of the f1_score was very poor at 37.5%
2. But after urgmentation the f1_score of the tuned model is slight better at 77.6% than that of the untuned at 76.8%