# Assignment 6
1. Use yeast dataset from UCI http://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data
2. Remove the first column and use the last column as the target
3. Only leave CYT and VAC classes
4. Replace [0.3, 0.5, 0.7] in feature 2 to null
5. Replace [0.26, 0.36, 0.64] in feature 3 to null
6. Split the data
7. Impute the data (or not, it's your call)
8. Build a outlier detection model to classify VAC from CYT, i.e. 0 from 1
9. Build a classifer using sample augmentation techniques to flassify VAC from CYT, i.e. 0 from 1
10. Try different methods and hyper paramters

11. Report perfromance using F-1 score

In [151]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, chi2, RFE, SelectFromModel

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Import libraries
%matplotlib inline

plt.rcParams["figure.figsize"] = [16, 9]
nan = np.nan

In [140]:
# Load dataset
url = "https://people.arcada.fi/~martinel/"
names = ['Sequence Name','mcg', 'gvh', 'alm', 'mit', 'erl','pox','vac','nuc', 'label']
df = pd.read_csv(url + 'yeast.data', header=None, names=names, delim_whitespace=True)
df

Unnamed: 0,Sequence Name,mcg,gvh,alm,mit,erl,pox,vac,nuc,label
0,ADT1_YEAST,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,ADT2_YEAST,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,ADT3_YEAST,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
3,AAR2_YEAST,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
4,AATM_YEAST,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT
...,...,...,...,...,...,...,...,...,...,...
1479,YUR1_YEAST,0.81,0.62,0.43,0.17,0.5,0.0,0.53,0.22,ME2
1480,ZIP1_YEAST,0.47,0.43,0.61,0.40,0.5,0.0,0.48,0.47,NUC
1481,ZNRP_YEAST,0.67,0.57,0.36,0.19,0.5,0.0,0.56,0.22,ME2
1482,ZUO1_YEAST,0.43,0.40,0.60,0.16,0.5,0.0,0.53,0.39,NUC


In [141]:
# Only leave CYT and VAC classes
df = (df.loc[(df['label'] == 'CYT') | (df['label'] == 'VAC')])
df.label.unique()

array(['CYT', 'VAC'], dtype=object)

In [142]:
# Replace [0.3, 0.5, 0.7] in feature 2 to null
df = df.replace({'gvh' : { 0.3:nan, 0.5:nan, 0.7:nan }})

# Replace [0.26, 0.36, 0.64] in feature 3 to null
df = df.replace({'alm' : { 0.26:nan, 0.36:nan, 0.64:nan }})

In [146]:
df.isna().sum()

Sequence Name     0
mcg               0
gvh              26
alm               5
mit               0
erl               0
pox               0
vac               0
nuc               0
label             0
dtype: int64

In [147]:
df

Unnamed: 0,Sequence Name,mcg,gvh,alm,mit,erl,pox,vac,nuc,label
5,AATC_YEAST,0.51,0.40,0.56,0.17,0.5,0.5,0.49,0.22,CYT
9,ABP1_YEAST,0.40,0.39,0.60,0.15,0.5,0.0,0.58,0.30,CYT
12,ACH1_YEAST,0.40,0.42,0.57,0.35,0.5,0.0,0.53,0.25,CYT
15,ACT_YEAST,0.46,0.44,0.52,0.11,0.5,0.0,0.50,0.22,CYT
16,ACT2_YEAST,0.47,0.39,0.50,0.11,0.5,0.0,0.49,0.40,CYT
...,...,...,...,...,...,...,...,...,...,...
1475,YP53_YEAST,0.71,,0.50,0.18,0.5,0.0,0.46,0.22,CYT
1476,YPT7_YEAST,0.61,0.48,0.54,0.25,0.5,0.0,0.50,0.22,CYT
1477,R29A_YEAST,0.38,0.32,,0.41,0.5,0.0,0.44,0.11,CYT
1478,R29B_YEAST,0.38,0.40,0.66,0.35,0.5,0.0,0.43,0.11,CYT


In [148]:
# Remove the first column and use the last column as the target
X = df.iloc[: ,1:9]
y = df.iloc[:,-1]


In [152]:
# 6. Split the data
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# 7. Impute the data (or not, it's your call)
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=2, weights="uniform")
imputer.fit_transform(X)

array([[0.51, 0.4 , 0.56, ..., 0.5 , 0.49, 0.22],
       [0.4 , 0.39, 0.6 , ..., 0.  , 0.58, 0.3 ],
       [0.4 , 0.42, 0.57, ..., 0.  , 0.53, 0.25],
       ...,
       [0.38, 0.32, 0.6 , ..., 0.  , 0.44, 0.11],
       [0.38, 0.4 , 0.66, ..., 0.  , 0.43, 0.11],
       [0.65, 0.54, 0.54, ..., 0.  , 0.53, 0.22]])