In [49]:
import numpy as np
import pandas as  pd 
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.manifold import TSNE

In [2]:
df = pd.read_table("yeast.data" , header = None,sep='\s+', names= 
    ["SequenceName" ,"mcg", "gvh", "alm","mit",
                    "erl","pox","vac","nuc", "class"]) #Original 10 cols with seqname
df = df.drop(['SequenceName'], axis = 1) 

In [3]:
df_values = df.values #(1484, 9) "mcg", "gvh", "alm","mit","erl","pox","vac","nuc", "class"

In [27]:
#Classes that we are trying to predict. Give each of them a Value 
classes = ["CYT", "NUC", "MIT", "ME3", "ME2", "ME1","EXC", "VAC", "POX", "ERL"]
dictClasses = {}
i = 1
for name in classes:
    dictClasses[name] = i
    i = i + 1
dictClasses

{'CYT': 1,
 'NUC': 2,
 'MIT': 3,
 'ME3': 4,
 'ME2': 5,
 'ME1': 6,
 'EXC': 7,
 'VAC': 8,
 'POX': 9,
 'ERL': 10}

In [28]:
classList = df_values[:,8] #Get every row of class column
classList = classList.reshape(1484,1) #(1484, 1)
classListLabeled = np.full((1484,1), -1)
for i in range(1484):
    classListLabeled[i] = dictClasses[classList[i,0]]
#classListLabeled (1484, 1)
df["classNum"] = classListLabeled
df_final = df.drop(['class'], axis=1)


In [29]:
total = df_final.values
np.random.shuffle(total)

In [58]:
X1 = total[:,0:8]
Y = total[:,8]


In [32]:
#Isolation forest - only pass in 8 features
#
rng = np.random.RandomState(42)
clf = IsolationForest(behaviour='new', max_samples=10,
                      random_state=rng, contamination=.10, max_features=8)
clf.fit(X1)

IsolationForest(behaviour='new', bootstrap=False, contamination=0.1,
                max_features=8, max_samples=10, n_estimators=100, n_jobs=None,
                random_state=RandomState(MT19937) at 0x11D22C7C0, verbose=0,
                warm_start=False)

In [33]:
#1 if normal, -1 if Outlier
y_pred_outliers = clf.predict(X1)
y_pred_outliers = np.reshape(y_pred_outliers, (y_pred_outliers.size,1))
df["outlier"] = y_pred_outliers
df["outlier"].value_counts() 


 1    1335
-1     149
Name: outlier, dtype: int64

In [34]:
# Local Outlier Factor - only pass in 8 features
clf2 = LocalOutlierFactor(n_neighbors=5, contamination=.10)
clf.fit(X1)
y_pred_outliers2 = clf.predict(X1)
y_pred_outliers2 = np.reshape(y_pred_outliers2, (y_pred_outliers2.size,1))
df["outlier2"] = y_pred_outliers2
df["outlier2"].value_counts() 


 1    1335
-1     149
Name: outlier2, dtype: int64

In [36]:
#NOT needed - Analyzes both methods combined
df["outComb"] = df.outlier + df.outlier2
df["outComb"].value_counts()

 2    1290
-2     104
 0      90
Name: outComb, dtype: int64

In [37]:
df["class"].value_counts()

CYT    463
NUC    429
MIT    244
ME3    163
ME2     51
ME1     44
EXC     35
VAC     30
POX     20
ERL      5
Name: class, dtype: int64

In [38]:
#USED IOSLATION FOREST
out_df = df[df.outlier <= 0]
norm_df = df[df.outlier > 0]

In [39]:
#USe for next Part
norm_df["class"].value_counts()


CYT    412
NUC    386
MIT    222
ME3    150
ME2     48
ME1     39
EXC     30
VAC     26
POX     18
ERL      4
Name: class, dtype: int64

In [40]:
out_df["class"].value_counts()

CYT    51
NUC    43
MIT    22
ME3    13
ME1     5
EXC     5
VAC     4
ME2     3
POX     2
ERL     1
Name: class, dtype: int64

In [62]:
df


Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc,class,classNum,outlier,outlier2,outComb
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT,3,-1,-1,-2
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT,3,1,-1,0
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT,3,1,1,2
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC,2,1,1,2
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1479,0.81,0.62,0.43,0.17,0.5,0.0,0.53,0.22,ME2,5,1,1,2
1480,0.47,0.43,0.61,0.40,0.5,0.0,0.48,0.47,NUC,2,1,1,2
1481,0.67,0.57,0.36,0.19,0.5,0.0,0.56,0.22,ME2,5,1,1,2
1482,0.43,0.40,0.60,0.16,0.5,0.0,0.53,0.39,NUC,2,1,1,2
