In [17]:
# Import the libraries
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.model_selection import train_test_split 
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import hamming_loss

In [2]:
# Load in the data (the path is relative to the notebook's working directory)
df = pd.read_csv('../Datasets/Yeast/yeast.csv')

# Have a look at the first 5 rows
df.head(5)

Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
0,0.004168,-0.170975,-0.156748,-0.142151,0.058781,0.026851,0.197719,0.04185,0.066938,-0.056617,...,False,False,True,True,False,False,False,True,True,False
1,-0.103956,0.011879,-0.098986,-0.054501,-0.00797,0.049113,-0.03058,-0.077933,-0.080529,-0.016267,...,False,False,False,False,False,False,False,False,False,False
2,0.509949,0.401709,0.293799,0.087714,0.011686,-0.006411,-0.006255,0.013646,-0.040666,-0.024447,...,False,False,False,False,False,False,False,True,True,False
3,0.119092,0.004412,-0.002262,0.072254,0.044512,-0.051467,0.074686,-0.00767,0.079438,0.062184,...,False,False,False,False,False,False,False,False,False,False
4,0.042037,0.007054,-0.069483,0.081015,-0.048207,0.089446,-0.004947,0.064456,-0.133387,0.068878,...,True,True,False,False,False,False,False,False,False,False


In [3]:
# Have a look at the data shape, reported as (rows, columns)
df.shape

(2417, 117)

In [4]:
# Cast every label column (Class1 through Class14) from True/False to 1/0
# in a single astype call driven by a column -> dtype mapping
df = df.astype({label: int for label in df.loc[:, 'Class1':'Class14'].columns})

# Preview the first 5 rows to confirm the conversion
df.head(5)

Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
0,0.004168,-0.170975,-0.156748,-0.142151,0.058781,0.026851,0.197719,0.04185,0.066938,-0.056617,...,0,0,1,1,0,0,0,1,1,0
1,-0.103956,0.011879,-0.098986,-0.054501,-0.00797,0.049113,-0.03058,-0.077933,-0.080529,-0.016267,...,0,0,0,0,0,0,0,0,0,0
2,0.509949,0.401709,0.293799,0.087714,0.011686,-0.006411,-0.006255,0.013646,-0.040666,-0.024447,...,0,0,0,0,0,0,0,1,1,0
3,0.119092,0.004412,-0.002262,0.072254,0.044512,-0.051467,0.074686,-0.00767,0.079438,0.062184,...,0,0,0,0,0,0,0,0,0,0
4,0.042037,0.007054,-0.069483,0.081015,-0.048207,0.089446,-0.004947,0.064456,-0.133387,0.068878,...,1,1,0,0,0,0,0,0,0,0


In [5]:
# Count how many data points carry each label: sum the 0/1 label columns
# once, then print one "name: count" line per class
label_counts = df.loc[:, 'Class1':'Class14'].sum()
for colname, count in label_counts.items():
    print(f'{colname}: {count}')

Class1: 762
Class2: 1038
Class3: 983
Class4: 862
Class5: 722
Class6: 597
Class7: 428
Class8: 480
Class9: 178
Class10: 253
Class11: 289
Class12: 1816
Class13: 1799
Class14: 34


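Only Class14 is rare, with just 34 data points. Every other class is assigned to at least 178 data points, although the classes are far from balanced: Class12 and Class13 each cover roughly three quarters of the 2,417 rows.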

In [6]:
# See the distribution of categories per data point
df_y = df.loc[:, 'Class1':'Class14']
rowsums = df_y.sum(axis=1)  # number of labels assigned to each row

# Series.hist returns the matplotlib Axes; title and label it so the figure
# can be understood without reading the code. The trailing ';' hides the repr.
ax = rowsums.hist()
ax.set(
    title='Categories per data point',
    xlabel='Number of categories assigned',
    ylabel='Number of data points',
);

The average is about 4 categories per data point: the class counts above add up to 10,241 label assignments across 2,417 rows, i.e. roughly 4.2 per row.

### Prepare the data

In [7]:
# Get the independent variables: the feature columns Att1 through Att103
# (a .loc label slice includes both endpoints)
df_x = df.loc[:, 'Att1':'Att103']

In [8]:
# Encode the labels: take the label columns as a 2-D NumPy array
# (one row per data point, one 0/1 column per class)
encoded_y = df_y.values

# Have a look at the first 5 rows
encoded_y[0:5]

array([[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])

In [9]:
# Make sure the shape was retained: same number of rows as df, one column per class
encoded_y.shape

(2417, 14)

In [10]:
# Create the train-test split: hold out 25% of the rows for testing.
# The fixed random_state makes the split identical on every run.
x_train, x_test, y_train, y_test = train_test_split(df_x, encoded_y, test_size=0.25, random_state=1000)

### Try out a bunch of models

In [18]:
# Candidate base classifiers to compare.
# Every estimator that relies on randomness is given a fixed random_state so
# that a fresh "Restart & Run All" reproduces the same scores.
sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=6, tol=None)
lr = LogisticRegression(solver='lbfgs')
knn = KNeighborsClassifier(n_neighbors=5)
# Without a seed the forest's bootstrap samples and feature draws change on every run
rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [19]:
# Create function to print out model performance
def print_performance(y_pred, y_test, classifier):
    """Print a short performance report for one classifier.

    Three lines are written to stdout: the classifier's class name, the
    Hamming loss between the true and predicted labels, and a '---' separator.

    Parameters
    ----------
    y_pred : predicted labels, in a form accepted by ``hamming_loss``.
    y_test : true labels, same shape as ``y_pred``.
    classifier : the estimator being reported on; only its class name is used.

    Returns
    -------
    None
    """
    print(f'Classifier: {classifier.__class__.__name__}')
    # hamming_loss is documented as hamming_loss(y_true, y_pred): truth goes first
    print(f'Hamming loss: {hamming_loss(y_test, y_pred)}')
    print('---')

In [20]:
# Wrap each candidate in a one-vs-rest model, train it on the training split,
# and report its Hamming loss on the held-out test split
for classifier in (sgd, lr, knn, rf):
    model = OneVsRestClassifier(classifier).fit(x_train, y_train)  # fit() returns the fitted model
    y_pred = model.predict(x_test)
    print_performance(y_pred, y_test, classifier)

Classifier: , SGDClassifier
Hamming loss: 0.20436835891381347
---
Classifier: , LogisticRegression
Hamming loss: 0.19846517119244392
---
Classifier: , KNeighborsClassifier
Hamming loss: 0.20979929161747343
---
Classifier: , RandomForestClassifier
Hamming loss: 0.19149940968122786
---
