**Name: Muhan Li**

**USC ID: 5104027574**

# Multi-class and Multi-Label Classification Using Support Vector Machines

In [152]:
import pandas as pd
import numpy as np
from numpy import arange
from sklearn import svm
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import hamming_loss
from imblearn.over_sampling import SMOTE

## (a) Download the dataset

In [2]:
df0 = pd.read_csv('Frogs_MFCCs.csv')
df = df0.sample(frac=0.7)

## (b) Solve the multi-class and multi-label problem

### i. Research exact match and hamming score/loss methods

### ii. SVM

In [3]:
X_train = StandardScaler().fit_transform(df.iloc[:,range(22)])
y1_train = df.iloc[:,22]
y2_train = df.iloc[:,23]
y3_train = df.iloc[:,24]

#### 1st label: Families

In [4]:
print('log(C),log(gamma)∈(-2,2)')
print('Parameters for which the training accuracy is below 0.7:')
print('C\tgamma\taccuracy')
for i in [0.01,0.1,1,10,100]:
    for j in [0.01,0.1,1,10,100]:
        svc = svm.SVC(C=i, gamma=j, kernel='rbf',
                      decision_function_shape='ovr')
        svc.fit(X_train, y1_train)
        score = svc.score(X_train, y1_train)
        if score<0.7:
            print('%s\t%s\t%0.4f'
                  % (i, j, score))

log(C),log(gamma)∈(-2,2)
Parameters for which the training accuracy is below 0.7:
C	gamma	accuracy
0.01	1	0.6424
0.01	10	0.6166
0.01	100	0.6166
0.1	10	0.6211
0.1	100	0.6166


It seems that bigger C and smaller gamma may be better, so I add C=1000 and gamma=0.001 into cross validation, choosing C={1,10,100,1000} and gamma={0.001,0.01,0.1,1}:

In [5]:
svc = svm.SVC(kernel='rbf', decision_function_shape='ovr')
grid = GridSearchCV(svc, param_grid={"C":[1,10,100,1000],
                                     "gamma":[0.001,0.01,0.1,1]},
                    cv=10)
grid.fit(X_train, y1_train)
print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 10, 'gamma': 0.1} with a score of 0.9897


#### 2nd label: Genus

In [6]:
print('log(C),log(gamma)∈(-2,2)')
print('Parameters for which the training accuracy is below 0.7:')
print('C\tgamma\taccuracy')
for i in [0.01,0.1,1,10,100]:
    for j in [0.01,0.1,1,10,100]:
        svc = svm.SVC(C=i, gamma=j, kernel='rbf',
                      decision_function_shape='ovr')
        svc.fit(X_train, y2_train)
        score = svc.score(X_train, y2_train)
        if score<0.7:
            print('%s\t%s\t%0.4f'
                  % (i, j, score))

log(C),log(gamma)∈(-2,2)
Parameters for which the training accuracy is below 0.7:
C	gamma	accuracy
0.01	1	0.6031
0.01	10	0.5780
0.01	100	0.5780
0.1	10	0.5826
0.1	100	0.5780


Similarly, choose C={1,10,100,1000} and gamma={0.001,0.01,0.1,1}:

In [8]:
svc = svm.SVC(kernel='rbf', decision_function_shape='ovr')
grid = GridSearchCV(svc, param_grid={"C":[1,10,100,1000],
                                     "gamma":[0.001,0.01,0.1,1]},
                    cv=10)
grid.fit(X_train, y2_train)
print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 10, 'gamma': 0.01} with a score of 0.9869


#### 3rd lable: Species

In [9]:
print('log(C),log(gamma)∈(-2,2)')
print('Parameters for which the training accuracy is below 0.7:')
print('C\tgamma\taccuracy')
for i in [0.01,0.1,1,10,100]:
    for j in [0.01,0.1,1,10,100]:
        svc = svm.SVC(C=i, gamma=j, kernel='rbf',
                      decision_function_shape='ovr')
        svc.fit(X_train, y3_train)
        score = svc.score(X_train, y3_train)
        if score<0.7:
            print('%s\t%s\t%0.4f'
                  % (i, j, score))

log(C),log(gamma)∈(-2,2)
Parameters for which the training accuracy is below 0.7:
C	gamma	accuracy
0.01	1	0.5089
0.01	10	0.4843
0.01	100	0.4843
0.1	1	0.6402
0.1	10	0.4889
0.1	100	0.4843


Choose C={1,10,100,1000} and gamma={0.001,0.01,0.1,1} for cross validation:

In [10]:
svc = svm.SVC(kernel='rbf', decision_function_shape='ovr')
grid = GridSearchCV(svc, param_grid={"C":[1,10,100,1000],
                                     "gamma":[0.001,0.01,0.1,1]},
                    cv=10)
grid.fit(X_train, y3_train)
print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 1000, 'gamma': 0.01} with a score of 0.9887


Try bigger C:

In [11]:
svc = svm.SVC(kernel='rbf', decision_function_shape='ovr')
grid = GridSearchCV(svc, param_grid={"C":[10**3,10**4,10**5],
                                     "gamma":[0.001,0.01,0.1,1]},
                    cv=10)
grid.fit(X_train, y3_train)
print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 1000, 'gamma': 0.01} with a score of 0.9887


#### Evaluate the classifier using the test set

In [117]:
from numpy import setdiff1d
r=setdiff1d(df0.index, df.index)
test = pd.DataFrame(np.array(df0)[r])
X_test = StandardScaler().fit_transform(test.iloc[:,range(22)])
y1_test = test.iloc[:,22]
y2_test = test.iloc[:,23]
y3_test = test.iloc[:,24]

In [119]:
svc1 = svm.SVC(C=10, gamma=0.1, kernel='rbf', decision_function_shape='ovr')
svc1.fit(X_train, y1_train)
y1_pred = svc1.predict(X_test)

In [120]:
svc2 = svm.SVC(C=10, gamma=0.01, kernel='rbf', decision_function_shape='ovr')
svc2.fit(X_train, y2_train)
y2_pred = svc2.predict(X_test)

In [121]:
svc3 = svm.SVC(C=1000, gamma=0.01, kernel='rbf', decision_function_shape='ovr')
svc3.fit(X_train, y3_train)
y3_pred = svc3.predict(X_test)

In [122]:
y_pred = np.vstack((y1_pred,y2_pred,y3_pred)).T
y_test = np.vstack((y1_test,y2_test,y3_test)).T

##### Exact match

In [139]:
correct = 0
for i in range(2159):
    n=0
    for j in range(3):
        if y_test[i][j] == y_pred[i][j]:
            n=n+1
    if n==3:
        correct = correct + 1
score = correct/2159
score

0.9823992589161649

##### Hamming loss

In [127]:
loss_list = []
for i in range(2159):
    loss_list.append(hamming_loss(y_test[i],y_pred[i]))
loss = np.mean(loss_list)
1-loss

0.9905820595954917

### iii. $l_1$-penalized SVMs

#### 1st label: Families

In [78]:
svc = svm.LinearSVC(penalty='l1', dual=False)
grid = GridSearchCV(svc, param_grid={"C":10**np.arange(-3,7,dtype=float)}, cv=10)
grid.fit(X_train, y1_train)
print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 10.0} with a score of 0.9335


#### 2nd label: Genus

In [77]:
svc = svm.LinearSVC(penalty='l1', dual=False)
grid = GridSearchCV(svc, param_grid={"C":10**np.arange(-3,7,dtype=float)}, cv=10)
grid.fit(X_train, y2_train)
print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 10.0} with a score of 0.9506


#### 3rd label: Species

In [79]:
svc = svm.LinearSVC(penalty='l1', dual=False)
grid = GridSearchCV(svc, param_grid={"C":10**np.arange(-3,7,dtype=float)}, cv=10)
grid.fit(X_train, y3_train)
print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 100.0} with a score of 0.9579


#### Evaluate the classifier using the test set

In [146]:
l_svc1 = svm.LinearSVC(penalty='l1', dual=False, C=10)
l_svc1.fit(X_train, y1_train)
y1_pred_l = l_svc1.predict(X_test)

In [147]:
l_svc2 = svm.LinearSVC(penalty='l1', dual=False, C=10)
l_svc2.fit(X_train, y2_train)
y2_pred_l = l_svc2.predict(X_test)

In [148]:
l_svc3 = svm.LinearSVC(penalty='l1', dual=False, C=100)
l_svc3.fit(X_train, y3_train)
y3_pred_l = l_svc3.predict(X_test)

In [149]:
y_pred_l = np.vstack((y1_pred_l,y2_pred_l,y3_pred_l)).T
y_test = np.vstack((y1_test,y2_test,y3_test)).T

##### Exact match

In [150]:
correct = 0
for i in range(2159):
    n=0
    for j in range(3):
        if y_test[i][j] == y_pred_l[i][j]:
            n=n+1
    if n==3:
        correct = correct + 1
score = correct/2159
score

0.9226493747105141

##### Hamming loss

In [151]:
loss_list = []
for i in range(2159):
    loss_list.append(hamming_loss(y_test[i],y_pred_l[i]))
loss = np.mean(loss_list)
1-loss

0.9512119808553342

### iv. Using SMOTE to remedy class imbalance

#### 1st label: Families

In [158]:
sm = SMOTE(random_state=42, kind='svm')
X1_res, y1_res = sm.fit_sample(X_train, y1_train)

In [160]:
svc = svm.LinearSVC(penalty='l1', dual=False)
grid = GridSearchCV(svc, param_grid={"C":10**np.arange(-3,7,dtype=float)}, cv=10)
grid.fit(X1_res, y1_res)
print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 100000.0} with a score of 0.8956


#### 2nd label: Genus

In [161]:
sm = SMOTE(random_state=42, kind='svm')
X2_res, y2_res = sm.fit_sample(X_train, y2_train)

In [162]:
svc = svm.LinearSVC(penalty='l1', dual=False)
grid = GridSearchCV(svc, param_grid={"C":10**np.arange(-3,7,dtype=float)}, cv=10)
grid.fit(X2_res, y2_res)
print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 10.0} with a score of 0.9042


#### 3rd label: Species

In [163]:
sm = SMOTE(random_state=42, kind='svm')
X3_res, y3_res = sm.fit_sample(X_train, y3_train)

In [164]:
svc = svm.LinearSVC(penalty='l1', dual=False)
grid = GridSearchCV(svc, param_grid={"C":10**np.arange(-3,7,dtype=float)}, cv=10)
grid.fit(X3_res, y3_res)
print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 10.0} with a score of 0.8854


#### Evaluate the classifier using the test set

In [165]:
smote_svc1 = svm.LinearSVC(penalty='l1', dual=False, C=100000)
smote_svc1.fit(X1_res, y1_res)
y1_pred_smote = smote_svc1.predict(X_test)

In [166]:
smote_svc2 = svm.LinearSVC(penalty='l1', dual=False, C=10)
smote_svc2.fit(X2_res, y2_res)
y2_pred_smote = smote_svc2.predict(X_test)

In [167]:
smote_svc3 = svm.LinearSVC(penalty='l1', dual=False, C=10)
smote_svc3.fit(X3_res, y3_res)
y3_pred_smote = smote_svc3.predict(X_test)

In [168]:
y_pred_smote = np.vstack((y1_pred_smote,y2_pred_smote,y3_pred_smote)).T
y_test = np.vstack((y1_test,y2_test,y3_test)).T

##### Exact match

In [169]:
correct = 0
for i in range(2159):
    n=0
    for j in range(3):
        if y_test[i][j] == y_pred_smote[i][j]:
            n=n+1
    if n==3:
        correct = correct + 1
score = correct/2159
score

0.8198239925891616

##### Hamming loss

In [170]:
loss_list = []
for i in range(2159):
    loss_list.append(hamming_loss(y_test[i],y_pred_smote[i]))
loss = np.mean(loss_list)
1-loss

0.9059749884205651