# 1.a Data process

In [20]:
import numpy as np
from sklearn.linear_model import (LinearRegression, Ridge,Lasso)
from sklearn.metrics import mean_squared_error
from sklearn import cross_validation,metrics
label_val={'Bufonidae':0,'Dendrobatidae':1,'Hylidae':2,'Leptodactylidae':3,
     'Adenomera':0,'Ameerega':1,'Dendropsophus':2,'Hypsiboas':3,'Leptodactylus':4,'Osteocephalus':5,'Rhinella':6,'Scinax':7,
     'AdenomeraAndre':0,'AdenomeraHylaedactylus':1,'Ameeregatrivittata':2,'HylaMinuta':3,'HypsiboasCinerascens':4,'HypsiboasCordobae':5,'LeptodactylusFuscus':6,'OsteocephalusOophagus':7,'Rhinellagranulosa':8,'ScinaxRuber':9}
def data_process(path):
    raw_data=[]
    raw_label=[]
    with open(path, encoding='utf8') as file:
        file.readline()
        for line in file:
            data=[]
            for item in line.strip().split(','):
                data.append(item)
            raw_data.append([float(item) for item in data[:-4]])
            raw_label.append([label_val[item] for item in data[-4:-1]])
    train_data, test_data, train_label, test_label = cross_validation.train_test_split(
    raw_data, raw_label, test_size=0.3, random_state=1)
    return train_data,train_label,test_data,test_label
data,label,test_data,test_label=data_process('./data.csv')
label_fam,label_gen,label_spe=[],[],[]
for i in range(len(label)):
    label_fam.append(label[i][0])
    label_gen.append(label[i][1])
    label_spe.append(label[i][2])
print('Data process done')

Data process done


# 1.b.i Hamming Loss

In [50]:
from sklearn.metrics import hamming_loss
def hamming(clf,label_name):
    if label_name=='Family':
        flag=0
    elif label_name=='Genus':
        flag=1
    else:
        flag=2
    clf.fit(data,np.array(label)[:,flag])
    predictions=clf.predict(test_data)
    print('hamming_loss is :',hamming_loss(np.array(test_label)[:,flag], predictions))

# 1.b.ii SVM

In [42]:
from sklearn import multiclass
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn import svm
def cv_parameter(model_name,data_train,label_train):
    from sklearn.grid_search import GridSearchCV
    if model_name=='SVM':
        parameters = {'C': [i for i in range(1,10)],'gamma': [0.001, 0.01, 0.1,1,3,5,7,10]}
        gsearch1 = GridSearchCV(estimator=svm.SVC(kernel='rbf',decision_function_shape='ovr'),
                            param_grid=parameters, cv=10)
        gsearch1.fit(data_train, label_train)
        #print(gsearch1.grid_scores_,gsearch1.best_params_, gsearch1.best_score_)
        param=[]
        for key in gsearch1.best_params_.keys():
            param.append(gsearch1.best_params_[key])
        return param
def svm_model(data_train,label_train,label_name):
    param=cv_parameter('SVM',data_train,label_train)
    clf = svm.SVC(kernel='rbf', C=param[0],gamma=param[1],decision_function_shape='ovr')
    print('The parameter for label {} is: C= {},gamma= {}'.format(label_name,param[0],param[1]))
    hamming(clf,label_name)
    return clf
svm_fam=svm_model(data,label_fam,'Family')
svm_gen=svm_model(data,label_gen,'Genus')
svm_spe=svm_model(data,label_spe,'Species')
print('SVM model is done')

The parameter for label Family is: C= 6,gamma= 3
hamming_loss is : 0.008800370541917554
The parameter for label Genus is: C= 5,gamma= 3
hamming_loss is : 0.0111162575266327
The parameter for label Species is: C= 7,gamma= 3
hamming_loss is : 0.01157943492357573
SVM model is done


As for solving the problem with both standardized and raw data, the data are already normalized.

# 1.b.iii L1-penalized SVM

In [43]:
from sklearn import svm
def cv_parameter(model_name,data_train,label_train):
    from sklearn.grid_search import GridSearchCV
    if model_name=='SVM':
        parameters = {'C': [i for i in range(1,10)]}
        gsearch1 = GridSearchCV(estimator=svm.LinearSVC(penalty='l1',multi_class='ovr',dual=False),
                            param_grid=parameters, cv=10)
        gsearch1.fit(data_train, label_train)
        param=[]
        for key in gsearch1.best_params_.keys():
            param.append(gsearch1.best_params_[key])
        return param
def svm_model(data_train,label_train,label_name):
    param=cv_parameter('SVM',data_train,label_train)
    clf = svm.LinearSVC(C=param[0],penalty='l1',multi_class='ovr',dual=False)
    print('The parameter for label {} is: C= {}'.format(label_name,param[0]))
    hamming(clf,label_name)
    return clf
svm_fam_l1=svm_model(data,label_fam,'Family')
svm_gen_l1=svm_model(data,label_gen,'Genus')
svm_spe_l1=svm_model(data,label_spe,'Species')
print('Linear SVM model is done')

The parameter for label Family is: C= 5
hamming_loss is : 0.06484483557202408
The parameter for label Genus is: C= 7
hamming_loss is : 0.0555812876331635
The parameter for label Species is: C= 3
hamming_loss is : 0.04353867531264474
Linear SVM model is done


# 1.b.iv SVM with Smote

In [44]:
from imblearn.over_sampling import SMOTE
from collections import Counter
def Smote_process(data_train,label_train,label_name):
    sm = SMOTE(random_state=1113)
    x_new, y_new = sm.fit_resample(data_train, label_train)
    print('Resampled dataset shape %s' % Counter(y_new))
    data = np.array(x_new)
    mark = np.array([item for item in y_new])
    return data,mark
data_fam_new,label_fam_new=Smote_process(data,label_fam,'Family')
data_gen_new,label_gen_new=Smote_process(data,label_gen,'Genus')
data_spe_new,label_spe_new=Smote_process(data,label_spe,'Species')

from sklearn import svm
def cv_parameter(model_name,data_train,label_train):
    from sklearn.grid_search import GridSearchCV
    if model_name=='SVM':
        parameters = {'C': [i for i in range(1,10)]}
        gsearch1 = GridSearchCV(estimator=svm.LinearSVC(penalty='l1',multi_class='ovr',dual=False),
                            param_grid=parameters, cv=10)
        gsearch1.fit(data_train, label_train)
        param=[]
        for key in gsearch1.best_params_.keys():
            param.append(gsearch1.best_params_[key])
        return param
def svm_model(data_train,label_train,label_name):
    param=cv_parameter('SVM',data_train,label_train)
    clf = svm.LinearSVC(C=param[0],penalty='l1',multi_class='ovr',dual=False)
    print('The parameter for label {} is: C= {}'.format(label_name,param[0]))
    hamming(clf,label_name)
    return clf
svm_fam_smote=svm_model(data_fam_new,label_fam_new,'Family')
svm_gen_smote=svm_model(data_gen_new,label_gen_new,'Genus')
svm_spe_smote=svm_model(data_spe_new,label_spe_new,'Species')
print('Linear SVM model with Smote is done')

Resampled dataset shape Counter({3: 3092, 2: 3092, 0: 3092, 1: 3092})
Resampled dataset shape Counter({0: 2891, 3: 2891, 6: 2891, 4: 2891, 1: 2891, 2: 2891, 7: 2891, 5: 2891})
Resampled dataset shape Counter({1: 2404, 0: 2404, 4: 2404, 8: 2404, 6: 2404, 2: 2404, 3: 2404, 9: 2404, 5: 2404, 7: 2404})
The parameter for label Family is: C= 5
hamming_loss is : 0.06577119036591014
The parameter for label Genus is: C= 6
hamming_loss is : 0.05511811023622047
The parameter for label Species is: C= 4
hamming_loss is : 0.0444650301065308
Linear SVM model with Smote is done


# 1.b.v Classify chain

In [55]:
from skmultilearn.problem_transform import ClassifierChain
from sklearn import svm
from sklearn import preprocessing
clf_chain = ClassifierChain(svm.SVC(kernel='rbf', C=6,gamma=3,decision_function_shape='ovr'))
clf_chain.fit(np.array(data), np.array(label))

predictions=clf_chain.predict(test_data)
#print(np.array(test_label),predictions.A)
for i in range(3):
    label_name=['Family','Genus','Species']
    acc=metrics.precision_score(np.array(test_label)[:,i],predictions.A[:,i],average='macro')
    recall=metrics.recall_score(np.array(test_label)[:,i],predictions.A[:,i],average='macro')

    confusion=metrics.confusion_matrix(np.array(test_label)[:,i],predictions.A[:,i])

    lb = preprocessing.LabelBinarizer()
    lb.fit(np.array(test_label)[:,i])
    y_test = lb.transform(np.array(test_label)[:,i])
    y_pred = lb.transform(predictions.A[:,i])
    auc=metrics.roc_auc_score(y_test,y_pred)
    print('{} for label {}, the acc is {}, recall is {}, confusion is {}, and auc is {}'.format('ClassifierChain',label_name[i],acc,recall,confusion,auc))
    print('hamming_loss is :',hamming_loss(np.array(test_label)[:,i], predictions.A[:,i]))

ClassifierChain for label Family, the acc is 0.9776657929799415, recall is 0.9337073905377608, confusion is [[  15    0    2    3]
 [   0  155    0    0]
 [   1    1  650    4]
 [   0    0    8 1320]], and auc is 0.9648482632482442
hamming_loss is : 0.008800370541917554
ClassifierChain for label Genus, the acc is 0.9675390222347514, recall is 0.9364488348387168, confusion is [[1253    0    2    3    0    1    0    0]
 [   0  155    0    0    0    0    0    0]
 [   4    1   96    0    0    0    0    0]
 [   0    0    0  479    1    1    0    1]
 [   0    0    0    2   67    0    0    0]
 [   1    0    0    3    0   30    0    0]
 [   0    0    0    2    3    0   15    0]
 [   1    0    0    0    0    0    1   37]], and auc is 0.9671059940729121
hamming_loss is : 0.012505789717461788
ClassifierChain for label Species, the acc is 0.9708288003198039, recall is 0.9466171189480017, confusion is [[ 182    0    0    1    0    1    0    1    0    0]
 [   0 1071    0    1    0    2    0    0    

# 1.b.vi Index compute

In [232]:
from sklearn.preprocessing import LabelBinarizer
def compute_metrics(clf,model_name,label_name):
    if label_name=='Family':
        flag=0
    elif label_name=='Genus':
        flag=1
    else:
        flag=2
    label_tmp=np.array(label)[:,flag]
    test_label_tmp=np.array(test_label)[:,flag]
    clf.fit(np.array(data), label_tmp)
    predictions=clf.predict(test_data)
    #print(test_label_tmp,predictions)
    acc=metrics.precision_score(test_label_tmp,predictions,average='macro')
    recall=metrics.recall_score(test_label_tmp,predictions,average='macro')
    lb = LabelBinarizer()
    lb.fit(label_tmp)
    y_test = lb.transform(test_label_tmp)
    y_pred = lb.transform(predictions)
    confusion=metrics.confusion_matrix(y_test.argmax(axis=1),y_pred.argmax(axis=1))
    auc=metrics.roc_auc_score(y_test,y_pred)
    #confusion=metrics.confusion_matrix(np.array(test_label)[:,flag],predictions)
    #auc=metrics.roc_auc_score(np.array(test_label)[:,flag],predictions,average='macro')
    print('{} for label {}, the acc is {}, recall is {}, confusion is {}, and auc is {}'.format(model_name,label_name,acc,recall,confusion,auc))
compute_metrics(svm_fam,'Svm','Family')
compute_metrics(svm_gen,'Svm','Genus')
compute_metrics(svm_spe,'Svm','Species')
compute_metrics(svm_fam_l1,'Svm_l1','Family')
compute_metrics(svm_gen_l1,'Svm_l1','Genus')
compute_metrics(svm_spe_l1,'Svm_l1','Species')
compute_metrics(svm_fam_smote,'Svm_smote','Family')
compute_metrics(svm_gen_smote,'Svm_smote','Genus')
compute_metrics(svm_spe_smote,'Svm_smote','Species')


Svm for label Family, the acc is 0.9776657929799415, recall is 0.9337073905377608, confusion is [[  15    0    2    3]
 [   0  155    0    0]
 [   1    1  650    4]
 [   0    0    8 1320]], and auc is 0.9648482632482442
Svm for label Genus, the acc is 0.9801958860799931, recall is 0.9409908719531632, confusion is [[1254    0    2    3    0    0    0    0]
 [   0  155    0    0    0    0    0    0]
 [   4    0   97    0    0    0    0    0]
 [   0    0    0  479    1    1    0    1]
 [   1    0    0    1   67    0    0    0]
 [   0    0    0    4    0   30    0    0]
 [   1    0    0    1    3    0   15    0]
 [   1    0    0    0    0    0    0   38]], and auc is 0.969434655769102
Svm for label Species, the acc is 0.9794873295859711, recall is 0.9501713205220053, confusion is [[ 182    0    0    1    0    2    0    0    0    0]
 [   0 1071    0    1    0    2    0    0    0    0]
 [   0    0  155    0    0    0    0    0    0    0]
 [   0    2    1   97    0    1    0    0    0    0]
 

The evaluation methods are divided into two categories: one is based on measurement of labels; the other is based on measurement on samples. In this work, I am going to use metrics based on label: calculate the index on each label just like multiple categories.
Conclusion: According to the data above, we can see the SVM with L2 gets the best perfomance, which is better than SVM with L1 and SVM with Smote. Besides, when we use ClassifierChain, it also has best performance, which is similiar to SVM with L1.