# baseline

In [1]:
import pickle
import numpy as np

from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectPercentile


## Loading data

In [2]:
# Per-class feature/label pickles for the train and dev splits, keyed as
# '<class>_X' / '<class>_y' (train) and '<class>_X_dev' / '<class>_y_dev' (dev).
Training_data = {}
Dev_data = {}

for cls in ('c0', 'c1', 'c2', 'c3', 'c4'):
    # (destination dict, key, pickle path), read in the same order for every class.
    targets = (
        (Training_data, cls + '_X', './C2_TrainDev/processed_data/baseline/train/' + cls + '_X.pkl'),
        (Training_data, cls + '_y', './C2_TrainDev/processed_data/baseline/train/' + cls + '_y.pkl'),
        (Dev_data, cls + '_X_dev', './C2_TrainDev/processed_data/baseline/dev/' + cls + '_X_dev.pkl'),
        (Dev_data, cls + '_y_dev', './C2_TrainDev/processed_data/baseline/dev/' + cls + '_y_dev.pkl'),
    )
    for store, key, path in targets:
        with open(path, 'rb') as handle:
            store[key] = pickle.load(handle)

## change dimension to fit svm

In [3]:
# Remove singleton dimensions from every feature sample, element by element
# and in place, for both the training and the dev feature collections.
for store, suffix in ((Training_data, '_X'), (Dev_data, '_X_dev')):
    for cls in ('c0', 'c1', 'c2', 'c3', 'c4'):
        key = cls + suffix
        samples = store[key]
        for idx in range(len(samples)):
            samples[idx] = np.squeeze(samples[idx])
        store[key] = samples

In [4]:
# Convert every label collection (train and dev) into a numpy array.
for store, suffix in ((Training_data, '_y'), (Dev_data, '_y_dev')):
    for cls in ('c0', 'c1', 'c2', 'c3', 'c4'):
        key = cls + suffix
        store[key] = np.array(store[key])

## check the amount of data

In [87]:
# Sanity check: features and labels should have the same length per class.
print('train data\n')

for cls in ('c0', 'c1', 'c2', 'c3', 'c4'):
    print('[' + cls + ']')
    for label, suffix in (('X:', '_X'), ('y:', '_y')):
        print(label, len(Training_data[cls + suffix]))
    print('-' * 20)

train data

[c0]
X: 4244
y: 4244
--------------------
[c1]
X: 838
y: 838
--------------------
[c2]
X: 22978
y: 22978
--------------------
[c3]
X: 22506
y: 22506
--------------------
[c4]
X: 1906
y: 1906
--------------------


In [86]:
# Sanity check: features and labels should have the same length per class.
print('Dev data\n')

for cls in ('c0', 'c1', 'c2', 'c3', 'c4'):
    print('[' + cls + ']')
    for label, suffix in (('X:', '_X_dev'), ('y:', '_y_dev')):
        print(label, len(Dev_data[cls + suffix]))
    print('-' * 20)

dev data

[c0]
X: 3681
y: 3681
--------------------
[c1]
X: 3681
y: 3681
--------------------
[c2]
X: 3681
y: 3681
--------------------
[c3]
X: 3681
y: 3681
--------------------
[c4]
X: 3681
y: 3681
--------------------


## SVM

Defining the SVM models

In [77]:
def _make_baseline_clf(percentile, C):
    """Build one baseline pipeline: feature selection -> scaling -> linear SVC.

    Parameters
    ----------
    percentile : int
        Percentage of features kept by ``SelectPercentile`` (scoring function
        left at the scikit-learn default).
    C : float
        Regularisation parameter passed to ``SVC``.

    Returns
    -------
    Pipeline
        Unfitted pipeline with steps named 'anova', 'scaler' and 'svc'.
    """
    return Pipeline([
        ('anova', SelectPercentile(percentile=percentile)),
        ('scaler', StandardScaler()),
        ('svc', SVC(C=C, kernel='linear', class_weight='balanced')),
    ])


# One binary classifier per class; only the kept-feature percentile and C differ.
clf_0 = _make_baseline_clf(percentile=50, C=1.0)
clf_1 = _make_baseline_clf(percentile=10, C=0.1)
clf_2 = _make_baseline_clf(percentile=20, C=0.1)
clf_3 = _make_baseline_clf(percentile=10, C=0.1)
clf_4 = _make_baseline_clf(percentile=20, C=1.0)

Calculate metrics

In [38]:
# Compute the evaluation metrics

def perf_measure(y_actual, y_pred):
    """Count confusion-matrix cells for 0/1 labels, print and return the metrics.

    Parameters
    ----------
    y_actual : indexable sequence
        Ground-truth labels; read by position for every index of ``y_pred``.
    y_pred : indexable sequence
        Predicted labels. Only predictions equal to 1 or 0 are counted; any
        other value is ignored.

    Returns
    -------
    tuple
        ``(TP, FP, TN, FN, precision, recall, f1)``.

    Notes
    -----
    When a ratio has a zero denominator (no positive predictions, no actual
    positives, precision + recall == 0, or empty input) the corresponding
    metric is reported as 0.0 instead of raising ``ZeroDivisionError``.
    """
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in range(len(y_pred)):
        actual, pred = y_actual[i], y_pred[i]
        if pred == 1:
            # Predicted positive: correct -> TP, otherwise FP.
            if actual == pred:
                TP += 1
            else:
                FP += 1
        elif pred == 0:
            # Predicted negative: correct -> TN, otherwise FN.
            if actual == pred:
                TN += 1
            else:
                FN += 1

    total = TP + FP + TN + FN

    # Guard every ratio against an empty denominator.
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    accuracy = (TP + TN) / total if total else 0.0

    print('TP:',TP)
    print('FP:',FP)
    print('TN:',TN)
    print('FN:',FN)
    print('total:',total)
    print()
    print('Acc:',accuracy)
    print('Precision:',precision)
    print('Recall:',recall)
    print('F1:',f1)

    return (TP, FP, TN, FN, precision, recall, f1)

### class 0 - 乳汁吸附

In [21]:
%%time
# Fit the class-0 pipeline on the class-0 training features and labels.
# The call is the cell's last expression, so the returned Pipeline is displayed.
clf_0.fit(Training_data['c0_X'], Training_data['c0_y'])

Wall time: 1min 6s


Pipeline(steps=[('anova', SelectPercentile(percentile=50)),
                ('scaler', StandardScaler()),
                ('svc', SVC(class_weight='balanced', kernel='linear'))])

In [19]:
# Evaluate the class-0 classifier on the dev split: predictions, accuracy, F1.
X_dev_0 = Dev_data['c0_X_dev']
y_dev_0 = Dev_data['c0_y_dev']

y_pred_0 = clf_0.predict(X_dev_0)
score_0 = clf_0.score(X_dev_0, y_dev_0)
f1_score_0 = f1_score(y_dev_0, y_pred_0)

In [45]:
# Report dev-set accuracy and F1 for class 0.
for line in ('class 0:', '-'*10):
    print(line)
for label, value in (('mean acc:', score_0), ('f1 score:', f1_score_0)):
    print(label, value)

class 0:
----------
mean acc: 0.6356968215158925
f1 score: 0.2332761578044597


In [39]:
# Confusion-matrix breakdown of the class-0 dev predictions.
TP, FP, TN, FN, precision, recall, f1 = perf_measure(
    y_actual=Dev_data['c0_y_dev'],
    y_pred=y_pred_0,
)

TP: 204
FP: 1237
TN: 2136
FN: 104
total: 3681

Acc: 0.6356968215158925
Precision: 0.14156835530881332
Recall: 0.6623376623376623
F1: 0.2332761578044597


### class 1 - 機械傷害

In [78]:
%%time
# Fit the class-1 pipeline on the class-1 training features and labels.
# The call is the cell's last expression, so the returned Pipeline is displayed.
clf_1.fit(Training_data['c1_X'], Training_data['c1_y'])

Wall time: 168 ms


Pipeline(steps=[('anova', SelectPercentile()), ('scaler', StandardScaler()),
                ('svc', SVC(C=0.1, class_weight='balanced', kernel='linear'))])

In [79]:
# Evaluate the class-1 classifier on the dev split: predictions, accuracy, F1.
X_dev_1 = Dev_data['c1_X_dev']
y_dev_1 = Dev_data['c1_y_dev']

y_pred_1 = clf_1.predict(X_dev_1)
score_1 = clf_1.score(X_dev_1, y_dev_1)
f1_score_1 = f1_score(y_dev_1, y_pred_1)

In [80]:
# Report dev-set accuracy and F1 for class 1.
for line in ('class 1:', '-'*10):
    print(line)
for label, value in (('mean acc:', score_1), ('f1 score:', f1_score_1)):
    print(label, value)

class 1:
----------
mean acc: 0.7085031241510459
f1 score: 0.0479148181011535


In [81]:
# Confusion-matrix breakdown of the class-1 dev predictions.
TP, FP, TN, FN, precision, recall, f1 = perf_measure(
    y_actual=Dev_data['c1_y_dev'],
    y_pred=y_pred_1,
)

TP: 27
FP: 1040
TN: 2581
FN: 33
total: 3681

Acc: 0.7085031241510459
Precision: 0.02530459231490159
Recall: 0.45
F1: 0.0479148181011535


### class 2 - 炭疽病

In [29]:
%%time
# Fit the class-2 pipeline on the class-2 training features and labels.
# The call is the cell's last expression, so the returned Pipeline is displayed.
# The recorded run of this cell took several minutes of wall time.
clf_2.fit(Training_data['c2_X'], Training_data['c2_y'])

Wall time: 8min 43s


Pipeline(steps=[('anova', SelectPercentile(percentile=20)),
                ('scaler', StandardScaler()),
                ('svc', SVC(C=0.1, class_weight='balanced', kernel='linear'))])

In [26]:
# Evaluate the class-2 classifier on the dev split: predictions, accuracy, F1.
X_dev_2 = Dev_data['c2_X_dev']
y_dev_2 = Dev_data['c2_y_dev']

y_pred_2 = clf_2.predict(X_dev_2)
score_2 = clf_2.score(X_dev_2, y_dev_2)
f1_score_2 = f1_score(y_dev_2, y_pred_2)

In [27]:
# Report dev-set accuracy and F1 for class 2.
for line in ('class 2:', '-'*10):
    print(line)
for label, value in (('mean acc:', score_2), ('f1 score:', f1_score_2)):
    print(label, value)

class 2:
----------
mean acc: 0.7579462102689487
f1 score: 0.7496487777465579


In [41]:
# Confusion-matrix breakdown of the class-2 dev predictions.
TP, FP, TN, FN, precision, recall, f1 = perf_measure(
    y_actual=Dev_data['c2_y_dev'],
    y_pred=y_pred_2,
)

TP: 1334
FP: 460
TN: 1456
FN: 431
total: 3681

Acc: 0.7579462102689487
Precision: 0.7435897435897436
Recall: 0.7558073654390934
F1: 0.7496487777465579


### class 3 - 著色不佳

In [34]:
%%time
# Fit the class-3 pipeline on the class-3 training features and labels.
# The call is the cell's last expression, so the returned Pipeline is displayed.
# The recorded run of this cell took a few minutes of wall time.
clf_3.fit(Training_data['c3_X'], Training_data['c3_y'])

Wall time: 2min 41s


Pipeline(steps=[('anova', SelectPercentile()), ('scaler', StandardScaler()),
                ('svc', SVC(C=0.1, class_weight='balanced', kernel='linear'))])

In [29]:
# Evaluate the class-3 classifier on the dev split: predictions, accuracy, F1.
X_dev_3 = Dev_data['c3_X_dev']
y_dev_3 = Dev_data['c3_y_dev']

y_pred_3 = clf_3.predict(X_dev_3)
score_3 = clf_3.score(X_dev_3, y_dev_3)
f1_score_3 = f1_score(y_dev_3, y_pred_3)

In [30]:
# Report dev-set accuracy and F1 for class 3.
for line in ('class 3:', '-'*10):
    print(line)
for label, value in (('mean acc:', score_3), ('f1 score:', f1_score_3)):
    print(label, value)

class 3:
----------
mean acc: 0.7593045368106492
f1 score: 0.7712958182756842


In [42]:
# Confusion-matrix breakdown of the class-3 dev predictions.
TP, FP, TN, FN, precision, recall, f1 = perf_measure(
    y_actual=Dev_data['c3_y_dev'],
    y_pred=y_pred_3,
)

TP: 1494
FP: 442
TN: 1301
FN: 444
total: 3681

Acc: 0.7593045368106492
Precision: 0.7716942148760331
Recall: 0.7708978328173375
F1: 0.7712958182756842


### class 4 - 黑斑病

In [38]:
%%time
# Fit the class-4 pipeline on the class-4 training features and labels.
# The call is the cell's last expression, so the returned Pipeline is displayed.
clf_4.fit(Training_data['c4_X'], Training_data['c4_y'])

Wall time: 3.44 s


Pipeline(steps=[('anova', SelectPercentile(percentile=20)),
                ('scaler', StandardScaler()),
                ('svc', SVC(class_weight='balanced', kernel='linear'))])

In [32]:
# Evaluate the class-4 classifier on the dev split: predictions, accuracy, F1.
X_dev_4 = Dev_data['c4_X_dev']
y_dev_4 = Dev_data['c4_y_dev']

y_pred_4 = clf_4.predict(X_dev_4)
score_4 = clf_4.score(X_dev_4, y_dev_4)
f1_score_4 = f1_score(y_dev_4, y_pred_4)

In [33]:
# Report dev-set accuracy and F1 for class 4.
for line in ('class 4:', '-'*10):
    print(line)
for label, value in (('mean acc:', score_4), ('f1 score:', f1_score_4)):
    print(label, value)

class 4:
----------
mean acc: 0.7079597935343657
f1 score: 0.21475529583637692


In [43]:
# Confusion-matrix breakdown of the class-4 dev predictions.
TP, FP, TN, FN, precision, recall, f1 = perf_measure(
    y_actual=Dev_data['c4_y_dev'],
    y_pred=y_pred_4,
)

TP: 147
FP: 1052
TN: 2459
FN: 23
total: 3681

Acc: 0.7079597935343657
Precision: 0.12260216847372811
Recall: 0.8647058823529412
F1: 0.21475529583637692


## f1 score

In [44]:
# Unweighted mean of the five per-class dev F1 scores.
per_class_f1 = [f1_score_0, f1_score_1, f1_score_2, f1_score_3, f1_score_4]
print('Marco f1:', np.mean(per_class_f1))

Marco f1: 0.40337817355284644


## save, load model

**Example: loading a model from disk:**
```python
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)
print(result)
```

save

In [59]:
# Persist each fitted pipeline to its own pickle file.
# A `with` block closes (and therefore flushes) every file handle
# deterministically instead of leaving it to garbage collection.
for model_name, model in (
    ('clf_0', clf_0),
    ('clf_1', clf_1),
    ('clf_2', clf_2),
    ('clf_3', clf_3),
    ('clf_4', clf_4),
):
    filename = './C2_TrainDev/model/baseline/' + model_name + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(model, f)

load

In [8]:
def _load_model(path):
    """Unpickle one saved pipeline from ``path`` and close the file afterwards.

    NOTE: ``pickle.load`` can execute arbitrary code while loading; only use
    it on model files you produced yourself.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# Restore the five per-class pipelines saved by the "save" cell.
clf_0 = _load_model('./C2_TrainDev/model/baseline/clf_0.pkl')
clf_1 = _load_model('./C2_TrainDev/model/baseline/clf_1.pkl')
clf_2 = _load_model('./C2_TrainDev/model/baseline/clf_2.pkl')
clf_3 = _load_model('./C2_TrainDev/model/baseline/clf_3.pkl')
clf_4 = _load_model('./C2_TrainDev/model/baseline/clf_4.pkl')