# baseline

In [1]:
import pickle

import numpy as np
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

## Loading data

In [2]:
Training_data = {}
Dev_data = {}

# Every class contributes four pickles: train X / train y / dev X / dev y.
# Each entry is (destination dict, sub-directory, suffix shared by key and file name).
base_dir = './C2_TrainDev/processed_data/baseline_crop/'
for cls in ('c0', 'c1', 'c2', 'c3', 'c4'):
    for store, split, suffix in (
        (Training_data, 'train', '_X'),
        (Training_data, 'train', '_y'),
        (Dev_data, 'dev', '_X_dev'),
        (Dev_data, 'dev', '_y_dev'),
    ):
        key = cls + suffix
        with open(f'{base_dir}{split}/{key}.pkl', 'rb') as fh:
            store[key] = pickle.load(fh)

## change dimension to fit svm

In [3]:
# Drop length-1 axes from every sample with np.squeeze, element by element and
# in place, first for the training features and then for the dev features.
for store, suffix in ((Training_data, '_X'), (Dev_data, '_X_dev')):
    for cls in ('c0', 'c1', 'c2', 'c3', 'c4'):
        key = cls + suffix
        samples = store[key]
        for idx in range(len(samples)):
            samples[idx] = np.squeeze(samples[idx])
        store[key] = samples

In [4]:
# Replace each label container with a NumPy array, training labels first,
# then dev labels.
for store, suffix in ((Training_data, '_y'), (Dev_data, '_y_dev')):
    for cls in ('c0', 'c1', 'c2', 'c3', 'c4'):
        key = cls + suffix
        store[key] = np.array(store[key])

## check the amount of data

In [5]:
print('train data\n')

# One block per class: number of feature rows and number of labels.
for cls in ('c0', 'c1', 'c2', 'c3', 'c4'):
    print(f'[{cls}]')
    print(f"X: {len(Training_data[cls + '_X'])}")
    print(f"y: {len(Training_data[cls + '_y'])}")
    print('-' * 20)

train data

[c0]
X: 5158
y: 5158
--------------------
[c1]
X: 1004
y: 1004
--------------------
[c2]
X: 39566
y: 39566
--------------------
[c3]
X: 30090
y: 30090
--------------------
[c4]
X: 3314
y: 3314
--------------------


In [6]:
print('Dev data\n')

# One block per class: number of dev feature rows and number of dev labels.
for cls in ('c0', 'c1', 'c2', 'c3', 'c4'):
    print(f'[{cls}]')
    print(f"X: {len(Dev_data[cls + '_X_dev'])}")
    print(f"y: {len(Dev_data[cls + '_y_dev'])}")
    print('-' * 20)

Dev data

[c0]
X: 6280
y: 6280
--------------------
[c1]
X: 6280
y: 6280
--------------------
[c2]
X: 6280
y: 6280
--------------------
[c3]
X: 6280
y: 6280
--------------------
[c4]
X: 6280
y: 6280
--------------------


## SVM

Defining the SVM models (one pipeline per class)

In [7]:
def _make_clf(percentile, C):
    """Build one feature-selection -> scaling -> linear-SVC pipeline.

    percentile: percentage of features kept by SelectPercentile.
    C: regularisation parameter passed to the SVC.
    """
    return Pipeline([
        ('anova', SelectPercentile(percentile=percentile)),
        ('scaler', StandardScaler()),
        ('svc', SVC(C=C, kernel='linear', class_weight='balanced')),
    ])


# Per-class hyper-parameters; only `percentile` and `C` differ between classes.
clf_0 = _make_clf(percentile=50, C=1.0)
clf_1 = _make_clf(percentile=10, C=0.1)
clf_2 = _make_clf(percentile=20, C=0.1)
clf_3 = _make_clf(percentile=10, C=0.1)
clf_4 = _make_clf(percentile=20, C=1.0)

Helper to calculate and print the evaluation metrics

In [8]:
def perf_measure(y_actual, y_pred):
    """
    Compute, print and return binary-classification metrics.

    Labels are compared against 0 (negative) and 1 (positive). A prediction
    that is neither 0 nor 1 is not counted in any confusion-matrix cell.

    Parameters
    ----------
    y_actual : sequence
        Ground-truth labels, indexed by position.
    y_pred : sequence
        Predicted labels; its length decides how many samples are scored.

    Returns
    -------
    tuple
        (TP, FP, TN, FN, precision, recall, f1). Precision, recall and F1 are
        reported as 0.0 when their denominator is zero (e.g. no positive
        predictions) instead of raising ZeroDivisionError.
    """
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in range(len(y_pred)):
        actual, pred = y_actual[i], y_pred[i]
        if pred == 1:
            if actual == pred:
                TP += 1
            else:
                FP += 1
        elif pred == 0:
            if actual == pred:
                TN += 1
            else:
                FN += 1

    total = TP + FP + TN + FN

    # Guard every ratio: a classifier that never predicts the positive class
    # (or an empty input) would otherwise divide by zero.
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    accuracy = (TP + TN) / total if total else 0.0

    print('TP:',TP)
    print('FP:',FP)
    print('TN:',TN)
    print('FN:',FN)
    print('total:',total)
    print()
    print('Acc:',accuracy)
    print('Precision:',precision)
    print('Recall:',recall)
    print('F1:',f1)

    return (TP, FP, TN, FN, precision, recall, f1)

### class 0 - 乳汁吸附

In [9]:
%%time
# Fit the class-0 pipeline on the class-0 training features and labels;
# the %%time cell magic reports how long the fit takes.
clf_0.fit(Training_data['c0_X'], Training_data['c0_y'])

Wall time: 48.3 s


Pipeline(steps=[('anova', SelectPercentile(percentile=50)),
                ('scaler', StandardScaler()),
                ('svc', SVC(class_weight='balanced', kernel='linear'))])

In [10]:
# Predict once and derive both metrics from the same predictions.
# clf_0.score(X, y) computes this same mean accuracy but runs a second
# full predict pass over the dev set to do so.
y_pred_0 = clf_0.predict(Dev_data['c0_X_dev'])
score_0 = accuracy_score(Dev_data['c0_y_dev'], y_pred_0)
f1_score_0 = f1_score(Dev_data['c0_y_dev'], y_pred_0)

In [11]:
# Dev-set summary for class 0: header, rule, then one line per metric.
print('class 0:', '-' * 10, sep='\n')
for label, value in (('mean acc:', score_0), ('f1 score:', f1_score_0)):
    print(label, value)

class 0:
----------
mean acc: 0.6985668789808918
f1 score: 0.24310275889644137


In [12]:
# Confusion-matrix counts and derived metrics for class 0 on the dev set.
TP, FP, TN, FN, precision, recall, f1 = perf_measure(
    Dev_data['c0_y_dev'],
    y_pred_0,
)

TP: 304
FP: 1806
TN: 4083
FN: 87
total: 6280

Acc: 0.6985668789808918
Precision: 0.14407582938388624
Recall: 0.7774936061381074
F1: 0.24310275889644137


### class 1 - 機械傷害

In [13]:
%%time
# Fit the class-1 pipeline on the class-1 training features and labels;
# the %%time cell magic reports how long the fit takes.
clf_1.fit(Training_data['c1_X'], Training_data['c1_y'])

Wall time: 185 ms


Pipeline(steps=[('anova', SelectPercentile()), ('scaler', StandardScaler()),
                ('svc', SVC(C=0.1, class_weight='balanced', kernel='linear'))])

In [14]:
# Predict once and derive both metrics from the same predictions.
# clf_1.score(X, y) computes this same mean accuracy but runs a second
# full predict pass over the dev set to do so.
y_pred_1 = clf_1.predict(Dev_data['c1_X_dev'])
score_1 = accuracy_score(Dev_data['c1_y_dev'], y_pred_1)
f1_score_1 = f1_score(Dev_data['c1_y_dev'], y_pred_1)

In [15]:
# Dev-set summary for class 1: header, rule, then one line per metric.
print('class 1:', '-' * 10, sep='\n')
for label, value in (('mean acc:', score_1), ('f1 score:', f1_score_1)):
    print(label, value)

class 1:
----------
mean acc: 0.6488853503184714
f1 score: 0.05242801890846584


In [16]:
# Confusion-matrix counts and derived metrics for class 1 on the dev set.
TP, FP, TN, FN, precision, recall, f1 = perf_measure(
    Dev_data['c1_y_dev'],
    y_pred_1,
)

TP: 61
FP: 2176
TN: 4014
FN: 29
total: 6280

Acc: 0.6488853503184714
Precision: 0.027268663388466695
Recall: 0.6777777777777778
F1: 0.05242801890846584


### class 2 - 炭疽病

In [17]:
%%time
# Fit the class-2 pipeline on the class-2 training features and labels;
# the %%time cell magic reports how long the fit takes.
clf_2.fit(Training_data['c2_X'], Training_data['c2_y'])

Wall time: 16min 56s


Pipeline(steps=[('anova', SelectPercentile(percentile=20)),
                ('scaler', StandardScaler()),
                ('svc', SVC(C=0.1, class_weight='balanced', kernel='linear'))])

In [18]:
# Predict once and derive both metrics from the same predictions.
# clf_2.score(X, y) computes this same mean accuracy but runs a second
# full predict pass over the dev set to do so.
y_pred_2 = clf_2.predict(Dev_data['c2_X_dev'])
score_2 = accuracy_score(Dev_data['c2_y_dev'], y_pred_2)
f1_score_2 = f1_score(Dev_data['c2_y_dev'], y_pred_2)

In [19]:
# Dev-set summary for class 2: header, rule, then one line per metric.
print('class 2:', '-' * 10, sep='\n')
for label, value in (('mean acc:', score_2), ('f1 score:', f1_score_2)):
    print(label, value)

class 2:
----------
mean acc: 0.8775477707006369
f1 score: 0.8942955326460481


In [20]:
# Confusion-matrix counts and derived metrics for class 2 on the dev set.
TP, FP, TN, FN, precision, recall, f1 = perf_measure(
    Dev_data['c2_y_dev'],
    y_pred_2,
)

TP: 3253
FP: 518
TN: 2258
FN: 251
total: 6280

Acc: 0.8775477707006369
Precision: 0.8626359055953328
Recall: 0.9283675799086758
F1: 0.8942955326460481


### class 3 - 著色不佳

In [21]:
%%time
# Fit the class-3 pipeline on the class-3 training features and labels;
# the %%time cell magic reports how long the fit takes.
clf_3.fit(Training_data['c3_X'], Training_data['c3_y'])

Wall time: 1min 21s


Pipeline(steps=[('anova', SelectPercentile()), ('scaler', StandardScaler()),
                ('svc', SVC(C=0.1, class_weight='balanced', kernel='linear'))])

In [22]:
# Predict once and derive both metrics from the same predictions.
# clf_3.score(X, y) computes this same mean accuracy but runs a second
# full predict pass over the dev set to do so.
y_pred_3 = clf_3.predict(Dev_data['c3_X_dev'])
score_3 = accuracy_score(Dev_data['c3_y_dev'], y_pred_3)
f1_score_3 = f1_score(Dev_data['c3_y_dev'], y_pred_3)

In [23]:
# Dev-set summary for class 3: header, rule, then one line per metric.
print('class 3:', '-' * 10, sep='\n')
for label, value in (('mean acc:', score_3), ('f1 score:', f1_score_3)):
    print(label, value)

class 3:
----------
mean acc: 0.938375796178344
f1 score: 0.9064539521392313


In [24]:
# Confusion-matrix counts and derived metrics for class 3 on the dev set.
TP, FP, TN, FN, precision, recall, f1 = perf_measure(
    Dev_data['c3_y_dev'],
    y_pred_3,
)

TP: 1875
FP: 265
TN: 4018
FN: 122
total: 6280

Acc: 0.938375796178344
Precision: 0.8761682242990654
Recall: 0.9389083625438157
F1: 0.9064539521392313


### class 4 - 黑斑病

In [25]:
%%time
# Fit the class-4 pipeline on the class-4 training features and labels;
# the %%time cell magic reports how long the fit takes.
clf_4.fit(Training_data['c4_X'], Training_data['c4_y'])

Wall time: 10.4 s


Pipeline(steps=[('anova', SelectPercentile(percentile=20)),
                ('scaler', StandardScaler()),
                ('svc', SVC(class_weight='balanced', kernel='linear'))])

In [26]:
# Predict once and derive both metrics from the same predictions.
# clf_4.score(X, y) computes this same mean accuracy but runs a second
# full predict pass over the dev set to do so.
y_pred_4 = clf_4.predict(Dev_data['c4_X_dev'])
score_4 = accuracy_score(Dev_data['c4_y_dev'], y_pred_4)
f1_score_4 = f1_score(Dev_data['c4_y_dev'], y_pred_4)

In [27]:
# Dev-set summary for class 4: header, rule, then one line per metric.
print('class 4:', '-' * 10, sep='\n')
for label, value in (('mean acc:', score_4), ('f1 score:', f1_score_4)):
    print(label, value)

class 4:
----------
mean acc: 0.6523885350318471
f1 score: 0.1632809505557685


In [28]:
# Confusion-matrix counts and derived metrics for class 4 on the dev set.
TP, FP, TN, FN, precision, recall, f1 = perf_measure(
    Dev_data['c4_y_dev'],
    y_pred_4,
)

TP: 213
FP: 2098
TN: 3884
FN: 85
total: 6280

Acc: 0.6523885350318471
Precision: 0.09216789268714842
Recall: 0.714765100671141
F1: 0.1632809505557685


## f1 score

In [29]:
# Unweighted mean of the five per-class dev-set F1 scores.
per_class_f1 = [f1_score_0, f1_score_1, f1_score_2, f1_score_3, f1_score_4]
print('Marco f1:', np.mean(per_class_f1))

Marco f1: 0.45191224262919105


## save, load model

**Example: loading a saved model from disk:**
```python
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)
print(result)
```

save

In [30]:
# Persist each fitted pipeline to its own pickle file. The `with` block closes
# (and therefore flushes) every file handle; the previous
# pickle.dump(obj, open(...)) form never closed them explicitly.
for model_name, model in (
    ('clf_0', clf_0),
    ('clf_1', clf_1),
    ('clf_2', clf_2),
    ('clf_3', clf_3),
    ('clf_4', clf_4),
):
    filename = './C2_TrainDev/model/baseline_crop/' + model_name + '.pkl'
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

load

In [8]:
def _load_model(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    # NOTE: pickle.load can execute arbitrary code while unpickling; only use
    # this on model files you produced yourself or otherwise trust.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


clf_0 = _load_model('./C2_TrainDev/model/baseline_crop/clf_0.pkl')
clf_1 = _load_model('./C2_TrainDev/model/baseline_crop/clf_1.pkl')
clf_2 = _load_model('./C2_TrainDev/model/baseline_crop/clf_2.pkl')
clf_3 = _load_model('./C2_TrainDev/model/baseline_crop/clf_3.pkl')
clf_4 = _load_model('./C2_TrainDev/model/baseline_crop/clf_4.pkl')