In [129]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from pprint import pprint
import pandas as pd
import numpy as np
import eli5
import time
from sklearn.model_selection import cross_validate

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [130]:
SIZE=1
data = pd.read_csv('data/resized_data_scaled_%d.csv'%SIZE)

# data.head(5)

In [131]:
col_names = data.columns
nvars = len(col_names)
n_rows = int(np.sqrt((nvars-1)/3.0))
n_cols = int(np.sqrt((nvars-1)/3.0))

print(col_names)
print(f"NVARS: {nvars}, NROWS: {n_rows}, NCOLS:{n_cols}")

Index(['y', 'X_0|0|0', 'X_0|0|1', 'X_0|0|2'], dtype='object')
NVARS: 4, NROWS: 1, NCOLS:1


In [132]:
Y = [[str(int(y[0]))] for y in data[['y']].values.astype(np.uint8)]
X = data[col_names[1:]].values

# Sentence is image, pixel is node (or word), and RGB are features

def get_features(sample, col_names):
    features = {col_names[i+1]: float(sample[i]) for i in range(sample.shape[0])}
    features['bias'] = 1.0
    return features

print("Features:")
pprint(get_features(X[0], col_names))

X = [[get_features(X[i], col_names)] for i in range(X.shape[0])]

pprint(X[2])

Features:
{'X_0|0|0': 0.592156862745098,
 'X_0|0|1': 0.5137254901960784,
 'X_0|0|2': 0.407843137254902,
 'bias': 1.0}
[{'X_0|0|0': 0.2352941176470588,
  'X_0|0|1': 0.207843137254902,
  'X_0|0|2': 0.1411764705882353,
  'bias': 1.0}]


In [133]:
np.random.seed(42)

In [144]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.333333)
print(len(X_train), len(y_train))
print(y_train[:10])
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

6050 6050
[['1'], ['0'], ['1'], ['0'], ['0'], ['2'], ['0'], ['3'], ['3'], ['1']]


In [135]:
start = time.time()
crf.fit(X_train, y_train)
fit_time = time.time()-start

In [136]:
y_pred_train = crf.predict(X_train)

start = time.time()
y_pred_test = crf.predict(X_test)
score_time = time.time()-start

In [137]:
print(metrics.flat_classification_report(
    y_train, y_pred_train, digits=3
))

print(metrics.flat_classification_report(
    y_test, y_pred_test, digits=3
))

              precision    recall  f1-score   support

           0      0.818     0.852     0.835      1486
           1      0.454     0.501     0.476      1511
           2      0.525     0.700     0.600      1541
           3      0.458     0.235     0.311      1512

   micro avg      0.572     0.572     0.572      6050
   macro avg      0.564     0.572     0.555      6050
weighted avg      0.562     0.572     0.554      6050

              precision    recall  f1-score   support

           0      0.825     0.840     0.832       783
           1      0.432     0.483     0.456       758
           2      0.512     0.703     0.593       728
           3      0.504     0.254     0.337       757

   micro avg      0.571     0.571     0.571      3026
   macro avg      0.568     0.570     0.555      3026
weighted avg      0.571     0.571     0.557      3026



In [138]:
eli5.show_weights(crf)

  rel_weight = (abs(weight) / weight_range) ** 0.7


From \ To,0,1,2,3
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+4.295,bias,,
-0.764,X_0|0|1,,
-2.048,X_0|0|0,,
-13.063,X_0|0|2,,
-0.478,bias,,
-0.677,X_0|0|0,,
-0.952,X_0|0|1,,
-1.174,X_0|0|2,,
+4.660,X_0|0|2,,
+2.638,X_0|0|1,,

Weight?,Feature
4.295,bias
-0.764,X_0|0|1
-2.048,X_0|0|0
-13.063,X_0|0|2

Weight?,Feature
-0.478,bias
-0.677,X_0|0|0
-0.952,X_0|0|1
-1.174,X_0|0|2

Weight?,Feature
4.66,X_0|0|2
2.638,X_0|0|1
2.236,X_0|0|0
-8.561,bias

Weight?,Feature
4.204,X_0|0|2
1.482,X_0|0|1
-1.134,X_0|0|0
-4.912,bias


In [139]:
y_train[:10]

[['3'], ['0'], ['0'], ['1'], ['2'], ['3'], ['1'], ['0'], ['2'], ['2']]

In [140]:
y_pred_train[:10]

[['2'], ['0'], ['0'], ['1'], ['1'], ['3'], ['1'], ['0'], ['2'], ['1']]

In [141]:
metrics = cross_validate(crf, X, Y, cv=3, return_train_score=True)
print(metrics)

{'fit_time': array([0.27971673, 0.26048923, 0.40167117]), 'score_time': array([0.01348829, 0.01560879, 0.02126169]), 'test_score': array([0.12723067, 0.00099174, 0.16561983]), 'train_score': array([0.57504132, 0.71095687, 0.73574616])}
