In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
df = pd.read_csv('../data/irtm.csv')

In [None]:
df.head()

In [None]:
df[[len(str(c)) < 100 for c in df['content']]]

In [None]:
# Keep only rows whose content is at least 100 characters long (after str()
# conversion, so missing values count as the short string 'nan') and whose
# object_class is one of the three classes used in the rest of the notebook.
has_enough_text = df['content'].map(lambda text: len(str(text))) >= 100
is_target_class = df['object_class'].isin(['euclid', 'keter', 'safe'])
# reset_index(drop=True) renumbers the surviving rows 0..n-1.
df = df[has_enough_text & is_target_class].reset_index(drop=True)
df.head()

In [None]:
df.shape

### Create dictionary

In [None]:
from pa2 import make_dict
# dic = make_dict(df['content'])

In [None]:
from pa2 import save_dict
# save_dict(dic, 'dictionary.txt')

### Create vectors

In [None]:
from pa2 import create_vector
# create_vector(df['content'], df.index)

### Get dataset

In [None]:
from pa3_NB import *

In [None]:
# pd.factorize returns a (codes, uniques) pair: fact[0] is one integer code per
# row of df, fact[1] holds the distinct object_class values the codes index into.
fact = pd.factorize(df.object_class)

In [None]:
len(fact[1])

In [None]:
# One position per row of df, derived from the frame instead of a hard-coded
# row count so the two arguments always have the same length.
# Labels are the factorized class codes shifted to start at 1.
dataset = Dataset(np.arange(len(df)), fact[0] + 1)

In [None]:
terms = read_dict(Config.dictionary_file)

In [None]:
# Pull the second field of each dictionary entry (plotted as document
# frequency) into its own list; the loop variable no longer reuses the name `df`.
doc_freqs = [doc_freq for _idx, (_term, doc_freq, _coll_freq) in terms]
plt.hist(doc_freqs, density=True)
plt.title('Document Frequency')
plt.show()

In [None]:
# Pull the third field of each dictionary entry (plotted as collection
# frequency) into its own list; the unpacking no longer reuses the name `df`.
coll_freqs = [coll_freq for _idx, (_term, _doc_freq, coll_freq) in terms]
plt.hist(coll_freqs, density=True)
plt.title('Collection Frequency')
plt.show()

### Feature Selection

In [None]:
dictionary = max_seg_chi2(terms, dataset, size=150)

In [None]:
clf = Classifier(dictionary, class_size=3)

In [None]:
ma_train_f1, mi_train_f1, ma_valid_f1, mi_valid_f1 = cross_validation(clf, dataset, seed=1126)

In [None]:
print(f'macro f1 (train): {ma_train_f1}')
print(f'micro f1 (train): {mi_train_f1}')
print(f'macro f1 (valid): {ma_valid_f1}')
print(f'micro f1 (valid): {mi_valid_f1}')

### Confusion Matrix

In [None]:
def confusion_matrix(clf, dataset, fold=10, permutation=True, seed=None):
    """Build a fold-averaged confusion matrix for ``clf`` via k-fold evaluation.

    The sample positions ``0..len(dataset)-1`` are cut into ``fold`` consecutive
    slices of ``ceil(n / fold)`` positions. For each slice the classifier is
    re-trained on the remaining positions and evaluated on the slice.

    Args:
        clf: object exposing ``train(dataset)`` and ``predict(dataset)``.
        dataset: object supporting ``len()`` and accepted by ``train_test_split``.
        fold: number of folds.
        permutation: shuffle the positions before slicing when True; otherwise
            slice them in their natural order.
        seed: when not None, seeds both NumPy's global RNG and ``random``.

    Returns:
        A ``(Config.class_size, Config.class_size)`` float array. Entry
        ``[p, t]`` is the number of test samples predicted as class ``p + 1``
        whose true class is ``t + 1``, summed over all folds and divided by
        ``fold``.

    Side effects:
        ``clf`` is re-trained once per fold; the global RNGs are reseeded when
        ``seed`` is given.

    NOTE(review): assumes labels run from 1 to ``Config.class_size`` and that
    column 1 of ``clf.predict``'s result holds the predicted label -- confirm
    against pa3_NB.
    """
    # Local import: the notebook itself never imports `random`, so the name
    # would otherwise only resolve if the star import happened to expose it.
    import random

    # `if seed:` would silently ignore a legitimate seed of 0.
    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)

    n = len(dataset)
    permute = np.random.permutation(n) if permutation else np.arange(n)
    width = int(np.ceil(n / fold))
    mat = np.zeros((Config.class_size, Config.class_size))

    for fold_idx in range(fold):
        left = fold_idx * width
        right = left + width
        train_permute = np.concatenate((permute[:left], permute[right:]))
        test_permute = permute[left:right]
        dataset_train, dataset_test = train_test_split(dataset, train_permute, test_permute)
        clf.train(dataset_train)
        pred = clf.predict(dataset_test)
        # Separate index name so the fold counter is not shadowed.
        for sample_idx in range(len(dataset_test)):
            predicted = int(pred[sample_idx, 1]) - 1  # int cast keeps the index valid for float predictions
            actual = int(dataset_test.y[sample_idx] - 1)
            mat[predicted, actual] += 1

    # Average the accumulated counts over the folds.
    mat /= fold
    return mat

In [None]:
mat = confusion_matrix(clf, dataset)

- safe < euclid < keter
- thaumiel
- neutralized

In [None]:
import seaborn as sns
print(fact[1])
# confusion_matrix fills mat[predicted, true], so rows are predicted classes
# and columns are true classes; label both axes with the class names so the
# figure can be read without the printed class order above.
class_names = list(fact[1])
heat_ax = sns.heatmap(mat, annot=True,  linewidths=.5,
                      xticklabels=class_names, yticklabels=class_names)
heat_ax.set_xlabel('True class')
heat_ax.set_ylabel('Predicted class')
plt.show()

In [None]:
print('Precision')
print(fact[1])
print(np.diag(mat) / np.sum(mat, axis=1))

In [None]:
print('Recall')
print(fact[1])
print(np.diag(mat) / np.sum(mat, axis=0))

In [None]:
print('F1')
print(fact[1])
# confusion_matrix fills mat[predicted, true], so row sums are the per-class
# predicted totals and column sums are the per-class true totals.
p = np.diag(mat) / np.sum(mat, axis=1)  # precision per class
r = np.diag(mat) / np.sum(mat, axis=0)  # recall per class
# Harmonic mean of precision and recall; evaluates to NaN where p + r is 0.
f = 2 * p * r / (p + r)
print(f)

In [None]:
fig, ax = plt.subplots()

# One tick per class, in the order of fact[1].
x = np.arange(len(fact[1]))

# Three bars per class, each one bar-width apart and centred on the tick.
width = 0.2
rects1 = ax.bar(x - width, p, width, label='Precision')
rects2 = ax.bar(x, r, width, label='Recall')
# Distinct name: this was previously assigned to `rects1`, overwriting the
# handle of the precision bars.
rects3 = ax.bar(x + width, f, width, label='F1')

ax.set_ylabel('Scores')
ax.set_title('Scores by class')
ax.set_xticks(x)
ax.set_xticklabels(fact[1])
ax.legend()

plt.show()

In [None]:
# NOTE(review): df was already restricted to 'euclid', 'keter' and 'safe' by the
# filtering cell near the top, so on a top-to-bottom run this count is always 0.
# To count 'thaumiel' rows, evaluate this against the unfiltered frame.
sum(df.object_class == 'thaumiel')

### Check the selected terms

In [None]:
# Show the dictionary entries picked by feature selection.
# NOTE(review): the `- 1` suggests `dictionary` holds 1-based term ids while
# `terms` is in id order -- confirm against max_seg_chi2 / read_dict.
np.array(terms, dtype=object)[np.array(dictionary)-1]