In [1]:
import pandas as pd
import spacy
import numpy as np
import glob
import re
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.externals import joblib
import sys

In [2]:
# Load the semicolon-delimited CSV into a DataFrame.
df = pd.read_csv(
    '../data/models/cookie_tagged.csv',
    sep=';',
)

In [3]:
# Number of rows in the loaded frame.
df.shape[0]

552

## NBSVM

See https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline

Prepare the data frame

In [6]:
# Derive one 0/1 indicator column per class from `label`:
# `control` is 1 where label == 0, `dementia` is 1 where label == 1.
# A vectorized comparison replaces the row-by-row `apply(..., axis=1)`.
df['control'] = (df['label'] == 0).astype('int64')
df['dementia'] = (df['label'] == 1).astype('int64')

In [7]:
import re, string
from sklearn.linear_model import LogisticRegression

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Ground-truth labels of the held-out rows. The evaluation cells read `y_test`,
# but no cell in the notebook assigned it, so a fresh "Restart & Run All"
# stopped with a NameError.
y_test = test['label'].values

In [9]:
# Character length of every training text, followed by (mean, std, max).
lens = train['text'].str.len()
(lens.mean(), lens.std(), lens.max())

(637.1768707482993, 331.3040151640513, 2768)

In [10]:
# Histogram of the training-text lengths; the trailing ';' suppresses the cell's text output.
lens.hist();

In [11]:
# Indicator columns to model; their order fixes the column order of `preds`.
label_cols = [
    'control',
    'dementia',
]

In [12]:
# Characters that become stand-alone tokens. `re.escape` is required because
# `string.punctuation` contains `\`, `]`, `^` and `-`, which are special inside
# a character class: unescaped, the `\]` pair was parsed as an escaped `]`, so
# the backslash itself was never treated as punctuation.
_EXTRA_PUNCT = '“”¨«»®´·º½¾¿¡§£₤‘’'
re_tok = re.compile(f'([{re.escape(string.punctuation)}{_EXTRA_PUNCT}])')


def tokenize(s):
    """Split `s` into tokens, isolating every punctuation character.

    Each character matched by `re_tok` is padded with spaces so that it becomes
    its own token; the padded string is then split on whitespace.

    Parameters
    ----------
    s : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance (an empty list for empty or
        whitespace-only input).
    """
    return re_tok.sub(r' \1 ', s).split()

In [24]:
TEXT = 'pos_text_complete'

def docs_for_column(column):
    """Build TF-IDF document-term matrices for one text column.

    The vectorizer is fitted on the training split only and then applied to the
    test split, so both matrices share the same vocabulary. Reads the
    module-level `train` and `test` frames and the `tokenize` function.

    Parameters
    ----------
    column : str
        Name of the text column present in both `train` and `test`.

    Returns
    -------
    tuple
        `(trn_term_doc, test_term_doc)`: the matrices returned by
        `fit_transform` on `train[column]` and `transform` on `test[column]`.
    """
    # Unigrams and bigrams; terms must occur in at least 3 documents and in at
    # most 90% of them. The three flags are real booleans because newer
    # scikit-learn releases reject integers for boolean parameters.
    vec = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize,
                          min_df=3, max_df=0.9, strip_accents='unicode',
                          use_idf=True, smooth_idf=True, sublinear_tf=True)
    trn_term_doc = vec.fit_transform(train[column])
    test_term_doc = vec.transform(test[column])
    return trn_term_doc, test_term_doc

trn_term_doc, test_term_doc = docs_for_column(TEXT)

In [25]:
# Display the train and test document-term matrices (shape and stored-element count).
(trn_term_doc, test_term_doc)

(<441x3402 sparse matrix of type '<class 'numpy.float64'>'
 	with 103683 stored elements in Compressed Sparse Row format>,
 <111x3402 sparse matrix of type '<class 'numpy.float64'>'
 	with 25557 stored elements in Compressed Sparse Row format>)

Here's the basic Naive Bayes feature equation:

In [26]:
def pr(y_i, y):
    """Smoothed per-feature total for the rows whose label equals `y_i`.

    Reads the module-level feature matrix `x`. The column sums over the
    selected rows are incremented by 1 and divided by the number of selected
    rows plus 1.

    Parameters
    ----------
    y_i : int
        Class value to select.
    y : array-like
        Labels aligned with the rows of `x`.
    """
    selected = y == y_i
    feature_totals = x[selected].sum(0)
    return (feature_totals + 1) / (selected.sum() + 1)

Use our vectorized documents as input.

In [27]:
# `pr` and `get_mdl` read the module-level `x`, so bind the matrices to those names.
x, test_x = trn_term_doc, test_term_doc

Fit a model for one dependent variable at a time:

In [28]:
def get_mdl(y):
    """Fit a logistic regression on Naive-Bayes-scaled features for one label.

    Reads the module-level training matrix `x`.

    Parameters
    ----------
    y : pandas.Series
        0/1 target aligned with the rows of `x`.

    Returns
    -------
    tuple
        `(model, r)`: the fitted `LogisticRegression` and the log-count ratio
        `r`, which must also be applied to any matrix passed to the model.
    """
    targets = y.values
    # Log of the ratio between the smoothed feature totals of class 1 and class 0.
    r = np.log(pr(1, targets) / pr(0, targets))
    clf = LogisticRegression(C=12, dual=False, solver='lbfgs', class_weight='balanced')
    # Element-wise multiply the features by the ratio before fitting.
    scaled_features = x.multiply(r)
    return clf.fit(scaled_features, targets), r

In [29]:
# One column of class-1 probabilities per entry of `label_cols`.
preds = np.zeros((test.shape[0], len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    m, r = get_mdl(train[j])
    # The test features get the same ratio scaling the model was trained with.
    test_x_nb = test_x.multiply(r)
    preds[:, i] = m.predict_proba(test_x_nb)[:, 1]

fit control
fit dementia


In [30]:
# Predict 1 where the `dementia` probability (column 1) exceeds the `control`
# probability (column 0), else 0; stored as a plain list of ints.
y_hat = (preds[:, 1] > preds[:, 0]).astype(int).tolist()

In [31]:
# Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.82      0.78      0.80        41
           1       0.88      0.90      0.89        70

   micro avg       0.86      0.86      0.86       111
   macro avg       0.85      0.84      0.84       111
weighted avg       0.85      0.86      0.86       111



In [32]:
# Overall accuracy on the held-out rows.
accuracy = metrics.accuracy_score(y_test, y_hat)
print(accuracy)

0.8558558558558559


### Repeat the same thing for the clean text column

In [33]:
TEXT = 'text_for_POS'
trn_term_doc, test_term_doc = docs_for_column(TEXT)

# Rebind the module-level matrices that `pr` and `get_mdl` read.
x, test_x = trn_term_doc, test_term_doc

# One column of class-1 probabilities per entry of `label_cols`.
preds = np.zeros((test.shape[0], len(label_cols)))
for i, j in enumerate(label_cols):
    print('fit', j)
    m, r = get_mdl(train[j])
    preds[:, i] = m.predict_proba(test_x.multiply(r))[:, 1]

# 1 where the `dementia` probability exceeds the `control` probability, else 0.
y_hat = (preds[:, 1] > preds[:, 0]).astype(int).tolist()

print(classification_report(y_test, y_hat))
print(metrics.accuracy_score(y_test, y_hat))

fit control
fit dementia
              precision    recall  f1-score   support

           0       0.81      0.83      0.82        41
           1       0.90      0.89      0.89        70

   micro avg       0.86      0.86      0.86       111
   macro avg       0.85      0.86      0.86       111
weighted avg       0.87      0.86      0.87       111

0.8648648648648649


-------------