In [1]:
import pandas as pd
import spacy
import glob
import re
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.externals import joblib
import sys

In [48]:
#df = pd.read_csv('../data/models/cookie_tagged_551.csv', sep = ';')
df = pd.read_csv('../data/models/cookie_tagged.csv', sep = ';')
#df.head()

In [3]:
X = df['pos_text_complete'] 
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
print(y_test.sum()/len(y_test))
print(y.sum()/len(y))

0.6306306306306306
0.5597826086956522


In [17]:
pipeline = Pipeline(steps=[('vect', CountVectorizer()), 
                           ('cls', SVC()), 
                    ])

parameters = {          
              'cls__C':  (0.001, 0.01, 1, 10), 
              'cls__kernel':  ['linear', 'poly', 'rbf'],
              'cls__degree' : (2,3,4),
              'cls__gamma' : [0.0001,0.001,0.01,0.1,1] 
              }

grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=parameters,
                           scoring='roc_auc',
                           verbose=3,
                           n_jobs = -1,
                           cv=10                        
                          )
                    

grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 180 candidates, totalling 1800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 480 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done 1120 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done 1536 tasks      | elapsed:   33.9s
[Parallel(n_jobs=-1)]: Done 1800 out of 1800 | elapsed:   39.6s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...f', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'cls__C': (0.001, 0.01, 1, 10), 'cls__kernel': ['linear', 'poly', 'rbf'], 'cls__degree': (2, 3, 4), 'cls__gamma': [0.0001, 0.001, 0.01, 0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=3)

In [18]:
grid_search.best_params_

{'cls__C': 0.01,
 'cls__degree': 2,
 'cls__gamma': 0.0001,
 'cls__kernel': 'linear'}

In [19]:
best_svm = grid_search.best_estimator_

In [20]:
# Form a prediction set
predictions = best_svm.predict(X_test)

# Report the confusion matrix
confusion_matrix_model =(metrics.confusion_matrix(y_test,predictions)) 
target = ['Control', 'Dementia']

confusion_matrix_df = pd.DataFrame(confusion_matrix_model, index=target, columns=target)

confusion_matrix_df.columns.name = "Predicted"
confusion_matrix_df.index.name = "Real"

confusion_matrix_df

Predicted,Control,Dementia
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
Control,35,6
Dementia,5,65


In [21]:
# Print a classification report
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.88      0.85      0.86        41
           1       0.92      0.93      0.92        70

   micro avg       0.90      0.90      0.90       111
   macro avg       0.90      0.89      0.89       111
weighted avg       0.90      0.90      0.90       111



In [22]:
print(metrics.accuracy_score(y_test,predictions))

0.9009009009009009


In [23]:
joblib.dump(best_svm, 'best_svm.sav' )

['best_svm.sav']

## Use clear text for App model

In [24]:
X = df['text_for_POS'] 
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
pipeline = Pipeline(steps=[('vect', CountVectorizer()), 
                           ('cls', SVC())])
                    
 

parameters = {
              'cls__C': [0.01],              
              'cls__kernel': ['linear'],          
              'cls__probability' : [True]
             }



grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=parameters,
                           scoring='roc_auc',
                           verbose=3,
                           n_jobs = -1,
                           cv=10                       
                          )
                         


grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.3s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.4s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...f', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'cls__C': [0.01], 'cls__kernel': ['linear'], 'cls__probability': [True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=3)

In [42]:
grid_search.best_params_

{'cls__C': 0.01, 'cls__kernel': 'linear', 'cls__probability': True}

In [43]:
best_svm_app = grid_search.best_estimator_

In [44]:
# Form a prediction set
predictions = best_svm_app.predict(X_test)

# Report the confusion matrix
confusion_matrix_model =(metrics.confusion_matrix(y_test,predictions)) 
target = ['Control', 'Dementia']

confusion_matrix_df = pd.DataFrame(confusion_matrix_model, index=target, columns=target)

confusion_matrix_df.columns.name = "Predicted"
confusion_matrix_df.index.name = "Real"

confusion_matrix_df

Predicted,Control,Dementia
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
Control,32,9
Dementia,5,65


In [45]:
# Print a classification report
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.86      0.78      0.82        41
           1       0.88      0.93      0.90        70

   micro avg       0.87      0.87      0.87       111
   macro avg       0.87      0.85      0.86       111
weighted avg       0.87      0.87      0.87       111



In [46]:
print(metrics.accuracy_score(y_test,predictions))

0.8738738738738738


In [47]:
joblib.dump(best_svm_app, 'best_svm_app.sav' )

['best_svm_app.sav']

In [40]:
loaded_model = joblib.load('best_svm_app.sav')
loaded_model

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ar', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])