In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

from sklearn.externals import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from scipy.spatial import distance as dist

import time
from IPython.display import clear_output

# Load Dataset

In [2]:
# Read the cleaned claims CSV, using its first column as the row index,
# then preview the first five rows.
df = pd.read_csv(
    '../data/CleanData.csv',
    index_col=0,
)
df.head(n=5)

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


Unnamed: 0,INDEX,CLMANT_TXT,CNTNTN_CLSFCN_ID,CNTNTN_CLSFCN_TXT,modeClass,newClass
0,1,diabetes mellitus type 2,2085,diabetes mellitus type 2,endocrine,endocrine
1,2,diabetes mellitus type 2,2085,diabetes mellitus type 2,endocrine,endocrine
2,3,special monthly pension,8917,miscellaneous,pension aid and attendance housebound,pension - aid and attendence/housebound
3,4,special monthly pension,8917,miscellaneous,pension aid and attendance housebound,pension - aid and attendence/housebound
4,5,adjustment disorder,2030,depression,mental disorders,mental disorders


In [3]:
# (rows, columns) of the frame as loaded from disk.
df.shape

(6324799, 6)

In [4]:
# Keep only rows whose claim text is shorter than 300 characters. Rows with
# missing text yield a NaN length, compare as False, and are dropped as well.
claim_lengths = df['CLMANT_TXT'].str.len()
df = df[claim_lengths < 300]
df.shape

(6324248, 6)

# Let's remove from df the rows whose label does not match one of the official classes, but keep them in a separate dataframe for later.

In [5]:
# Row masks computed once, before df is reassigned below.
has_text = df['CLMANT_TXT'].notnull()
has_label = df['newClass'].notnull()

# Claims with text but no official class are set aside for later use.
dfOut = df[has_text & ~has_label]
# Only fully labelled claims stay in the modelling frame.
df = df[has_text & has_label]

# Initialize a CountVectorizer that keeps only terms appearing in at least 10 claims (min_df=10), removes English stop words, and uses n-grams of up to 3 words.

In [6]:
# Raw term counts over 1- to 3-word n-grams; terms found in fewer than
# 10 documents are ignored and English stop words are removed.
vectorizer = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    min_df=10,
)
# Alternative weighting scheme, kept for reference:
# vectorizer = TfidfVectorizer(min_df=10, ngram_range=(1,3), stop_words='english')

In [7]:
# Learn the vocabulary from the claim text and build the sparse
# document-term matrix in one pass.
X = vectorizer.fit_transform(df['CLMANT_TXT'])

In [8]:
# Target labels as a NumPy array, taken from the same df rows (and in the
# same order) as the text used to build X.
y = np.array(df['newClass'])

In [9]:
# Inspect the first row of the feature matrix.
X[:1]

<1x201833 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [10]:
# Inspect the label array.
y

array(['endocrine', 'endocrine',
       'pension - aid and attendence/housebound', ...,
       'pension - aid and attendence/housebound',
       'musculoskeletal - knee',
       'post traumatic stress disorder (ptsd) combat - mental disorders'],
      dtype=object)

# Split first to obtain Training set

In [11]:
# Split features, labels and the original row index together so every test
# row can be traced back to df later. test_size=0.7 holds out 70% of the rows.
# NOTE(review): the split is not stratified by class — confirm that is intended.
X_train, X_test, y_train, y_test, i_train, i_test = train_test_split(
    X, y, df.index,
    random_state=42,
    test_size=0.7,
)

# Initialize a Logistic Regression Classifier

In [12]:
# One-vs-rest logistic regression fitted with the L-BFGS solver;
# n_jobs=-1 asks scikit-learn to use every available core.
clf = LogisticRegression(
    solver='lbfgs',
    multi_class='ovr',
    n_jobs=-1,
)

In [13]:
# Train the classifier on the training split.
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

# Predict the value of our testing set

In [14]:
# Predict a class for every held-out row; y_pred is reused by the
# evaluation cells further down.
y_pred = clf.predict(X_test)
y_pred

array(['musculoskeletal - ankle', 'hearing loss', 'respiratory', ...,
       'musculoskeletal - foot', 'unemployability', 'skin'], dtype=object)

In [15]:
# First 100 predictions, for an eyeball comparison with the true labels.
y_pred[:100]

array(['musculoskeletal - ankle', 'hearing loss', 'respiratory',
       'musculoskeletal - neck/upper back (cervical spine)',
       'musculoskeletal - shoulder', 'musculoskeletal - foot',
       'musculoskeletal - foot', 'heart/veins/arteries',
       'mental disorders',
       'musculoskeletal - mid/lower back (thoracolumbar spine)',
       'leg condition - neurological other system',
       'neurological - cranial/peripheral nerves',
       'special monthly compensation - aid and attendence/housebound',
       'post traumatic stress disorder (ptsd) combat - mental disorders',
       'musculoskeletal - mid/lower back (thoracolumbar spine)', 'skin',
       'musculoskeletal - hand', 'musculoskeletal - arthritis',
       'respiratory', 'neurological - cranial/peripheral nerves',
       'musculoskeletal - shoulder', 'administrative issue',
       'respiratory', 'hearing loss', 'scars (extremeties and trunk)',
       'endocrine', 'genitourinary', 'musculoskeletal - knee',
       'musculos

In [17]:
# First 100 true labels, shown as a list for side-by-side comparison with the
# predictions. Slice before converting so only 100 elements are copied into a
# Python list instead of the entire test array.
list(y_test[:100])

['musculoskeletal - ankle',
 'hearing loss',
 'respiratory',
 'musculoskeletal - neck/upper back (cervical spine)',
 'musculoskeletal - shoulder',
 'musculoskeletal - foot',
 'musculoskeletal - foot',
 'heart/veins/arteries',
 'mental disorders',
 'musculoskeletal - mid/lower back (thoracolumbar spine)',
 'neurological other system',
 'neurological - cranial/peripheral nerves',
 'special monthly compensation - aid and attendence/housebound',
 'post traumatic stress disorder (ptsd) combat - mental disorders',
 'musculoskeletal - mid/lower back (thoracolumbar spine)',
 'skin',
 'musculoskeletal - hand',
 'musculoskeletal - arthritis',
 'respiratory',
 'neurological - cranial/peripheral nerves',
 'musculoskeletal - shoulder',
 'administrative issue',
 'respiratory',
 'hearing loss',
 'scars (extremeties and trunk)',
 'endocrine',
 'genitourinary',
 'musculoskeletal - knee',
 'musculoskeletal - foot',
 'pension',
 'musculoskeletal - knee',
 'ear disease and other sense organs',
 'administr

# Let's check the model's accuracy

In [20]:
# Sanity check on a single hand-written claim: vectorize it with the fitted
# vocabulary, then classify it.
sample_features = vectorizer.transform(['ringing in my ear'])
clf.predict(sample_features)

array(['hearing loss'], dtype=object)

In [21]:
# Overall accuracy on the held-out split. Reuses the predictions already
# computed in y_pred instead of calling clf.score, which would run predict()
# over the full test matrix a second time; the resulting value is the same.
accuracy_score(y_test, y_pred)

0.9165259551883319

# How about precision, recall and F1 score?

In [22]:
# Support-weighted average of the per-class precision, recall and F1.
# The fourth tuple element (support) is None when an average is requested.
precision_recall_fscore_support(y_test, y_pred, average='weighted')

  'precision', 'predicted', average, warn_for)


(0.913663704178678, 0.9165259551883319, 0.9134268738684035, None)

# How about a confusion matrix?

In [23]:
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_test, y_pred)

array([[  169,     0,     0, ...,     0,     2,     0],
       [    0,    60,     0, ...,     0,     0,     0],
       [    1,     0,    66, ...,     0,     0,     0],
       ...,
       [    0,     0,     0, ...,    55,     0,     0],
       [    0,     0,     0, ...,     0, 62403,     2],
       [    0,     0,     0, ...,     0,     0,   843]])

In [27]:
# Collect per-row test results: claim text, true label, predicted label and a
# 0/1 flag for whether the prediction was correct.
results = pd.DataFrame({'label': y_test, 'predictedLabel': y_pred})

# y_test / y_pred are plain arrays, so `results` has a fresh 0..n-1 index,
# whereas df['CLMANT_TXT'].loc[i_test] still carries the original df labels.
# Assigning that Series directly would align on index labels and attach the
# wrong claim (or NaN) to each row; `.values` matches the text by position,
# which is the order train_test_split produced for y_test and i_test.
results['claims'] = df['CLMANT_TXT'].loc[i_test].values

# Vectorized equality check instead of a row-wise apply over millions of rows.
results['correctPred'] = (results['label'] == results['predictedLabel']).astype(int)

results = results[['claims', 'label', 'predictedLabel', 'correctPred']]
results.to_csv('../data/testResults.csv')

In [28]:
# Share of correct predictions among the rows assigned to each predicted
# class (i.e. per-class precision), best classes first.
(
    results
    .groupby('predictedLabel')['correctPred']
    .mean()
    .sort_values(ascending=False)
)

predictedLabel
pension - aid and attendence/housebound                                        0.985628
pension                                                                        0.985081
hearing loss                                                                   0.985028
service connected death                                                        0.982091
unemployability                                                                0.971888
musculoskeletal - shoulder                                                     0.962083
musculoskeletal - ankle                                                        0.961514
musculoskeletal - knee                                                         0.960883
respiratory                                                                    0.960039
eye (vision)                                                                   0.959526
administrative issue                                                           0.947940
musculoskeletal -

In [29]:
# Share of correct predictions among the rows of each true class
# (i.e. per-class recall); show the 32 best classes.
(
    results
    .groupby('label')['correctPred']
    .mean()
    .sort_values(ascending=False)
    .head(32)
)

label
hearing loss                                                       0.993904
pension                                                            0.986565
service connected death                                            0.982001
musculoskeletal - knee                                             0.975812
musculoskeletal - ankle                                            0.973834
musculoskeletal - shoulder                                         0.973624
pension - aid and attendence/housebound                            0.972173
respiratory                                                        0.967944
eye (vision)                                                       0.965917
musculoskeletal - hip                                              0.963797
post traumatic stress disorder (ptsd) combat - mental disorders    0.961109
unemployability                                                    0.959839
heart/veins/arteries                                               0.957876
muscul

In [30]:
# Persist the fitted vectorizer as vectorizer.pkl
# NOTE(review): joblib is imported from sklearn.externals in this notebook;
# newer scikit-learn releases no longer ship that module, so switch to the
# standalone joblib package when upgrading.
joblib.dump(vectorizer, '../modelsAndTransformations/vectorizer.pkl')

# Persist the trained classifier as LRclf.pkl
joblib.dump(clf, '../modelsAndTransformations/LRclf.pkl')

['../modelsAndTransformations/LRclf.pkl']