In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

from sklearn.externals import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from scipy.spatial import distance as dist

import time
from IPython.display import clear_output

# Load Dataset

In [2]:
# Read the cleaned claims CSV, using its first column as the DataFrame index.
# NOTE(review): the path has a leading '.' but no '/', so it names a file called
# '.CleanDataCC.csv' in the working directory — confirm './CleanDataCC.csv' was not intended.
df = pd.read_csv('.CleanDataCC.csv', index_col=0)
# Preview the first rows.
df.head()

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


Unnamed: 0,CLMANT_TXT,CNTNTN_CLSFCN_ID,CNTNTN_CLSFCN_TXT,modeClass,newClass
0,diabetes mellitus type 2,2085,diabetes mellitus type 2,endocrine,endocrine
1,diabetes mellitus type 2,2085,diabetes mellitus type 2,endocrine,endocrine
2,special monthly pension,8917,miscellaneous,pension aid and attendance housebound,pension - aid and attendence/housebound
3,special monthly pension,8917,miscellaneous,pension aid and attendance housebound,pension - aid and attendence/housebound
4,adjustment disorder,2030,depression,mental disorders,mental disorders


In [3]:
# (rows, columns) of the frame as loaded.
df.shape

(6324799, 5)

In [4]:
# Keep only the rows whose claim text is shorter than 300 characters,
# then show the resulting (rows, columns).
claim_is_short = df['CLMANT_TXT'].str.len().lt(300)
df = df.loc[claim_is_short]
df.shape

(6324248, 5)

# Let's remove from df the rows whose label does not coincide with one of the official classes (newClass is null), but keep them in a separate dataframe (dfOut) for later.

In [5]:
# Build both masks once, before df is rebound, so the two selections use the same rows.
has_claim = df['CLMANT_TXT'].notna()
has_label = df['newClass'].notna()

# Rows with claim text but no class label are set aside in dfOut.
dfOut = df[has_claim & ~has_label]
# Rows with both claim text and a class label stay in df for modelling.
df = df[has_claim & has_label]

# Initialize a Count Vectorizer that ignores terms appearing in fewer than 10 documents (min_df=10), removes English stop words, and uses n-grams of up to 3 words.

In [6]:
# Bag-of-words counts over 1- to 3-word n-grams with English stop words removed;
# min_df=10 is the minimum document frequency a term needs to be kept.
vectorizer = CountVectorizer(min_df=10, ngram_range=(1,3), stop_words='english') 
# Alternative TF-IDF weighting with the same settings, kept for reference:
#vectorizer = TfidfVectorizer(min_df=10, ngram_range=(1,3), stop_words='english')

In [7]:
# Learn the vocabulary from the claim text and build the document-term count matrix.
# NOTE(review): this is fit on every row of df, before the train/test split in the
# later cells, so the vocabulary also reflects held-out claims — confirm this is acceptable.
X = vectorizer.fit_transform(df['CLMANT_TXT'])

In [8]:
# Target: the class label of each claim.
y = df['newClass']

# Split first to obtain Training set

In [9]:
# Hold out 60% of the rows (test_size=0.6); the remaining 40% is the training set.
# df.index is split alongside X and y so each subset can be traced back to its rows in df.
X_train, X_test, y_train, y_test, i_train, i_test = train_test_split(X, y, df.index, test_size=0.6, random_state=42)

# Split the held-out portion again to obtain the testing and validation sets

In [10]:
# Halve the held-out portion (test_size=0.5) into a test set and a validation set.
# X_test / y_test / i_test are rebound here to the smaller test half.
X_test, X_validate, y_test, y_validate, i_test, i_validate = train_test_split(X_test, y_test, i_test, 
                                                                              test_size=0.5, random_state=42)

# Initialize a Logistic Regression Classifier

In [11]:
# Logistic regression with the library's default hyperparameters.
# NOTE(review): the recorded fit output shows solver='warn' together with a warning that
# formats the effective n_jobs, which suggests n_jobs=-1 may have no effect with the
# default solver — confirm, or choose a solver explicitly.
clf = LogisticRegression(n_jobs=-1)
# Alternative model kept for reference:
#clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)

In [12]:
# Fit the classifier on the training split.
clf.fit(X_train, y_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l2', random_state=None, solver='warn', tol=0.0001,
          verbose=0, warm_start=False)

# Predict the classes of our validation set

In [13]:
# Predict a class label for every row of the validation set.
y_validate_pred = clf.predict(X_validate)
# Display the (abbreviated) array of predicted labels.
y_validate_pred

array(['musculoskeletal - foot',
       'neurological - cranial/peripheral nerves',
       'musculoskeletal - shoulder', ..., 'musculoskeletal - hip',
       'digestive',
       'infectious diseases, immune disorders & nutritional deficiencies'],
      dtype=object)

In [14]:
# First 100 predicted labels, for comparing by eye with the true labels.
y_validate_pred[:100]

array(['musculoskeletal - foot',
       'neurological - cranial/peripheral nerves',
       'musculoskeletal - shoulder', 'musculoskeletal - knee', 'skin',
       'special monthly compensation - aid and attendence/housebound',
       'musculoskeletal - mid/lower back (thoracolumbar spine)',
       'hearing loss', 'musculoskeletal - other',
       'musculoskeletal - arthritis', 'hearing loss', 'eye (vision)',
       'hearing loss', 'hearing loss', 'musculoskeletal - shoulder',
       'musculoskeletal - neck/upper back (cervical spine)',
       'hearing loss', 'hearing loss', 'musculoskeletal - ankle',
       'mental disorders', 'cyst/benign growth - musculoskeletal - other',
       'gulf war undiagnosed illness', 'skin', 'heart/veins/arteries',
       'musculoskeletal and muscle injuries', 'musculoskeletal - foot',
       'cancer - skin',
       'musculoskeletal - mid/lower back (thoracolumbar spine)',
       'post traumatic stress disorder (ptsd) combat - mental disorders',
       'ment

In [15]:
# True labels of the first 100 validation rows, matching the 100 predictions
# previewed with y_validate_pred[:100]. Listing all of y_validate would print
# one line per validation row and swamp the notebook output.
list(y_validate.iloc[:100])

['musculoskeletal - foot',
 'neurological - cranial/peripheral nerves',
 'musculoskeletal - shoulder',
 'musculoskeletal - knee',
 'infectious diseases, immune disorders & nutritional deficiencies',
 'special monthly compensation - aid and attendence/housebound',
 'leg condition - neurological other system',
 'hearing loss',
 'musculoskeletal - other',
 'musculoskeletal - arthritis',
 'hearing loss',
 'eye (vision)',
 'hearing loss',
 'hearing loss',
 'musculoskeletal - shoulder',
 'musculoskeletal - neck/upper back (cervical spine)',
 'hearing loss',
 'hearing loss',
 'musculoskeletal - ankle',
 'mental disorders',
 'cyst/benign growth - musculoskeletal - other',
 'gulf war undiagnosed illness',
 'skin',
 'heart/veins/arteries',
 'musculoskeletal - other',
 'musculoskeletal - foot',
 'musculoskeletal - other',
 'musculoskeletal - mid/lower back (thoracolumbar spine)',
 'post traumatic stress disorder (ptsd) combat - mental disorders',
 'mental disorders',
 'hearing loss',
 'post traum

# Let's check the model's accuracy

In [16]:
# Accuracy on the validation set, computed from the predictions already stored in
# y_validate_pred. clf.score(X_validate, y_validate) yields the same number but
# runs predict over the whole validation matrix a second time.
accuracy_score(y_validate, y_validate_pred)

0.9097379135865913

# How about precision, recall and F1 score

In [17]:
# Precision, recall and F1 averaged over classes with average='weighted'.
# The result is a 4-tuple; its last element (support) is None in the recorded output.
# NOTE(review): the recorded warning mentions 'precision' and 'predicted', which suggests
# at least one class was never predicted — confirm before relying on the average.
precision_recall_fscore_support(y_validate, y_validate_pred, average='weighted')

  'precision', 'predicted', average, warn_for)


(0.9030600140233641, 0.9097379135865913, 0.9040597044656684, None)

# How about a confusion Matrix

In [18]:
# Confusion matrix of true labels (first argument) against predicted labels (second argument).
confusion_matrix(y_validate, y_validate_pred)

array([[ 1895,     2,     2, ...,     0,     9,     9],
       [    8,    67,     0, ...,     0,     1,     0],
       [    0,     0,    21, ...,     0,     0,     0],
       ...,
       [    1,     0,     0, ...,    23,     0,     0],
       [    9,     0,     0, ...,     0, 27012,     0],
       [    2,     0,     0, ...,     0,     0,   338]])

In [19]:
# Pair every true validation label with its prediction. The frame takes its index
# from y_validate; the prediction array is placed positionally alongside it.
results = pd.DataFrame({
    'validate': y_validate,
    'validatePred': y_validate_pred,
})

In [20]:
# Attach the original claim text to each validation row.
# results is indexed by the validation rows, and column assignment aligns on index,
# so the text must be looked up with i_validate. Using i_test selects the test rows,
# whose labels do not line up with the rows in results.
results['claims'] = df.loc[i_validate, 'CLMANT_TXT']

In [21]:
# 1 where the predicted label equals the true label, 0 otherwise.
# The comparison is vectorised over the two columns rather than calling a Python
# lambda once per row; the resulting values are identical.
results['correctPred'] = results['validate'].eq(results['validatePred']).astype(int)

In [22]:
# For each predicted label, the fraction of those predictions that were correct
# (i.e. per-class precision), highest first.
results.groupby('validatePred')['correctPred'].mean().sort_values(ascending=False)

validatePred
cyst/benign growth - dental and oral                                       1.000000
adhesions - respiratory                                                    1.000000
pension - aid and attendence/housebound                                    0.986939
hearing loss                                                               0.986892
pension                                                                    0.986715
service connected death                                                    0.980113
unemployability                                                            0.973020
eye (vision)                                                               0.954882
respiratory                                                                0.954872
musculoskeletal - ankle                                                    0.950974
administrative issue                                                       0.950506
musculoskeletal - knee                                         

In [23]:
# For each true label, the fraction of its rows that were predicted correctly
# (i.e. per-class recall); show the 32 best-scoring classes.
(
    results
    .groupby('validate')['correctPred']
    .mean()
    .sort_values(ascending=False)
    .head(32)
)

validate
hearing loss                                                       0.994927
pension                                                            0.987322
service connected death                                            0.986515
musculoskeletal - knee                                             0.973684
pension - aid and attendence/housebound                            0.973477
musculoskeletal - shoulder                                         0.971953
musculoskeletal - ankle                                            0.971533
eye (vision)                                                       0.965760
respiratory                                                        0.965193
musculoskeletal - hip                                              0.963634
post traumatic stress disorder (ptsd) combat - mental disorders    0.961523
heart/veins/arteries                                               0.960105
mental disorders                                                   0.955517
une