# Classification Techniques

### Imports

In [40]:
import pandas as pd
import numpy as np
import json
from matplotlib import pyplot as plt
import seaborn as sns
import string
from sklearn.metrics import f1_score as f1, confusion_matrix as confusion, plot_roc_curve as roc
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
import nltk
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
import xgboost as xgb
from xgboost import plot_importance
from nltk.tokenize.treebank import TreebankWordTokenizer

### Read in Comments and True Classifications

In [41]:
with open("unique_comments2018.json") as f:
    texts = json.load(f)

In [42]:
texts = {key:value.replace("\r", " ").replace("\n", " ").replace("\s", " ") for key, value in texts.items()}

In [43]:
for key, value in texts.items():
    texts[key] = ''.join(c for c in value if c in string.printable)

In [44]:
metadata_url = "https://mikeanders.org/data/CMS/CMS-2018-0101-0001/CMS-1701-P%20Comment%20MetaData.csv"
data = pd.read_csv(metadata_url, usecols=range(0,36))[:468] #ignore last few columns and blank rows at end of csv 
data = data.rename(columns=lambda x: x.strip()) #strip whitespace from columns

In [45]:
data10 = data.fillna(0) #fill NaN with 0
section_cols = data10.columns[3:] 
data10[section_cols] = data10[section_cols].replace(["Y"], 1) #replace Y with 1 in approriate columns
data11 = data10.copy()
section_cols1  = data11.columns[3:] 
data11[section_cols1] = np.where((data11[section_cols1]  != 1),0,data11[section_cols1] )

# Combining columns for index matching: (A6b, A6b.1, = A6b),  (C3b, C3b.1'= C3b) ('A7', 'A7.1', 'A7.2', = A7b, a7c),  (F = F2, F3)
data11['A6b'] = (data11['A6b'] + data11['A6b.1'])
data11['A6b'] = data11['A6b'].replace(2,1)
data11['C3b'] = (data11['C3b'] + data11['C3b.1'])
data11['C3b'] = data11['C3b'].replace(2,1)
data11['A7'] = (data11['A7'] + data11['A7.1'] + data11['A7.2'])
data11['A7'] = data11['A7'].replace(2,1)
data11['A7'] = data11['A7'].replace(3,1)
data11 = data11.drop(['A6b.1', 'C3b.1', 'A7.1', 'A7.2'], axis = 1)

data11.Name = [name.split('DRAFT-')[1].split('-')[0] for name in data11.Name]
data11 = data11.rename(columns=lambda x: x.lower())
section_cols1 = data11.columns[3:]

In [46]:
data11 = data11.loc[data11['name'].isin(texts.keys())]
data11["comment"] = texts.values() 

### Train/Test Split

In [47]:
train = data11.sample(frac=.75, random_state=44)
test = data11.drop(train.index)

In [48]:
train_texts = list(train.comment)
test_texts = list(test.comment)

In [49]:
train

Unnamed: 0,name,organization name / submitter name,submitter state,a2,a3,a4b,a4c,a5b,a5c,a5d,...,d3d,d4,e2,e3,e4,e5,e6,e7,f,comment
465,0467,Washington State Hospital Association,WA,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"see attached file(s)october 15, 2018 ms. seema..."
406,0408,Dana McCalley,FL,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,diabetic eye exam measure should be retired. t...
205,0207,Mayo Clinic,0,0,1,0,1,0,0,0,...,0,0,1,1,0,0,0,0,0,see attached file(s) mayo clinic 200 first str...
265,0267,"OneHealth Nebraska ACO, LLC",NE,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,see attached file(s) cms should modify the med...
7,0009,Sherman Jew,WI,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,making and enforcing more complex and expensiv...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372,0374,The Queen's Health System,HI,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,please see attached for comments. ms. seema ve...
193,0195,Michael Saito,WI,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,please see the attached document with epic's c...
345,0347,American Association of Nurse Practitioners,VA,1,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,please find the attached comments of the ameri...
247,0249,High Value Healcare Collaborative,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,see attached file(s) high value healthcare col...


In [12]:
data11.sum(axis=0)[section_cols1]

a2      32
a3      46
a4b      4
a4c     29
a5b     32
a5c     17
a5d     16
a6b     12
a6c     10
a6d2     2
a6d3     7
a7      24
b2a     27
b2b     24
c2      27
c3a     20
c3b     17
d2      39
d3b     26
d3c     16
d3d      3
d4       1
e2      16
e3      10
e4       5
e5      25
e6      20
e7       9
f        2
dtype: object

## Classify One Rule Section)

In [51]:
#stopwords = ['!', '"', "#", "$", "%", "&", "(", ")", "*", "+", ",", "-", ".", "/", ":", ";", "<", "=", ">", "?", "@", "[", "]", "^", "_", "`", "{", "|", "}", "~", "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

### Count Vectorization

In [73]:
bow_vector = CountVectorizer(tokenizer=nltk.RegexpTokenizer(r"\w+").tokenize, ngram_range=(1,1), stop_words='english')

x_train = bow_vector.fit_transform(train_texts)
y_train = np.array(train.a2)

x_test = bow_vector.transform(test_texts)
y_test = np.array(test.a2)

### SVM

In [74]:
svm = SGDClassifier(random_state=44)
svm.fit(X=x_train, y=y_train)
svm_preds = svm.predict(x_test)
svm_preds

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 1], dtype=int64)

In [75]:
svm_f1 = f1(y_test, svm_preds)
svm_f1

0.6

In [76]:
svm_confusion = confusion(y_test, svm_preds)
svm_confusion

array([[10,  2],
       [ 6,  6]], dtype=int64)

#### Identifying Most Significant Words for Classification

In [77]:
coefs = svm.coef_[0]
top_fifteen = np.argpartition(coefs, -15)[-15:]
[(bow_vector.get_feature_names()[feature]) for feature in top_fifteen]

['mil',
 'county',
 'telehealth',
 'health',
 'mssp',
 'advocate',
 'shared',
 'acos',
 'e',
 'participants',
 'ama',
 'aurora',
 'spending',
 'texas',
 'savings']

### XGBoost

In [78]:
boost = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
boost.fit(x_train, y_train)
boost_preds = boost.predict(x_test)

In [79]:
boost_f1 = f1(y_test, svm_preds)
boost_f1

0.6

In [80]:
boost_confusion = confusion(y_test, boost_preds)
boost_confusion

array([[9, 3],
       [8, 4]], dtype=int64)

#### Identifying Most Significant Words for Classification

In [81]:
boost_top15 = np.argsort(-boost.feature_importances_)[0:15]
[(bow_vector.get_feature_names()[feature]) for feature in boost_top15] 

['waivers',
 'successful',
 'savings',
 'cause',
 'capital',
 'assignment',
 'behalf',
 'chief',
 'choose',
 'attributed',
 'verma',
 'e',
 'medicare',
 'p',
 '1']

## Classify All Rule Sections - TFIDF

In [82]:
tfidf_vector = TfidfVectorizer(tokenizer=nltk.RegexpTokenizer(r"\w+").tokenize, ngram_range=(1,1), stop_words="english")

x_train = tfidf_vector.fit_transform(train_texts)
y_train = np.array(train[section_cols1])

x_test = tfidf_vector.transform(test_texts)
y_test = np.array(test[section_cols1])

In [None]:
boost = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss')

multi_boost = MultiOutputClassifier(boost)

multi_boost.fit(x_train, y_train)

multi_boost_preds = multi_boost.predict(x_test)

In [84]:
multi_boost_f1 = f1(y_test, multi_boost_preds, zero_division=0, average=None)

In [85]:
scores = {sec:score for (sec, score) in zip(section_cols1, list(multi_boost_f1))}
scores

{'a2': 0.6666666666666666,
 'a3': 0.6666666666666666,
 'a4b': 0.0,
 'a4c': 0.75,
 'a5b': 0.6666666666666665,
 'a5c': 0.25,
 'a5d': 0.4444444444444445,
 'a6b': 0.6666666666666666,
 'a6c': 0.0,
 'a6d2': 0.0,
 'a6d3': 0.0,
 'a7': 0.6153846153846154,
 'b2a': 0.6666666666666666,
 'b2b': 0.7692307692307692,
 'c2': 0.5714285714285715,
 'c3a': 0.2,
 'c3b': 0.6666666666666665,
 'd2': 0.6923076923076923,
 'd3b': 0.6666666666666666,
 'd3c': 0.0,
 'd3d': 0.0,
 'd4': 0.0,
 'e2': 0.4444444444444444,
 'e3': 1.0,
 'e4': 0.0,
 'e5': 0.6666666666666666,
 'e6': 0.7692307692307693,
 'e7': 0.0,
 'f': 0.0}

In [86]:
np.mean(multi_boost_f1)

0.4082691255105048

#### Identifying Most Significant Words for Classification

In [87]:
features = []
for boost in multi_boost.estimators_:
    boost_top10 = np.argsort(-boost.feature_importances_)[0:10]
    features.append([(bow_vector.get_feature_names()[feature]) for feature in boost_top10])

  return all_features / all_features.sum()


In [96]:
important_features = {sec:feature_list for (sec, feature_list) in zip(section_cols1, features)}
boost_features = {key:(value,features) for (key, value), (key1, features) in zip(scores.items(), important_features.items())}

%store boost_features
boost_features

Stored 'boost_features' (dict)


{'a2': (0.6666666666666666,
  ['waivers',
   'participants',
   'choose',
   'cause',
   'mechanism',
   'participating',
   'file',
   'success',
   'telehealth',
   'accountable']),
 'a3': (0.6666666666666666,
  ['receive',
   'hospitals',
   'levels',
   'revenue',
   'agreement',
   '1701',
   'shared',
   'recommend',
   'low',
   'seema']),
 'a4b': (0.0,
  ['beneficiary',
   'avoid',
   'cms1701p',
   'election',
   'patient',
   '26',
   'focused',
   'build',
   'date',
   'policy']),
 'a4c': (0.75,
  ['retrospective',
   'prospective',
   'choose',
   'waivers',
   'annually',
   '2',
   'aco',
   'choice',
   'benchmark',
   '1701']),
 'a5b': (0.6666666666666665,
  ['revenue',
   'day',
   'large',
   '21244',
   'low',
   'real',
   'behalf',
   'result',
   'file',
   'attached']),
 'a5c': (0.25,
  ['complex',
   'oppose',
   'percent',
   'dear',
   'determining',
   'entity',
   'regardless',
   'reduction',
   'participation',
   'rule']),
 'a5d': (0.4444444444444445,
  