# Classification Techniques

### Imports

In [30]:
import pandas as pd
import numpy as np
import json
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score as f1, confusion_matrix as confusion, plot_roc_curve as roc
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
import nltk
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
import xgboost as xgb
from xgboost import plot_importance
from nltk.tokenize.treebank import TreebankWordTokenizer

### Read in Comments and True Classifications

In [2]:
with open("unique_comments2018.json") as f:
    texts = json.load(f)

In [3]:
metadata_url = "https://mikeanders.org/data/CMS/CMS-2018-0101-0001/CMS-1701-P%20Comment%20MetaData.csv"
data = pd.read_csv(metadata_url, usecols=range(0,36))[:469] #ignore last few columns and blank rows at end of csv 
data = data.rename(columns=lambda x: x.strip()) #strip whitespace from columns

In [14]:
data10 = data.fillna(0) #fill NaN with 0
section_cols = data10.columns[3:] 
data10[section_cols] = data10[section_cols].replace(["Y"], 1) #replace Y with 1 in approriate columns
data11 = data10.copy()
section_cols1  = data11.columns[3:] 
data11[section_cols1] = np.where((data11[section_cols1]  != 1),0,data11[section_cols1] )

# Combining columns for index matching: (A6b, A6b.1, = A6b),  (C3b, C3b.1'= C3b) ('A7', 'A7.1', 'A7.2', = A7b, a7c),  (F = F2, F3)
data11['A6b'] = (data11['A6b'] + data11['A6b.1'])
data11['A6b'] = data11['A6b'].replace(2,1)
data11['C3b'] = (data11['C3b'] + data11['C3b.1'])
data11['C3b'] = data11['C3b'].replace(2,1)
data11['A7'] = (data11['A7'] + data11['A7.1'] + data11['A7.2'])
data11['A7'] = data11['A7'].replace(2,1)
data11['A7'] = data11['A7'].replace(3,1)
data11 = data11.drop(['A6b.1', 'C3b.1', 'A7.1', 'A7.2'], axis = 1)
data11 = data11[0:468]
section_cols1  = data11.columns[3:]

data11.Name = [name.split('DRAFT-')[1].split('-')[0] for name in data11.Name]

In [17]:
data11 = data11.loc[data11['Name'].isin(texts.keys())]
data11["Comment"] = texts.values() 

### Train/Test Split

In [19]:
train = data11.sample(frac=.75, random_state=44)
test = data11.drop(train.index)

In [21]:
train_texts = list(train.Comment)
test_texts = list(test.Comment)

In [22]:
train

Unnamed: 0,Name,Organization Name / Submitter name,Submitter State,A2,A3,A4b,A4c,A5b,A5c,A5d,...,D3d,D4,E2,E3,E4,E5,E6,E7,F,Comment
269,0271,Tri-County Health Care,MN,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,see attached file(s) tri-coun health care clin...
7,0009,Sherman Jew,WI,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,making and enforcing more complex and expensiv...
417,0419,PhRMA,DC,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,see attached file(s) particularly as they rela...
174,0176,John Ulmer,VA,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,see attached file(s)the honorable seema verma ...
124,0126,Karen Hendren,OK,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,value based care and population health strateg...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,0258,Memorial Hermann,TX,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"please see attached file. mom october 15, 2018..."
334,0336,Steward Health Care System LLC,MA,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,please see attached letter. steward health car...
8,0010,Tyler Downing,CO,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,- our rural acos are high-quality performers a...
265,0267,"OneHealth Nebraska ACO, LLC",NE,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,see attached file(s) cms should modify the med...


In [64]:
data11.sum(axis=0)

Name                                  0002000300040005000600070008000900100011001200...
Organization Name / Submitter name    Erick MeleherMayank ShahMayank ShahMorey Menac...
A2                                                                                   30
A3                                                                                   43
A4b                                                                                   4
A4c                                                                                  27
A5b                                                                                  29
A5c                                                                                  17
A5d                                                                                  15
A6b                                                                                  12
A6c                                                                                  10
A6d2                            

### Classify (One Rule Section)

In [44]:
#stopwords = ['!', '"', "#", "$", "%", "&", "(", ")", "*", "+", ",", "-", ".", "/", ":", ";", "<", "=", ">", "?", "@", "[", "]", "^", "_", "`", "{", "|", "}", "~", "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

### BOW Vectorization

In [50]:
bow_vector = CountVectorizer(tokenizer=nltk.RegexpTokenizer(r"\w+").tokenize, ngram_range=(1,2), stop_words="english")

x_train = bow_vector.fit_transform(train_texts)
y_train = np.array(train.D2)

x_test = bow_vector.transform(test_texts)
y_test = np.array(test.D2)

### SVM

In [51]:
svm = SGDClassifier(random_state=44)
svm.fit(X=x_train, y=y_train)
svm_preds = svm.predict(x_test)
svm_preds

array([1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1],
      dtype=int64)

In [52]:
svm_f1 = f1(y_test, svm_preds)
svm_f1

0.8275862068965518

In [48]:
svm_confusion = confusion(y_test, svm_preds)
svm_confusion

array([[ 4,  3],
       [ 2, 12]], dtype=int64)

#### Identifying Most Significant Words for Classification

In [35]:
coefs = np.abs(svm.coef_[0])
top_fifteen = np.argpartition(coefs, -15)[-15:]
[(bow_vector.get_feature_names()[feature]) for feature in top_fifteen] 

['e',
 'comments',
 'agreement',
 'services',
 'ama',
 'adherence',
 '2',
 'pharmacy',
 'ï',
 'savings',
 'mssp',
 'proposed',
 'cms',
 'spending',
 'medication']

### XGBoost

In [58]:
boost = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
boost.fit(x_train, y_train)
boost_preds = boost.predict(x_test)

In [59]:
boost_f1 = f1(y_test, svm_preds)
boost_f1

0.8275862068965518

In [60]:
boost_confusion = confusion(y_test, boost_preds)
boost_confusion

array([[ 7,  0],
       [ 3, 11]], dtype=int64)

#### Identifying Most Significant Words for Classification

In [61]:
boost_top15 = np.argsort(-boost.feature_importances_)[0:15]
[(bow_vector.get_feature_names()[feature]) for feature in boost_top15] 

['agreement periods',
 'encourage',
 'd',
 'levels',
 'www',
 'benefit',
 'letter',
 'adjustment',
 'performance years',
 'cms 1701',
 'revenue',
 'low',
 '1701',
 'department',
 '1701 p']