# Classification Techniques

### Imports

In [30]:
import pandas as pd
import numpy as np
import json
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score as f1, confusion_matrix as confusion, plot_roc_curve as roc
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
import nltk
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
import xgboost as xgb
from xgboost import plot_importance
from nltk.tokenize.treebank import TreebankWordTokenizer

### Read in Comments and True Classifications

In [2]:
with open("unique_comments2018.json") as f:
    texts = json.load(f)

In [115]:
metadata_url = "https://mikeanders.org/data/CMS/CMS-2018-0101-0001/CMS-1701-P%20Comment%20MetaData.csv"
data = pd.read_csv(metadata_url, usecols=range(0,36))[:469] #ignore last few columns and blank rows at end of csv 
data = data.rename(columns=lambda x: x.strip()) #strip whitespace from columns

In [116]:
data10 = data.fillna(0) #fill NaN with 0
section_cols = data10.columns[3:] 
data10[section_cols] = data10[section_cols].replace(["Y"], 1) #replace Y with 1 in approriate columns
data11 = data10.copy()
section_cols1  = data11.columns[3:] 
data11[section_cols1] = np.where((data11[section_cols1]  != 1),0,data11[section_cols1] )

# Combining columns for index matching: (A6b, A6b.1, = A6b),  (C3b, C3b.1'= C3b) ('A7', 'A7.1', 'A7.2', = A7b, a7c),  (F = F2, F3)
data11['A6b'] = (data11['A6b'] + data11['A6b.1'])
data11['A6b'] = data11['A6b'].replace(2,1)
data11['C3b'] = (data11['C3b'] + data11['C3b.1'])
data11['C3b'] = data11['C3b'].replace(2,1)
data11['A7'] = (data11['A7'] + data11['A7.1'] + data11['A7.2'])
data11['A7'] = data11['A7'].replace(2,1)
data11['A7'] = data11['A7'].replace(3,1)
data11 = data11.drop(['A6b.1', 'C3b.1', 'A7.1', 'A7.2'], axis = 1)
data11 = data11[0:468]

data11.Name = [name.split('DRAFT-')[1].split('-')[0] for name in data11.Name]
data11 = data11.rename(columns=lambda x: x.lower())
section_cols1 = data11.columns[3:]

In [117]:
data11 = data11.loc[data11['name'].isin(texts.keys())]
data11["comment"] = texts.values() 

### Train/Test Split

In [118]:
train = data11.sample(frac=.75, random_state=44)
test = data11.drop(train.index)

In [119]:
train_texts = list(train.comment)
test_texts = list(test.comment)

In [120]:
train

Unnamed: 0,name,organization name / submitter name,submitter state,a2,a3,a4b,a4c,a5b,a5c,a5d,...,d3d,d4,e2,e3,e4,e5,e6,e7,f,comment
269,0271,Tri-County Health Care,MN,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,see attached file(s) tri-coun health care clin...
7,0009,Sherman Jew,WI,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,making and enforcing more complex and expensiv...
417,0419,PhRMA,DC,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,see attached file(s) particularly as they rela...
174,0176,John Ulmer,VA,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,see attached file(s)the honorable seema verma ...
124,0126,Karen Hendren,OK,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,value based care and population health strateg...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,0258,Memorial Hermann,TX,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"please see attached file. mom october 15, 2018..."
334,0336,Steward Health Care System LLC,MA,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,please see attached letter. steward health car...
8,0010,Tyler Downing,CO,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,- our rural acos are high-quality performers a...
265,0267,"OneHealth Nebraska ACO, LLC",NE,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,see attached file(s) cms should modify the med...


In [121]:
data11.sum(axis=0)[section_cols1]

a2      30
a3      43
a4b      4
a4c     27
a5b     29
a5c     17
a5d     15
a6b     12
a6c     10
a6d2     2
a6d3     8
a7      21
b2a     23
b2b     20
c2      25
c3a     19
c3b     16
d2      35
d3b     24
d3c     15
d3d      3
d4       1
e2      16
e3      11
e4       6
e5      23
e6      18
e7       9
f        2
dtype: object

## Classify One Rule Section)

In [44]:
#stopwords = ['!', '"', "#", "$", "%", "&", "(", ")", "*", "+", ",", "-", ".", "/", ":", ";", "<", "=", ">", "?", "@", "[", "]", "^", "_", "`", "{", "|", "}", "~", "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

### BOW Vectorization

In [122]:
bow_vector = CountVectorizer(tokenizer=nltk.RegexpTokenizer(r"\w+").tokenize, ngram_range=(1,2), stop_words="english")

x_train = bow_vector.fit_transform(train_texts)
y_train = np.array(train.a2)

x_test = bow_vector.transform(test_texts)
y_test = np.array(test.a2)

### SVM

In [123]:
svm = SGDClassifier(random_state=44)
svm.fit(X=x_train, y=y_train)
svm_preds = svm.predict(x_test)
svm_preds

array([1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1],
      dtype=int64)

In [124]:
svm_f1 = f1(y_test, svm_preds)
svm_f1

0.7272727272727273

In [125]:
svm_confusion = confusion(y_test, svm_preds)
svm_confusion

array([[7, 3],
       [3, 8]], dtype=int64)

#### Identifying Most Significant Words for Classification

In [126]:
coefs = np.abs(svm.coef_[0])
top_fifteen = np.argpartition(coefs, -15)[-15:]
[(bow_vector.get_feature_names()[feature]) for feature in top_fifteen] 

['shared',
 'agreement',
 'mssp',
 'adherence',
 'spending',
 'medication',
 'rural',
 'care',
 'organizations',
 'ama',
 'pharmacy',
 'program',
 'ï',
 'shared savings',
 'savings']

### XGBoost

In [127]:
boost = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
boost.fit(x_train, y_train)
boost_preds = boost.predict(x_test)

In [128]:
boost_f1 = f1(y_test, svm_preds)
boost_f1

0.7272727272727273

In [129]:
boost_confusion = confusion(y_test, boost_preds)
boost_confusion

array([[6, 4],
       [2, 9]], dtype=int64)

#### Identifying Most Significant Words for Classification

In [130]:
boost_top15 = np.argsort(-boost.feature_importances_)[0:15]
[(bow_vector.get_feature_names()[feature]) for feature in boost_top15] 

['benchmarking',
 'https www',
 'choose',
 'voluntary',
 'flexibility',
 'primary',
 'g',
 'health human',
 'attached',
 'aco',
 'p',
 'clinical',
 'performance',
 'thank',
 'care']

## Classify All Rule Sections

In [131]:
bow_vector = CountVectorizer(tokenizer=nltk.RegexpTokenizer(r"\w+").tokenize, ngram_range=(1,2), stop_words="english")

x_train = bow_vector.fit_transform(train_texts)
y_train = np.array(train[section_cols1])

x_test = bow_vector.transform(test_texts)
y_test = np.array(test[section_cols1])

In [132]:
boost = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss')

multi_boost = MultiOutputClassifier(boost)

multi_boost.fit(x_train, y_train)

multi_boost_preds = multi_boost.predict(x_test)

multi_boost_preds

array([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [133]:
multi_boost_f1 = f1(y_test, multi_boost_preds, zero_division=0, average=None)

In [134]:
{sec:score for (sec, score) in zip(section_cols1, list(multi_boost_f1))}

{'a2': 0.7500000000000001,
 'a3': 0.8461538461538461,
 'a4b': 0.0,
 'a4c': 0.7777777777777777,
 'a5b': 0.631578947368421,
 'a5c': 0.2222222222222222,
 'a5d': 0.6153846153846153,
 'a6b': 0.25,
 'a6c': 0.0,
 'a6d2': 0.0,
 'a6d3': 0.0,
 'a7': 0.19999999999999998,
 'b2a': 0.7142857142857143,
 'b2b': 0.8571428571428571,
 'c2': 0.6666666666666666,
 'c3a': 0.18181818181818182,
 'c3b': 0.6666666666666666,
 'd2': 0.88,
 'd3b': 0.7999999999999999,
 'd3c': 0.6666666666666666,
 'd3d': 0.0,
 'd4': 0.0,
 'e2': 0.3333333333333333,
 'e3': 0.0,
 'e4': 0.0,
 'e5': 0.823529411764706,
 'e6': 0.7272727272727272,
 'e7': 0.5,
 'f': 0.0}

In [135]:
np.mean(multi_boost_f1)

0.41760343567325525

#### Identifying Most Significant Words for Classification

In [136]:
features = []
for boost in multi_boost.estimators_:
    boost_top10 = np.argsort(-boost.feature_importances_)[0:10]
    features.append([(bow_vector.get_feature_names()[feature]) for feature in boost_top10])

  return all_features / all_features.sum()


In [137]:
{sec:feature_list for (sec, feature_list) in zip(section_cols1, features)}

{'a2': ['benchmarking',
  'https www',
  'choose',
  'voluntary',
  'flexibility',
  'primary',
  'g',
  'health human',
  'attached',
  'aco'],
 'a3': ['methodology',
  'hospitals',
  'losses',
  'new',
  'time',
  'revenue',
  'ms',
  'low revenue',
  '2018',
  'attached file'],
 'a4b': ['lower',
  '25',
  '16',
  'electronic',
  'org',
  'assigned',
  'taking',
  'clinical',
  '05',
  '000'],
 'a4c': ['retrospective',
  'benchmarking',
  'quality',
  'assignment',
  'application',
  '8013',
  'participants',
  '1701',
  'lower',
  'believe'],
 'a5b': ['low revenue',
  'fewer',
  '425',
  'entities',
  'good',
  'accountable care',
  'shared',
  'currently',
  'revenue',
  'care'],
 'a5c': ['certified',
  'determining',
  'believe acos',
  'www',
  'experienced',
  'affairs',
  'compared',
  'aco',
  '2018 ms',
  'success'],
 'a5d': ['outside',
  'proposal',
  'negative',
  'calculation',
  'health record',
  'infrastructure',
  'spending',
  'control',
  'complete',
  'benchmarks'],