# Classification Techniques

### Imports

In [3]:
import pandas as pd
import numpy as np
import json
from matplotlib import pyplot as plt
import seaborn as sns
import string
from sklearn.metrics import f1_score as f1, confusion_matrix as confusion, plot_roc_curve as roc
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
import xgboost as xgb
from xgboost import plot_importance
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.stem.porter import PorterStemmer

### Read in Comments and True Classifications

In [4]:
with open("unique_comments2018.json") as f:
    texts = json.load(f)

In [5]:
texts = {key:value.replace("\r", " ").replace("\n", " ").replace("\s", " ") for key, value in texts.items()}

In [6]:
for key, value in texts.items():
    texts[key] = ''.join(c for c in value if c in string.printable)

In [7]:
metadata_url = "https://mikeanders.org/data/CMS/CMS-2018-0101-0001/CMS-1701-P%20Comment%20MetaData.csv"
data = pd.read_csv(metadata_url, usecols=range(0,36))[:468] #ignore last few columns and blank rows at end of csv 
data = data.rename(columns=lambda x: x.strip()) #strip whitespace from columns

In [8]:
data10 = data.fillna(0) #fill NaN with 0
section_cols = data10.columns[3:] 
data10[section_cols] = data10[section_cols].replace(["Y"], 1) #replace Y with 1 in approriate columns
data11 = data10.copy()
section_cols1  = data11.columns[3:] 
data11[section_cols1] = np.where((data11[section_cols1]  != 1),0,data11[section_cols1] )

# Combining columns for index matching: (A6b, A6b.1, = A6b),  (C3b, C3b.1'= C3b) ('A7', 'A7.1', 'A7.2', = A7b, a7c),  (F = F2, F3)
data11['A6b'] = (data11['A6b'] + data11['A6b.1'])
data11['A6b'] = data11['A6b'].replace(2,1)
data11['C3b'] = (data11['C3b'] + data11['C3b.1'])
data11['C3b'] = data11['C3b'].replace(2,1)
data11['A7'] = (data11['A7'] + data11['A7.1'] + data11['A7.2'])
data11['A7'] = data11['A7'].replace(2,1)
data11['A7'] = data11['A7'].replace(3,1)
data11 = data11.drop(['A6b.1', 'C3b.1', 'A7.1', 'A7.2'], axis = 1)

data11.Name = [name.split('DRAFT-')[1].split('-')[0] for name in data11.Name]
data11 = data11.rename(columns=lambda x: x.lower())
section_cols1 = data11.columns[3:]

In [9]:
data11 = data11.loc[data11['name'].isin(texts.keys())]
data11["comment"] = texts.values() 

### Train/Test Split

In [18]:
train, test = train_test_split(data11, test_size=0.2, random_state=44)

In [19]:
train.sum(axis=0)[section_cols1]

a2      27
a3      38
a4b      3
a4c     24
a5b     27
a5c     14
a5d     13
a6b      9
a6c      8
a6d2     2
a6d3     6
a7      19
b2a     22
b2b     21
c2      23
c3a     16
c3b     13
d2      34
d3b     23
d3c     16
d3d      3
d4       1
e2      14
e3       6
e4       5
e5      20
e6      16
e7       7
f        2
dtype: object

In [20]:
train_texts = list(train.comment)
test_texts = list(test.comment)

## Classify One Rule Section)

In [21]:
#stopwords = ['!', '"', "#", "$", "%", "&", "(", ")", "*", "+", ",", "-", ".", "/", ":", ";", "<", "=", ">", "?", "@", "[", "]", "^", "_", "`", "{", "|", "}", "~", "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

### Count Vectorization

In [22]:
bow_vector = CountVectorizer(tokenizer=nltk.RegexpTokenizer(r"\w+").tokenize, ngram_range=(1,1), stop_words='english')

x_train = bow_vector.fit_transform(train_texts)
y_train = np.array(train.a2)

x_test = bow_vector.transform(test_texts)
y_test = np.array(test.a2)

### SVM

In [23]:
svm = SGDClassifier(random_state=44)
svm.fit(X=x_train, y=y_train)
svm_preds = svm.predict(x_test)
svm_preds

array([0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0],
      dtype=int64)

In [24]:
svm_f1 = f1(y_test, svm_preds)
svm_f1

0.3636363636363636

In [25]:
svm_confusion = confusion(y_test, svm_preds)
svm_confusion

array([[10,  4],
       [ 3,  2]], dtype=int64)

#### Identifying Most Significant Words for Classification

In [26]:
coefs = svm.coef_[0]
top_fifteen = np.argpartition(coefs, -15)[-15:]
[(bow_vector.get_feature_names()[feature]) for feature in top_fifteen]

['telehealth',
 'phd',
 'success',
 'trend',
 'county',
 'agreement',
 'spending',
 'period',
 'track',
 'uhg',
 'texas',
 'adjustment',
 'acos',
 'risk',
 'regional']

### XGBoost

In [27]:
boost = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
boost.fit(x_train, y_train)
boost_preds = boost.predict(x_test)

In [28]:
boost_f1 = f1(y_test, svm_preds)
boost_f1

0.3636363636363636

In [29]:
boost_confusion = confusion(y_test, boost_preds)
boost_confusion

array([[10,  4],
       [ 2,  3]], dtype=int64)

#### Identifying Most Significant Words for Classification

In [30]:
boost_top15 = np.argsort(-boost.feature_importances_)[0:15]
[(bow_vector.get_feature_names()[feature]) for feature in boost_top15] 

['significant',
 'administrative',
 'choosing',
 'flexibility',
 '8',
 'llc',
 'access',
 'nursing',
 'challenges',
 '1',
 'e',
 'good',
 'primary',
 'aco',
 'attached']

## Classify All Rule Sections - TFIDF

In [38]:
def stem_tokenize(text):
    tokens = word_tokenize(text)
    stems = [PorterStemmer().stem(item) for item in tokens]
    return stems

tfidf_vector = TfidfVectorizer(tokenizer=stem_tokenize, ngram_range=(1,1), max_df=0.9)

x_train = tfidf_vector.fit_transform(train_texts)
y_train = np.array(train[section_cols1])

x_test = tfidf_vector.transform(test_texts)
y_test = np.array(test[section_cols1])

In [39]:
boost = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss')

multi_boost = MultiOutputClassifier(boost)

multi_boost.fit(x_train, y_train)

multi_boost_preds = multi_boost.predict(x_test)

In [40]:
multi_boost_f1 = f1(y_test, multi_boost_preds, zero_division=0, average=None)

In [41]:
scores = {sec:score for (sec, score) in zip(section_cols1, list(multi_boost_f1))}
scores

{'a2': 0.6666666666666665,
 'a3': 0.6666666666666666,
 'a4b': 0.0,
 'a4c': 0.5,
 'a5b': 0.888888888888889,
 'a5c': 0.8,
 'a5d': 0.8,
 'a6b': 0.8,
 'a6c': 0.0,
 'a6d2': 0.0,
 'a6d3': 0.0,
 'a7': 0.5714285714285715,
 'b2a': 0.6,
 'b2b': 1.0,
 'c2': 0.5,
 'c3a': 0.6666666666666666,
 'c3b': 0.6666666666666666,
 'd2': 0.8000000000000002,
 'd3b': 0.8571428571428571,
 'd3c': 0.0,
 'd3d': 0.0,
 'd4': 0.0,
 'e2': 0.0,
 'e3': 0.0,
 'e4': 0.0,
 'e5': 0.6,
 'e6': 0.8571428571428571,
 'e7': 0.0,
 'f': 0.0}

In [42]:
np.mean(multi_boost_f1)

0.42211275314723595

#### Identifying Most Significant Words for Classification

In [36]:
features = []
for boost in multi_boost.estimators_:
    boost_top10 = np.argsort(-boost.feature_importances_)[0:10]
    features.append([(tfidf_vector.get_feature_names()[feature]) for feature in boost_top10])

In [37]:
important_features = {sec:feature_list for (sec, feature_list) in zip(section_cols1, features)}
boost_features = {key:(value,features) for (key, value), (key1, features) in zip(scores.items(), important_features.items())}

%store boost_features
boost_features

Stored 'boost_features' (dict)


{'a2': (0.6666666666666665,
  ['these',
   'choos',
   '20201',
   'goal',
   'access',
   'next',
   '4',
   'thi',
   'while',
   '$']),
 'a3': (0.625,
  ['what',
   'score',
   'percent',
   'revenu',
   'ensur',
   'would',
   'oper',
   'depart',
   'box',
   'an']),
 'a4b': (0.0,
  ['cms1701p',
   'decemb',
   'msr/mlr',
   'templat',
   'materi',
   '60',
   'cours',
   'treat',
   'must',
   'invest']),
 'a4c': (0.6,
  ['waiver',
   'promot',
   'expand',
   'less',
   'fewer',
   'permit',
   'behalf',
   'attach',
   'prospect',
   'concern']),
 'a5b': (0.888888888888889,
  ['revenu',
   '#',
   'reason',
   'both',
   'could',
   'encourag',
   'should',
   'and',
   'than',
   'attach']),
 'a5c': (0.8,
  ['outsid',
   'corridor',
   'gener',
   'determin',
   'experi',
   'requir',
   'pleas',
   'renew',
   'and',
   '12']),
 'a5d': (0.6666666666666666,
  ['outsid', 'agre', "''", '10', ':', 'do', 'work', 's', ')', 'control']),
 'a6b': (0.5,
  ['msr/mlr',
   'msr',
   'term