In [132]:
import pandas as pd
import numpy as np
import json
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score as f1, confusion_matrix as confusion, plot_roc_curve as roc
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
import xgboost as xgb
from xgboost import plot_importance
from nltk.tokenize.treebank import TreebankWordTokenizer

In [74]:
with open("comments2018.json") as f:
    texts = json.load(f)

In [75]:
metadata_url = "https://mikeanders.org/data/CMS/CMS-2018-0101-0001/CMS-1701-P%20Comment%20MetaData.csv"
data = pd.read_csv(metadata_url, usecols=range(0,36))[:469] #ignore last few columns and blank rows at end of csv 
data = data.rename(columns=lambda x: x.strip()) #strip whitespace from columns

In [76]:
data10 = data.fillna(0) #fill NaN with 0
section_cols = data10.columns[3:] 
data10[section_cols] = data10[section_cols].replace(["Y"], 1) #replace Y with 1 in approriate columns
data11 = data10.copy()
section_cols1  = data11.columns[3:] 
data11[section_cols1] = np.where((data11[section_cols1]  != 1),0,data11[section_cols1] )

# Combining columns for index matching: (A6b, A6b.1, = A6b),  (C3b, C3b.1'= C3b) ('A7', 'A7.1', 'A7.2', = A7b, a7c),  (F = F2, F3)
data11['A6b'] = (data11['A6b'] + data11['A6b.1'])
data11['A6b'] = data11['A6b'].replace(2,1)
data11['C3b'] = (data11['C3b'] + data11['C3b.1'])
data11['C3b'] = data11['C3b'].replace(2,1)
data11['A7'] = (data11['A7'] + data11['A7.1'] + data11['A7.2'])
data11['A7'] = data11['A7'].replace(2,1)
data11['A7'] = data11['A7'].replace(3,1)
data11 = data11.drop(['A6b.1', 'C3b.1', 'A7.1', 'A7.2'], axis = 1)
data11 = data11[0:468]
section_cols1  = data11.columns[3:]

In [77]:
data11["text"] = texts.values()
data11.Name = [name.split('DRAFT-')[1].split('-')[0] for name in data11.Name]
data11 = data11.drop(["Organization Name / Submitter name", "Submitter State"], axis=1)
data11.text = [text.replace("\r", " ").replace("\n", " ").replace("\s", " ") for text in data11.text]

Train/Test Split

In [83]:
train = data11.sample(frac=.75, random_state=44)
test = data11.drop(train.index)

In [84]:
train_texts = list(train.text)
test_texts = list(test.text)

In [85]:
train

Unnamed: 0,Name,A2,A3,A4b,A4c,A5b,A5c,A5d,A6b,A6c,...,D3d,D4,E2,E3,E4,E5,E6,E7,F,text
71,0073,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,"as you know, under the current medicare shared..."
427,0429,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,"seema verma, administrator centers for medica..."
88,0090,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,"as you know, under the current medicare shared..."
303,0305,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,"as you know, under the current medicare shared..."
465,0467,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"see attached file(s)october 15, 2018 ms. seema..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,0128,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,"as you know, under the current medicare shared..."
346,0348,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,attached are comments submitted on behalf of t...
177,0179,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,"as you know, under the current medicare shared..."
167,0169,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,the honorable seema verma administrator cent...


In [177]:
data11.sum(axis=0)

Name    0002000300040005000600070008000900100011001200...
A2                                                    109
A3                                                    153
A4b                                                    19
A4c                                                   109
A5b                                                    86
A5c                                                    50
A5d                                                    38
A6b                                                    21
A6c                                                    20
A6d2                                                    5
A6d3                                                   18
A7                                                     51
B2a                                                   116
B2b                                                    83
C2                                                    119
C3a                                                    95
C3b           

In [149]:
#stopwords = ['!', '"', "#", "$", "%", "&", "(", ")", "*", "+", ",", "-", ".", "/", ":", ";", "<", "=", ">", "?", "@", "[", "]", "^", "_", "`", "{", "|", "}", "~", "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

SVM with BOW Vector

In [183]:
svm = SGDClassifier(random_state=44)
bow_vector = CountVectorizer(tokenizer=TreebankWordTokenizer().tokenize, ngram_range=(1,2), stop_words='english')

x_train = bow_vector.fit_transform(train_texts)
y_train = np.array(train.D2)

x_test = bow_vector.transform(test_texts)
y_test = np.array(test.D2)

svm.fit(X=x_train, y=y_train)
svm_preds = svm.predict(x_test)

In [184]:
svm_preds

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0], dtype=int64)

In [185]:
svm_f1 = f1(y_test, svm_preds)

In [186]:
svm_f1

0.7272727272727272

In [187]:
coefs = np.abs(svm.coef_[0])
top_fifteen = np.argpartition(coefs, -15)[-15:]
[(bow_vector.get_feature_names()[feature]) for feature in top_fifteen] 

['(',
 'percent',
 'information',
 'agreement',
 'premier',
 'benchmark',
 '.',
 ')',
 'hospital',
 'performance',
 'cap',
 'beneficiaries',
 'risk',
 'risk adjustment',
 'texas']