# Classifying Text Values

In [235]:
#import additional sklearn functions
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from time import time
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

1. Import data and set X & y

In [236]:
#bring in test data for model to predict
path_to_file = "https://raw.githubusercontent.com/mawebster9/ThesisCode/master/appeals_query.csv"
appeals_data = pd.read_csv(path_to_file, encoding='latin-1')

In [237]:
#assign the x_train data
X = appeals_data['Judgment'].head(500)
y = appeals_data['Denied']

In [238]:
X.iloc[0]

"Applicant's drug abuse was not mitigated where marijuana use was recent, and had continued after Applicant stated an intent to refrain from drug use in the future. He falsified his drug abuse history on security questionnaires in March and October 1995 an"

In [239]:
y.iloc[0]

True

2. Set up bag of words for Judgment field

In [240]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(X.tolist())


In [241]:
vectorizer.get_feature_names()

['00',
 '000',
 '10',
 '1001',
 '11',
 '12',
 '13',
 '14',
 '15',
 '154',
 '16',
 '17',
 '18',
 '19',
 '1959',
 '1965',
 '1966',
 '1967',
 '1969',
 '1970',
 '1970s',
 '1971',
 '1972',
 '1973',
 '1974',
 '1975',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1980s',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '199',
 '1990',
 '1990s',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '2001',
 '2003',
 '2005',
 '2006',
 '2007',
 '203',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '33',
 '35',
 '36',
 '38',
 '40',
 '401k',
 '45',
 '50',
 '548',
 '59',
 '60',
 '700',
 '80',
 '81',
 '83',
 '86',
 '88',
 '91',
 '94',
 '96',
 'a10',
 'aa',
 'ab',
 'abandoning',
 'abandonment',
 'abilities',
 'ability',
 'able',
 'absence',
 'absent',
 'absolutely',
 'absolve',
 'absolving',
 'absorb',
 'abstain',
 'abstained',
 'abstention',
 'abstin',
 'abstinence',
 'abstinent',
 'abuse',
 'abused',

In [242]:
X[:1]

<1x2117 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [243]:
X.shape

(500, 2117)

3. Change True values to 1.0 and False values to 0.0

In [244]:
y = y.replace(True,1).head(500)

In [245]:
y.value_counts()

0.0    260
1.0    240
Name: Denied, dtype: int64

In [246]:
X.shape

(500, 2117)

In [247]:
y.shape

(500,)

4. Run train_test_split on the current X & y

In [248]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [249]:
X_train.shape

(335, 2117)

In [250]:
X_test.shape

(165, 2117)

5. Run classifiers on current data

In [251]:
classifiers = [RandomForestClassifier(n_estimators=5), GaussianNB(), LogisticRegression(solver='liblinear'), DecisionTreeClassifier(criterion='gini'), KNeighborsClassifier(n_neighbors=6)]
clf_names = ['RandomForest','GausianNB','LogisticRegression','DecisionTreeClassRegressor', 'KNeighbors']
metric_names = ['roc_auc','f1','accuracy','precision','recall']

scv = StratifiedKFold(n_splits=3)

scores_df = pd.DataFrame(index=metric_names,columns=clf_names)
clf_scores = []
for clf, name in zip(classifiers, clf_names):
    print('-------------------------------------------------------------------')
    print('clf: ',clf)
    print('')
    for metric in metric_names:
        score = cross_val_score(clf,X.toarray(),y,scoring=metric, cv=scv).mean()
        clf_scores.append(score)
        print('\t*',metric,' score: ', score)
    scores_df[name] = clf_scores
    clf_scores = []


-------------------------------------------------------------------
clf:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

	* roc_auc  score:  0.8499880268199234
	* f1  score:  0.749140765556176
	* accuracy  score:  0.7981266382896858
	* precision  score:  0.7799313751405927
	* recall  score:  0.8250000000000001
-------------------------------------------------------------------
clf:  GaussianNB(priors=None, var_smoothing=1e-09)

	* roc_auc  score:  0.7418064465829101
	* f1  score:  0.7447255088625958
	* accuracy  score:  0.7401341894524206
	* precision  score:  0.7199516908212561
	* recall  score:  0.7791666666666667