In [56]:
#import additional sklearn functions
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from time import time
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [13]:
#bring in test data for model to predict
path_to_file = "C:\\Users\\maweb\\Documents\\ThesisCode\\appeals_query.csv"
appeals_data = pd.read_csv(path_to_file, encoding='latin-1')

In [26]:
#assign the x_train data
X = appeals_data['Judgment'].head(100)
y = appeals_data['Denied']

In [27]:
X.iloc[0]

"Applicant's drug abuse was not mitigated where marijuana use was recent, and had continued after Applicant stated an intent to refrain from drug use in the future. He falsified his drug abuse history on security questionnaires in March and October 1995 an"

In [28]:
y.iloc[0]

True

In [29]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(X.tolist())


In [30]:
vectorizer.get_feature_names()

['00',
 '000',
 '10',
 '1001',
 '12',
 '13',
 '14',
 '15',
 '18',
 '19',
 '1967',
 '1972',
 '1976',
 '1977',
 '1978',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '20',
 '23',
 '25',
 '26',
 '30',
 '33',
 '35',
 '45',
 '60',
 '80',
 '86',
 '91',
 '94',
 '96',
 'aa',
 'ab',
 'abandonment',
 'able',
 'absence',
 'abstain',
 'abstinence',
 'abstinent',
 'abuse',
 'abused',
 'abuser',
 'abusing',
 'abusive',
 'access',
 'accident',
 'accounts',
 'action',
 'actions',
 'actively',
 'activity',
 'acts',
 'addicted',
 'addiction',
 'addictions',
 'addition',
 'adjudicative',
 'administrative',
 'adverse',
 'affirmed',
 'aftercare',
 'age',
 'ago',
 'alc',
 'alcohol',
 'alcoholism',
 'alford',
 'amd',
 'analysis',
 'anecdotal',
 'annual',
 'annually',
 'ap',
 'appeal',
 'appealing',
 'appeared',
 'applic',
 'applicant',
 'applicants',
 'application',
 'applications',
 'appllicant',
 'apply',
 'a

In [31]:
X[:1]

<1x758 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [33]:
X.shape

(100, 758)

In [37]:
y = y.replace(True,1)

In [38]:
y.value_counts()

1.0    10862
0.0     9652
Name: Denied, dtype: int64

In [39]:
y = y.head(100)

In [40]:
X.shape

(100, 758)

In [42]:
y.shape

(100,)

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [45]:
X_train.shape

(67, 758)

In [46]:
X_test.shape

(33, 758)

In [61]:
classifiers = [RandomForestClassifier(n_estimators=5), GaussianNB(), LogisticRegression(solver='liblinear')]
clf_names = ['RandomForest','GausianNB','LogisticRegression']
metric_names = ['roc_auc','f1','accuracy','precision','recall']

scv = StratifiedKFold(n_splits=3)

scores_df = pd.DataFrame(index=metric_names,columns=clf_names)
clf_scores = []
for clf, name in zip(classifiers, clf_names):
    print('clf: ',clf)
    for metric in metric_names:
        score = cross_val_score(clf,X.toarray(),y,scoring=metric, cv=scv).mean()
        clf_scores.append(score)
        print('{} score: {}',metric, score)
    scores_df[name] = clf_scores
    clf_scores = []


clf:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
{} score: {} roc_auc 0.808802308802309
{} score: {} f1 0.8349457229700391
{} score: {} accuracy 0.7598039215686274
{} score: {} precision 0.6875
{} score: {} recall 0.9689754689754689
clf:  GaussianNB(priors=None, var_smoothing=1e-09)
{} score: {} roc_auc 0.7236652236652237
{} score: {} f1 0.842962962962963
{} score: {} accuracy 0.7787990196078431
{} score: {} precision 0.7898550724637681
{} score: {} recall 0.906926406926407
clf:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='

In [60]:
X.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.18726624],
       [0.        , 0.        , 0.        , ..., 0.        , 0.18282133,
        0.1713554 ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.3196671 ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])