In [1]:
#import additional sklearn functions
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from time import time
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


In [2]:
#bring in test data for model to predict
path_to_file = "C:\\Users\\maweb\\Documents\\ThesisCode\\appeals_query.csv"
appeals_data = pd.read_csv(path_to_file, encoding='latin-1')

In [6]:
X = appeals_data.select_dtypes('bool')
y = X['Denied']

In [8]:
X = X.drop('Denied',axis=1)

In [9]:
X.iloc[0]

Smith                     False
Female                    False
Position_Eligibility      False
No_Falsification          False
Rebut_Falsification       False
Falsification(s)           True
Domestic_Violence         False
Previous_Clearance        False
Traumatic_Life_Event      False
Caused_Death              False
Child_Sexual_Abuse        False
Child_Pornography         False
Prostitutes               False
Fmr_Military_LawE         False
Adverse_Affirmed          False
Favorable_Affirmed        False
Granted                   False
Failed_to_Mitigate        False
Success_to_Mitigate       False
Adverse_Reversed          False
Revoked_Fav_Reversed      False
Adverse_Remanded          False
Favorable_Remanded        False
Remanded_wInstructions    False
Recommend_Waiver          False
Decision_Other            False
Decision_Unknown          False
Security_Violations       False
Foreign_Influence         False
Foreign_Preference        False
Sexual_Behavior           False
Personal

In [10]:
y.iloc[0]

True

In [16]:
y = y.replace(True,1).head(100)

In [17]:
y.value_counts()

1.0    65
0.0    35
Name: Denied, dtype: int64

In [19]:
X.replace(to_replace=True,value=1)

Unnamed: 0,Smith,Female,Position_Eligibility,No_Falsification,Rebut_Falsification,Falsification(s),Domestic_Violence,Previous_Clearance,Traumatic_Life_Event,Caused_Death,...,Alcohol,Drugs,Emotional_Mental,Criminal_Conduct,Handling_PI,Outside_Activities,Use_InfoSys,Deception,CAC,Unknown_Guideline
0,False,False,False,False,False,True,False,False,False,False,...,False,True,False,True,False,False,False,False,False,False
1,False,True,False,True,False,False,False,False,False,False,...,False,True,False,True,False,False,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
4,False,False,False,False,False,True,False,False,False,False,...,True,True,False,True,False,False,False,False,False,False
5,False,False,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,True,False,False,False,False,...,False,True,False,True,False,False,False,False,False,False
7,False,False,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
8,False,False,False,True,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
9,False,False,False,False,False,True,False,True,False,False,...,False,True,False,True,False,False,False,False,False,False


In [31]:
X[:1]

<1x758 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [33]:
X.shape

(100, 758)

In [40]:
X.shape

(100, 758)

In [42]:
y.shape

(100,)

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [45]:
X_train.shape

(67, 758)

In [46]:
X_test.shape

(33, 758)

In [61]:
classifiers = [RandomForestClassifier(n_estimators=5), GaussianNB(), LogisticRegression(solver='liblinear')]
clf_names = ['RandomForest','GausianNB','LogisticRegression']
metric_names = ['roc_auc','f1','accuracy','precision','recall']

scv = StratifiedKFold(n_splits=3)

scores_df = pd.DataFrame(index=metric_names,columns=clf_names)
clf_scores = []
for clf, name in zip(classifiers, clf_names):
    print('clf: ',clf)
    for metric in metric_names:
        score = cross_val_score(clf,X.toarray(),y,scoring=metric, cv=scv).mean()
        clf_scores.append(score)
        print('{} score: {}',metric, score)
    scores_df[name] = clf_scores
    clf_scores = []


clf:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
{} score: {} roc_auc 0.808802308802309
{} score: {} f1 0.8349457229700391
{} score: {} accuracy 0.7598039215686274
{} score: {} precision 0.6875
{} score: {} recall 0.9689754689754689
clf:  GaussianNB(priors=None, var_smoothing=1e-09)
{} score: {} roc_auc 0.7236652236652237
{} score: {} f1 0.842962962962963
{} score: {} accuracy 0.7787990196078431
{} score: {} precision 0.7898550724637681
{} score: {} recall 0.906926406926407
clf:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='

In [60]:
X.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.18726624],
       [0.        , 0.        , 0.        , ..., 0.        , 0.18282133,
        0.1713554 ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.3196671 ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])