# Classifying Boolean Values

In [201]:
#import additional sklearn functions
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from time import time
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

1. Import data and set X & y

In [202]:
#bring in test data for model to predict
path_to_file = "https://raw.githubusercontent.com/mawebster9/ThesisCode/master/appeals_query.csv"
appeals_data = pd.read_csv(path_to_file, encoding='latin-1')

In [203]:
X = appeals_data.select_dtypes('bool')
y = X['Denied']

In [204]:
X = X.drop('Denied',axis=1)

In [205]:
X.iloc[0]

Smith                     False
Female                    False
Position_Eligibility      False
No_Falsification          False
Rebut_Falsification       False
Falsification(s)           True
Domestic_Violence         False
Previous_Clearance        False
Traumatic_Life_Event      False
Caused_Death              False
Child_Sexual_Abuse        False
Child_Pornography         False
Prostitutes               False
Fmr_Military_LawE         False
Adverse_Affirmed          False
Favorable_Affirmed        False
Granted                   False
Failed_to_Mitigate        False
Success_to_Mitigate       False
Adverse_Reversed          False
Revoked_Fav_Reversed      False
Adverse_Remanded          False
Favorable_Remanded        False
Remanded_wInstructions    False
Recommend_Waiver          False
Decision_Other            False
Decision_Unknown          False
Security_Violations       False
Foreign_Influence         False
Foreign_Preference        False
Sexual_Behavior           False
Personal

2. Change True values to 1.0 and False values to 0.0

In [206]:
X.columns.tolist()
for i in (X.columns.tolist()):
    X[i] = X[i].replace(True,1)

In [207]:
X.head(10)

Unnamed: 0,Smith,Female,Position_Eligibility,No_Falsification,Rebut_Falsification,Falsification(s),Domestic_Violence,Previous_Clearance,Traumatic_Life_Event,Caused_Death,...,Alcohol,Drugs,Emotional_Mental,Criminal_Conduct,Handling_PI,Outside_Activities,Use_InfoSys,Deception,CAC,Unknown_Guideline
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [208]:
X.iloc[0]

Smith                     0.0
Female                    0.0
Position_Eligibility      0.0
No_Falsification          0.0
Rebut_Falsification       0.0
Falsification(s)          1.0
Domestic_Violence         0.0
Previous_Clearance        0.0
Traumatic_Life_Event      0.0
Caused_Death              0.0
Child_Sexual_Abuse        0.0
Child_Pornography         0.0
Prostitutes               0.0
Fmr_Military_LawE         0.0
Adverse_Affirmed          0.0
Favorable_Affirmed        0.0
Granted                   0.0
Failed_to_Mitigate        0.0
Success_to_Mitigate       0.0
Adverse_Reversed          0.0
Revoked_Fav_Reversed      0.0
Adverse_Remanded          0.0
Favorable_Remanded        0.0
Remanded_wInstructions    0.0
Recommend_Waiver          0.0
Decision_Other            0.0
Decision_Unknown          0.0
Security_Violations       0.0
Foreign_Influence         0.0
Foreign_Preference        0.0
Sexual_Behavior           0.0
Personal_Conduct          1.0
Financial                 0.0
Alcohol   

In [209]:
y = y.replace(True,1)

In [210]:
y.head(10)

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
5    0.0
6    1.0
7    0.0
8    1.0
9    1.0
Name: Denied, dtype: float64

In [211]:
y.iloc[0]

1.0

In [212]:
y.value_counts()

1.0    10862
0.0     9652
Name: Denied, dtype: int64

In [213]:
X.shape

(20514, 43)

In [214]:
y.shape

(20514,)

3. Run train_test_split on the current X & y

In [215]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [216]:
X_train.shape

(13744, 43)

In [217]:
X_test.shape

(6770, 43)

4. Run classifiers on current data

In [218]:
classifiers = [RandomForestClassifier(n_estimators=5), GaussianNB(), LogisticRegression(solver='liblinear'), DecisionTreeClassifier(criterion='gini'), KNeighborsClassifier(n_neighbors=6)]
clf_names = ['RandomForest','GausianNB','LogisticRegression','DecisionTreeClassRegressor', 'KNeighbors']
metric_names = ['roc_auc','f1','accuracy','precision','recall']

scv = StratifiedKFold(n_splits=3)

scores_df = pd.DataFrame(index=metric_names,columns=clf_names)
clf_scores = []
for clf, name in zip(classifiers, clf_names):
    print('-------------------------------------------------------------------')
    print('clf: ',clf)
    print('')
    for metric in metric_names:
        score = cross_val_score(clf,X,y,scoring=metric, cv=scv).mean()
        clf_scores.append(score)
        print('\t*',metric,' score: ', score)
    scores_df[name] = clf_scores
    clf_scores = []


-------------------------------------------------------------------
clf:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

	* roc_auc  score:  0.9972964792534058
	* f1  score:  0.9894842046443553
	* accuracy  score:  0.9927364322669961
	* precision  score:  0.9983269686266153
	* recall  score:  0.991530070903157
-------------------------------------------------------------------
clf:  GaussianNB(priors=None, var_smoothing=1e-09)

	* roc_auc  score:  0.8636564070817899
	* f1  score:  0.8907724906494924
	* accuracy  score:  0.8704304545373539
	* precision  score:  0.8046325888620532
	* recall  score:  0.9976065543588328