In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('ticks')
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.datasets import make_classification
from sklearn.datasets import make_regression
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [2]:
# Load the two input CSVs from the current working directory.
# clean_df() below treats a frame that has a 'wage' column as labelled data.
df = pd.read_csv('./cheap_train_sample.csv')
df_test = pd.read_csv('./test_data.csv')


In [3]:
# Marital-status buckets: 1 = married, 2 = previously married, 3 = never married.
_MARRIAGE_CODES = {
    'Married-civ-spouse': 1,
    'Never-married': 3,
    'Divorced': 2,
    'Separated': 2,
    'Widowed': 2,
    'Married-spouse-absent': 1,
    'Married-AF-spouse': 1,
}

# Integer codes for the known workclass labels; unknown labels are left as-is.
_WORKCLASS_CODES = {
    "Private": 1, "Self-emp-not-inc": 2, "Local-gov": 3, "State-gov": 4,
    "Self-emp-inc": 5, "Federal-gov": 6, "Without-pay": 7,
}


def _encode_features(frame):
    """Add the engineered columns shared by labelled and unlabelled data.

    `frame` is modified in place (callers pass a private copy) and a new
    frame with the occupation dummies appended on the right is returned.
    """
    # infer_objects() turns an all-integer object column back into int64 on
    # pandas versions where replace() no longer downcasts silently.
    frame['sex'] = frame['sex'].replace({'Male': 0, 'Female': 1}).infer_objects()

    frame['marriage_code'] = frame['marital-status'].map(_MARRIAGE_CODES)
    # Unmapped statuses become NaN in marriage_code and therefore 0 in all
    # three indicator columns.
    frame['married'] = (frame['marriage_code'] == 1).astype(int)
    frame['not_married'] = (frame['marriage_code'] == 2).astype(int)
    frame['never_married'] = (frame['marriage_code'] == 3).astype(int)

    frame['workclass'] = frame['workclass'].replace(_WORKCLASS_CODES).infer_objects()

    # drop_first removes the alphabetically first occupation level present in
    # *this* frame, so the dropped level can differ between frames.
    occupation_dummies = pd.get_dummies(frame['occupation'], drop_first=True)
    return pd.concat([frame, occupation_dummies], axis=1)


def clean_df(df):
    """Strip, encode and one-hot a raw census-style frame for modelling.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'sex', 'marital-status', 'workclass' and 'occupation'.
        If a 'wage' column is present the frame is treated as labelled
        training data and must also contain 'native-country'.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with
        * surrounding whitespace removed from every text column,
        * 'sex' and 'workclass' replaced by integer codes where known,
        * 'marriage_code', 'married', 'not_married', 'never_married' added,
        * one indicator column per occupation level (first level dropped).

        When 'wage' is present it is encoded as 0 ('<=50K') / 1 ('>50K') and
        rows whose workclass, occupation or native-country is '?' are
        removed. Frames without 'wage' keep every row so that predictions
        stay aligned with the input.
    """
    def _is_text(col):
        # Plain object columns as well as pandas' dedicated string dtypes.
        return pd.api.types.is_object_dtype(col) or pd.api.types.is_string_dtype(col)

    # DataFrame.apply builds a new frame, so the caller's data is untouched.
    cleaned = df.apply(lambda col: col.str.strip() if _is_text(col) else col)

    if 'wage' in cleaned:
        cleaned['wage'] = cleaned['wage'].replace({'<=50K': 0, '>50K': 1}).infer_objects()
        has_known_values = (
            (cleaned['workclass'] != '?')
            & (cleaned['occupation'] != '?')
            & (cleaned['native-country'] != '?')
        )
        # Explicit copy: the encoding step assigns columns, which on a
        # filtered view would raise SettingWithCopyWarning.
        cleaned = cleaned[has_known_values].copy()

    return _encode_features(cleaned)

In [4]:
def cm_output(model,X_train,y_train,X_test,y_test,y_pred):
    """Print a short evaluation report for a fitted classifier.

    Reports, in order: the mean 5-fold cross-validation score of `model` on
    the training data, `model.score` on the training and test sets, the
    confusion matrix of `y_test` against `y_pred`, and sensitivity,
    specificity, accuracy and F1 derived from it.

    The confusion matrix is unpacked into four cells, so `y_test`/`y_pred`
    must yield a 2x2 matrix. Nothing is returned.
    """
    cv_mean = cross_val_score(model, X_train, y_train, cv=5).mean()
    print(f"cvs score =   {cv_mean}")
    print(f"train score = {model.score(X_train, y_train)}")
    print(f"test score  = {model.score(X_test, y_test)}")

    # Rows are actual classes, columns are predicted classes.
    matrix = confusion_matrix(y_test, y_pred)
    row_labels = ['Actual N (ineligible)', 'Actual P (eligible)']
    col_labels = ['Predicted N (ineligible)', 'Predicted P (eligible)']
    table = pd.DataFrame(matrix, columns=col_labels, index=row_labels)
    tn, fp, fn, tp = matrix.ravel()

    print('Confusion Matrix:')
    print(table)

    sensitivity = tp / (tp + fn)
    specificity = tn/(tn+fp)
    accuracy = (tp+tn)/(tp+tn+fp+fn)
    f1 = f1_score(y_test, y_pred, labels=np.unique(y_pred))
    print(f"Sensitivity = {sensitivity}")
    print(f"Specificity = {specificity}")
    print(f"Accuracy = {accuracy}")
    print(f"F1 score = {f1}")
    print('------------------------------------')
    print()

In [5]:
# Columns fed to the model, in this order.
features = [
    # columns taken directly from the input data
    'age', 'education-num', 'sex', 'hours-per-week', 'fnlwgt',
    # marital-status indicators added by clean_df
    'married', 'not_married', 'never_married',
    'capital-gain', 'capital-loss',
    # occupation indicator columns produced by pd.get_dummies in clean_df
    'Armed-Forces',
    'Craft-repair',
    'Exec-managerial',
    'Farming-fishing',
    'Handlers-cleaners',
    'Machine-op-inspct',
    'Other-service',
    'Priv-house-serv',
    'Prof-specialty',
    'Protective-serv',
    'Sales',
    'Tech-support',
    'Transport-moving',
]

In [6]:
# Cleaned training frame; clean_df drops rows with '?' when 'wage' is present.
df_dd = clean_df(df)

In [7]:
# Cleaned test frame. NOTE(review): assumes test_data.csv has no 'wage' column,
# in which case clean_df keeps every row — confirm against the file.
df_test_ = clean_df(df_test)

In [8]:
# Display the cleaned test frame for a visual check.
df_test_

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,...,Portugal,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia
0,25,1,226802,11th,7,Never-married,Machine-op-inspct,Own-child,0,0,...,0,0,0,0,0,0,0,1,0,0
1,38,1,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,0,0,...,0,0,0,0,0,0,0,1,0,0
2,28,3,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,0,0,...,0,0,0,0,0,0,0,1,0,0
3,44,1,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,0,7688,...,0,0,0,0,0,0,0,1,0,0
4,18,?,103497,Some-college,10,Never-married,?,Own-child,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,1,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,1,0,...,0,0,0,0,0,0,0,1,0,0
16277,64,?,321403,HS-grad,9,Widowed,?,Other-relative,0,0,...,0,0,0,0,0,0,0,1,0,0
16278,38,1,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,0,0,...,0,0,0,0,0,0,0,1,0,0
16279,44,1,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,0,5455,...,0,0,0,0,0,0,0,1,0,0


In [9]:
# Feature matrix (columns listed in `features`) and target from the cleaned training frame.
X = df_dd[features]
y = df_dd['wage']

In [10]:
# Hold out 33% of the labelled rows; random_state fixes the shuffle.
# NOTE(review): in the cells visible here the grid search is fitted on X, y,
# not on X_train — confirm whether this split is still needed.
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.33, random_state=42)


In [11]:
# Single-step pipeline; the step name 'rf' is what gives the grid keys
# their 'rf__' prefix.
pipe_tree = Pipeline([
                 ('rf', RandomForestClassifier(random_state = 42))
])
rf_params = {
    'rf__n_estimators': [100,150,200],
    'rf__max_depth': [9,10,11,12,13]
}
gs_dr = GridSearchCV(pipe_tree, param_grid=rf_params, cv=5)
# Fitted on every labelled row so the model used for the submission sees all
# available data.
gs_dr.fit(X, y)
# Accuracy on the same rows the model was fitted on: an optimistic number,
# not an estimate of performance on unseen data.
print(gs_dr.score(X, y))
# Mean 5-fold cross-validated accuracy of the selected parameters: the
# figure to quote for expected out-of-sample performance.
print(f"best cross-validated score = {gs_dr.best_score_}")


0.8879496104757169


In [12]:
# Parameter combination selected by the grid search.
gs_dr.best_params_

{'rf__max_depth': 13, 'rf__n_estimators': 100}

In [13]:
# Select the model columns from the cleaned test frame. reindex (rather than
# df_test_[features]) keeps the column order and fills a column that is absent
# from the test frame with 0 instead of raising KeyError; this is meant for
# occupation indicator columns whose level does not occur in the test file.
y_pred = gs_dr.predict(df_test_.reindex(columns=features, fill_value=0))

In [14]:
# Wrap the predictions in a single-column frame named 'wage'.
Y_HAT = pd.DataFrame(y_pred,columns=['wage'])

In [15]:
# Display the prediction frame for a visual check.
Y_HAT

Unnamed: 0,wage
0,0
1,0
2,0
3,1
4,0
...,...
16276,0
16277,0
16278,1
16279,0


In [16]:
# Write the predictions to the working directory. index=False is not passed,
# so the row index is written as an extra first column.
Y_HAT.to_csv("./does_matter.csv")