## Import coade and load census.csv training data

In [28]:
# Create a pipeline that standardizes the data then creates a model
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from pandas import read_csv
import pandas as pd

# load data

dataframe = read_csv("census.csv")

array = dataframe.values
X=dataframe.loc[:, dataframe.columns != 'income']
y=dataframe['income']
X.head()

Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States
1,50,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States
2,38,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States
3,53,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States
4,28,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba


## Preprocessing steps

In [15]:
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', AdaBoostClassifier())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

param_grid = {
        'classifier__n_estimators': [5,100,300,600],
        'classifier__learning_rate': [.1, .4, 2]
    }

In [16]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_features = dataframe.select_dtypes(include=numerics).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = dataframe.loc[:, dataframe.columns != 'income'].select_dtypes(include=object).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [10]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier',AdaBoostClassifier())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

search = GridSearchCV(clf, param_grid, n_jobs=-1, cv=5)
grid_fit = search.fit(X_train, y_train)

## Run best model on test data

In [35]:
best_clf = search.best_estimator_
best_predictions = best_clf.predict_proba(X_test)[:, 1]
print(f"test data auc score: {roc_auc_score(y_test, best_predictions)}")

test data auc score: 0.9204045714591476


## Run best model on test_census.csv and export results as submission.csv

In [33]:
test_data = pd.read_csv("test_census.csv")
test_predictions = best_clf.predict_proba(test_data)[:, 1]
test_predictions = pd.DataFrame(test_predictions,columns=['income'])

In [34]:
test_predictions.insert(0,'id',test_predictions.reset_index().index)
test_predictions.head()

Unnamed: 0,id,income
0,0,0.495894
1,1,0.501001
2,2,0.499631
3,3,0.50656
4,4,0.496844


In [None]:
test_predictions.to_csv(r'submission.csv',index=False)