In [45]:
# Create a pipeline that standardizes the data then creates a model
from pandas import read_csv
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.ensemble import AdaBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# load data
# "census.csv" is resolved relative to the kernel's working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column. If that is a
# row index that was saved into the CSV (rather than a display artifact),
# pass index_col=0 here so it is not treated as a numeric feature -- confirm.
dataframe = read_csv("census.csv")

# NumPy view of the whole table (features and target together). Not used by
# the cells visible here; kept in case later cells rely on it.
array = dataframe.values

# Features are every column except the target; the target is 'income'.
X = dataframe.drop(columns='income')
y = dataframe['income']
X.head()

Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States
1,50,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States
2,38,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States
3,53,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States
4,28,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba


In [46]:
# create pipeline
# Column lists are derived from the feature columns only. The preprocessor is
# fitted on X, which has no 'income' column, so the target must never appear
# in either list (it would if 'income' were stored as a numeric dtype).
feature_frame = dataframe.loc[:, dataframe.columns != 'income']

# Numeric columns: fill gaps with the column median, then standardize.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_features = feature_frame.select_dtypes(include=numerics).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Object-dtype columns: fill gaps with the literal 'missing', then one-hot
# encode. handle_unknown='ignore' keeps transform() from failing on a
# category that was not present when the encoder was fitted.
categorical_features = feature_frame.select_dtypes(include=object).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group through its own transformer. Columns in neither
# list are dropped (ColumnTransformer's default remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [None]:
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
# Fixed seed so the split and the boosted model are reproducible across
# Restart-and-Run-All.
RANDOM_STATE = 42

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', AdaBoostClassifier(random_state=RANDOM_STATE))])

# Hold out 20% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE)

# The 'classifier__' prefix addresses parameters of the pipeline step named
# 'classifier'.
param_grid = {
        'classifier__n_estimators': [5,100,300,600],
        'classifier__learning_rate': [.1, .4, 2]
    }

search = GridSearchCV(clf, param_grid, n_jobs=-1)

# The search must be fitted before `best_estimator_` exists. With the default
# refit=True, the best parameter combination is refitted on all of X_train.
# Only the training split is used, so X_test stays unseen until evaluation.
search.fit(X_train, y_train)





In [None]:
# Evaluate the refitted best pipeline on the held-out test split.
# Requires `search` to have been fitted; `best_estimator_` does not exist
# on an unfitted GridSearchCV.
best_clf = search.best_estimator_
best_predictions = best_clf.predict(X_test)
# Score the predictions computed above.
print(f"Accuracy score on testing data: {accuracy_score(y_test, best_predictions)}")