In [1]:
import numpy as np
import os
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.base import clone, BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, auc

In [2]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
    
class CustomLabelBinarizer(LabelBinarizer):
    def fit(self, X, y=None):
        return super().fit(X)
    def fit_transform(self, X, y=None):
        return super().fit_transform(X)

In [3]:
# Define the format of your input data including unused columns (These are the columns from the census data files)
COLUMNS = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income-level'
]

Label_Column = 'income-level'

column_set = set(COLUMNS)
column_set.remove(Label_Column)
Feature_Columns = list(column_set)

Categorial_Columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country'
]

category_set = set(Categorial_Columns)
numerical_set = column_set.difference(category_set)

Numerical_Columns = list(numerical_set)

In [4]:
train_data = '/Users/luoshixin/Downloads/data/adult.data.csv'
test_data = '/Users/luoshixin/Downloads/data/adult.test.csv'
raw_training_data = pd.read_csv(train_data, header=None, names=COLUMNS)
raw_test_data = pd.read_csv(test_data, header=None, names=COLUMNS)

In [5]:
raw_training_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income-level
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
train_labels = (raw_training_data['income-level'] == ' >50K')
test_labels = (raw_test_data['income-level'] == ' >50K')

In [13]:
feature_pipeline = []

for colunm in Categorial_Columns:
    ds = DataFrameSelector([colunm])
    cb = CustomLabelBinarizer()
    column_cat_pipeline = Pipeline([
        ('selector', ds),
        ('label_binarizer', cb)
    ])
    feature_pipeline.append(('Category_{}'.format(colunm), column_cat_pipeline))

nds = DataFrameSelector(Numerical_Columns)
num_pipeline = Pipeline([
    ('selector', nds),
])

feature_pipeline.append(('num_pipeline', num_pipeline))

features = FeatureUnion(feature_pipeline)

In [14]:
classifier = RandomForestClassifier()

classifier.fit(features.fit_transform(raw_training_data), train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [15]:
accuracy_score(classifier.predict(features.transform(raw_test_data)), test_labels)

0.8457427202359012

In [16]:
full_pipeline = Pipeline([
    ('feature', features),
    ('classifier', classifier)
])

In [17]:
joblib.dump(full_pipeline, 'model.joblib')

['model.joblib']

In [None]:
PROJECT_ID = 'woven-rush-197905'
VERSION_NAME = 'v2'
MODEL_NAME = 'scikit'

service = googleapiclient.discovery.build('ml', 'v1')
name = 'projects/{}/models/{}'.format(PROJECT_ID, MODEL_NAME)
name += '/versions/{}'.format(VERSION_NAME)

# Due to the size of the data, it needs to be split in 2
first_half = test_features[:int(len(test_features)/2)]
second_half = test_features[int(len(test_features)/2):]

complete_results = []
for data in [first_half, second_half]:
    responses = service.projects().predict(
        name=name,
        body={'instances': data}
    ).execute()

    if 'error' in responses:
        print(response['error'])
    else:
        complete_results.extend(responses['predictions'])
        
# Print the first 10 responses
for i, response in enumerate(complete_results[:10]):
    print('Prediction: {}\tLabel: {}'.format(response, test_labels[i]))