In [12]:
import requests
r = requests.get('http://localhost:8000/')
r.text

'Welcome to the Adult Income Prediction API'

In [36]:
import requests

sample_dict = {     
                    "workclass": "state_gov",
                    "education": "bachelors",
                    "marital_status": "never_married",
                    "occupation": "adm_clerical",
                    "relationship": "not_in_family",
                    "race": "white",
                    "sex": "male",
                    "native_country": "united_states",
                    "age": 39,
                    "fnlwgt": 77516,
                    "education_num": 13,
                    "capital_gain": 10000,
                    "capital_loss": 0,
                    "hours_per_week": 40
                }
response = requests.post('http://localhost:8000/predict', json=sample_dict)
print(response)
print(response.text)

<Response [200]>
The predicted income is: >50k


In [37]:
import requests

sample_dict = {
                    "workclass": "string",
                    "education": "string",
                    "marital_status": "string",
                    "occupation": "string",
                    "relationship": "string",
                    "race": "string",
                    "sex": "string",
                    "native_country": "string",
                    "age": 10,
                    "fnlwgt": 10,
                    "education_num": 10,
                    "capital_gain": 10,
                    "capital_loss": 10,
                    "hours_per_week": 10
                }
response = requests.post('http://localhost:8000/predict', json=sample_dict)
print(response)
print(response.text)

<Response [422]>
Please enter all the data


In [31]:
(list(sample_dict.values()))

['string',
 'string',
 'string',
 'string',
 'string',
 'string',
 'string',
 'string',
 10,
 10,
 10,
 10,
 10,
 10]

In [33]:
any(list(sample_dict.values())) == "string"

False

In [2]:
from operator import mod
import pandas as pd
import logging
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
import pickle

#logging:
logging.basicConfig(level=logging.INFO)


def load_data(data_path):
    '''
    Load data from data_path
    '''
    #extracting columns from adult.names
    cols = []
    logging.info(f'Extracting columns from adult.names')
    with open(f'{data_path}/adult.names', 'r') as names:
        for line in names:
            if ':' in line and '|' not in line:
                cols.append(line.split(':')[0])
    logging.info(f'Columns: {cols}')

    #load train data:
    df_train = pd.read_csv(f'{data_path}/adult.data', names=cols+['earn_over_50k'], index_col=False)
    logging.info(f'Train data shape: {df_train.shape}')
    #load test data:
    df_test = pd.read_csv(f'{data_path}/adult.test', names=cols+['earn_over_50k'], index_col=False, skiprows=[0])
    logging.info(f'Test data shape: {df_test.shape}')

    #convert train target to 0/1:
    df_train['earn_over_50k'] = df_train['earn_over_50k'].apply(lambda x: 1 if x == ' >50K' else 0)
    logging.info(f'Train target counts: {df_train["earn_over_50k"].value_counts()}')
    #convert test target to 0/1:
    df_test['earn_over_50k'] = df_test['earn_over_50k'].apply(lambda x: 1 if x == ' >50K.' else 0)
    logging.info(f'Test target counts: {df_test["earn_over_50k"].value_counts()}')

    return df_train, df_test
df_train, df_test = load_data('./data')







INFO:root:Extracting columns from adult.names
INFO:root:Columns: ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
INFO:root:Train data shape: (32561, 15)
INFO:root:Test data shape: (16281, 15)
INFO:root:Train target counts: 0    24720
1     7841
Name: earn_over_50k, dtype: int64
INFO:root:Test target counts: 0    12435
1     3846
Name: earn_over_50k, dtype: int64


In [5]:
def basic_cleaning(df):
    '''
    Basic cleaning of data-
    '''
    #fixing column names:
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('-', '_')
    logging.info(f'Columns: {df.columns}')

    #filter categorical columns and numerical columns:
    cat_cols = df.select_dtypes(include='object').columns.tolist()
    num_cols = df.select_dtypes(exclude='object').columns.tolist()
    logging.info(f'Categorical columns: {cat_cols}')
    logging.info(f'Numerical columns: {num_cols}')
    
     #replacing spaces & - with underscore in categorical columns:
    for col in cat_cols:
        df[col] = df[col].str.strip().str.lower().str.replace(' ', '_').str.replace('-', '_')

    #replacing ? with nan:
    logging.info(f'Before replacing ? with nan: {df.isin(["?"]).sum()}')
    df = df.replace('?', pd.np.nan)
    logging.info(f'Nan values: {df.isna().sum()}')

    #fill nan with mode for categorical columns:
    for col in cat_cols:
        if col != 'earn_over_50k':
            df[col] = df[col].fillna(df[col].mode()[0])
        else:
            cat_cols.remove(col)

    #fill nan with mean for numerical columns:
    for col in num_cols:
        if col != 'earn_over_50k':
            df[col] = df[col].fillna(df[col].mean())
        else:
            num_cols.remove(col)

    return df, cat_cols + num_cols

train, cols = basic_cleaning(df_train)



INFO:root:Columns: Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'earn_over_50k'],
      dtype='object')
INFO:root:Categorical columns: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
INFO:root:Numerical columns: ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week', 'earn_over_50k']
INFO:root:Before replacing ? with nan: age                  0
workclass         1836
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     583
earn_over_50k        0
dtype: int64
  df = df.replace('?', pd.np.nan)
INFO:root:Nan values:

In [26]:
train.isnull().sum().sum()

0

In [12]:
train.shape

(32561, 15)

In [14]:
def train_model(df_train, cols):
    '''
    Trains model on train data
    '''
    logging.info(f'Training model on Columns: {cols}')
    
    #Initialize KFold:
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    fold = 0
    for train_index, val_index in kf.split(df_train):
        logging.info(f'Fold: {fold}')

        #create train dict:
        train_dict = df_train[cols].iloc[train_index].to_dict(orient='records')
        
        #create val dict:
        val_dict = df_train[cols].iloc[val_index].to_dict(orient='records')
        
        #create train target:
        train_target = df_train['earn_over_50k'].iloc[train_index]

        #create val target:
        val_target = df_train['earn_over_50k'].iloc[val_index]

        #initialize DictVectorizer:
        dv = DictVectorizer(sparse=False)

        #fit DictVectorizer on train dict:
        dv.fit(train_dict)

        #transform train dict:
        X_train = dv.transform(train_dict)

        #transform val dict:
        X_val = dv.transform(val_dict)

        #initialize & fit LogisticRegression:
        model = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
        model.fit(X_train, train_target)

        #predict on val data:
        y_pred = model.predict_proba(X_val)[:, 1]

        #calculate roc_auc_score:
        score = roc_auc_score(val_target, y_pred)
        scores.append(score)

        logging.info(f'Fold: {fold}, Score: {score}')
        fold += 1

    logging.info(f'Mean Score: {np.mean(scores)}')

    return dv, model

dv, model = train_model(train, cols)
        


INFO:root:Training model on Columns: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
INFO:root:Fold: 0
INFO:root:Fold: 0, Score: 0.6054567805126353
INFO:root:Fold: 1
INFO:root:Fold: 1, Score: 0.5685005439845257
INFO:root:Fold: 2
INFO:root:Fold: 2, Score: 0.5731820109718645
INFO:root:Fold: 3
INFO:root:Fold: 3, Score: 0.6043548070771081
INFO:root:Fold: 4
INFO:root:Fold: 4, Score: 0.58933937652731
INFO:root:Mean Score: 0.5881667038146887


In [22]:




dv = DictVectorizer()
train_dict = train[cols].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
y_train = train.earn_over_50k.values

model = LogisticRegression()
model.fit(X_train, y_train)

In [32]:

def predict_single(sample_dict, dv, model):
    '''
    Predicts on single row of test data
    Sample input:
    {   'workclass': 'state_gov',
        'education': 'bachelors',
        'marital_status': 'never_married',
        'occupation': 'adm_clerical',
        'relationship': 'not_in_family',
        'race': 'white',
        'sex': 'male',
        'native_country': 'united_states',
        'age': 39,
        'fnlwgt': 77516,
        'education_num': 13,
        'capital_gain': 2174,
        'capital_loss': 0,
        'hours_per_week': 40
    }
    '''

    #transform test dict:
    X_test = dv.transform(sample_dict)

    #predict on test data:
    y_pred = model.predict_proba(X_test)[:, 1]

    return y_pred[0]

sample_dict = {     'workclass': 'state_gov',
                    'education': 'bachelors',
                    'marital_status': 'never_married',
                    'occupation': 'adm_clerical',
                    'relationship': 'not_in_family',
                    'race': 'white',
                    'sex': 'male',
                    'native_country': 'united_states',
                    'age': 39,
                    'fnlwgt': 77516,
                    'education_num': 13,
                    'capital_gain': 2174,
                    'capital_loss': 0,
                    'hours_per_week': 40
                }

predict_single(sample_dict, dv, model)

0.5478402179858726

In [40]:
test, cols = basic_cleaning(df_test)

def predict_batch(df_test, dv, model, cols):
    '''
    Predicts on test data
    '''
    #create test dict:
    test_dict = df_test[cols].to_dict(orient='records')

    #transform test dict:
    X_test = dv.transform(test_dict)
    X_test[0]

    #predict on test data:
    y_pred = model.predict_proba(X_test)[:, 1]

    return y_pred

y_pred = predict_batch(test, dv, model, cols)


INFO:root:Columns: Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'earn_over_50k'],
      dtype='object')
INFO:root:Categorical columns: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
INFO:root:Numerical columns: ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week', 'earn_over_50k']
INFO:root:Before replacing ? with nan: age                 0
workclass         963
fnlwgt              0
education           0
education_num       0
marital_status      0
occupation        966
relationship        0
race                0
sex                 0
capital_gain        0
capital_loss        0
hours_per_week      0
native_country    274
earn_over_50k       0
dtype: int64
  df = df.replace('?', pd.np.nan)
INFO:root:Nan values: age           

In [41]:
logging.info(f'Score on test data: {roc_auc_score(df_test.earn_over_50k, y_pred)}')


INFO:root:Score on test data: 0.6196447109995377


In [33]:
df_test   

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,earn_over_50k
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,0
16277,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,0,0,40,United-States,0
16278,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,0
16279,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,0


In [None]:
def save_model(dv, model, model_path):
    '''
    Save model to model_path
    '''
    #save DictVectorizer:
    with open(f'{model_path}/dv.bin', 'wb') as f_out:
        pickle.dump(dv, f_out)
        f_out.close()
    logging.info(f'DictVectorizer saved')

    #save model:
    with open(f'{model_path}/model1.bin', 'wb') as f_out:
        pickle.dump(model, f_out)
        f_out.close()
    logging.info(f'Model saved')

def predict(df_test, dv, model, cols):
    '''
    Predicts on test data
    '''
    #create test dict:
    test_dict = df_test[cols].to_dict(orient='records')

    #transform test dict:
    X_test = dv.transform(test_dict)

    #predict on test data:
    y_pred = model.predict_proba(X_test)[:, 1]

    return y_pred

if __name__ == '__main__':
    #load data:
    df_train, df_test = load_data('./data')
    #clean data:
    df_train, cols = basic_cleaning(df_train)
    df_test, _ = basic_cleaning(df_test)
    logging.info(f'Train data shape: {df_train.shape}')

    #train model:
    dv, model = train_model(df_train, cols)

    #predict on test data:
    y_pred = predict(df_test, dv, model, cols)
    
    #save predictions:
    df_test['earn_over_50k'] = y_pred
    df_test[['earn_over_50k']].to_csv('./data/predictions.csv', index=False)
    logging.info(f'Predictions saved')

    #save model:
    save_model(dv, model, './model')

    logging.info('Done')

{'prediction_proba': 0.5279585459636567,
 'prediction': 1,
 'message': 'Income <=50K'}