In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# Viz
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Feature Engineering
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Model Storage
from sklearn.externals import joblib

# Metrics
from sklearn.metrics import mean_squared_error

# Model Evaluation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit

# NN
import keras as k
from keras.models import Sequential
from keras.layers import Dense, Activation

##
from nn import NeuralNetwork
import nn 

Using TensorFlow backend.


In [2]:
# Load the train / test splits from CSV and report how many rows each holds.
# NOTE(review): both paths are absolute and machine-specific; confirm they
# exist wherever this notebook is re-run.
train_data = pd.read_csv("/data/project2/train.csv")
test_data = pd.read_csv("/data/project2/test.csv")
print("Train Data Rows: ", len(train_data))
print("Test Data Rows: ", len(test_data))

Train Data Rows:  33235
Test Data Rows:  8309


In [3]:
# Preview the first two training rows to inspect the available columns.
train_data.head(2)

Unnamed: 0,Index,Region,Total Food Expenditure,Main Source of Income,Agricultural Household indicator,Bread and Cereals Expenditure,Total Rice Expenditure,Meat Expenditure,Total Fish and marine products Expenditure,Fruit Expenditure,Vegetables Expenditure,Restaurant and hotels Expenditure,Alcoholic Beverages Expenditure,Tobacco Expenditure,"Clothing, Footwear and Other Wear Expenditure",Housing and water Expenditure,Imputed House Rental Value,Medical Care Expenditure,Transportation Expenditure,Communication Expenditure,Education Expenditure,Miscellaneous Goods and Services Expenditure,Special Occasions Expenditure,Crop Farming and Gardening expenses,Total Income from Entrepreneurial Acitivites,Household Head Sex,Household Head Age,Household Head Marital Status,Household Head Highest Grade Completed,Household Head Job or Business Indicator,Household Head Occupation,Household Head Class of Worker,Type of Household,Total Number of Family members,Members with age less than 5 year old,Members with age 5 - 17 years old,Total number of family members employed,Type of Building/House,Type of Roof,Type of Walls,House Floor Area,House Age,Number of bedrooms,Tenure Status,Toilet Facilities,Electricity,Main Source of Water Supply,Number of Television,Number of CD/VCD/DVD,Number of Component/Stereo set,Number of Refrigerator/Freezer,Number of Washing Machine,Number of Airconditioner,"Number of Car, Jeep, Van",Number of Landline/wireless telephones,Number of Cellular phone,Number of Personal Computer,Number of Stove with Oven/Gas Range,Number of Motorized Banca,Number of Motorcycle/Tricycle,Total Household Income
0,22617,CAR,81940,Enterpreneurial Activities,1,44171,40336,9053,4499,5245,6625,2410,145,0,1769,12300,3600,544,2700,636,0,5466,5150,22300,46100,Male,63,Married,Elementary Graduate,With Job/Business,Carpenters and joiners,Worked for private establishment,Extended Family,6,2,1,2,Single house,"Strong material(galvanized,iron,al,tile,concre...",Strong,342,30,2,Own or owner-like possession of house and lot,"Water-sealed, sewer septic tank, used exclusiv...",1,"Protected spring, river, stream, etc",0,0,0,0,1,0,0,0,2,0,0,0,0,115835
1,21389,V - Bicol Region,26176,Other sources of Income,0,14477,13067,722,3707,755,1560,260,280,235,1725,5502,3600,813,228,138,0,4722,0,0,5460,Male,73,Married,No Grade Completed,With Job/Business,Inland and coastal waters fishermen,Self-employed wihout any employee,Single Family,2,0,0,1,Single house,"Light material (cogon,nipa,anahaw)",Light,20,3,0,"Own house, rent-free lot with consent of owner","Water-sealed, sewer septic tank, shared with o...",0,"Own use, tubed/piped deep well",0,0,0,0,0,0,0,0,0,0,0,0,0,44339


In [4]:
# NOTE: despite its name, `numerics` holds the 'object' dtype, so this cell
# lists the string-valued (categorical) columns, not the numeric ones.
numerics = ['object']

newdf = train_data.select_dtypes(include=numerics)
newdf.columns

Index(['Region', 'Main Source of Income', 'Household Head Sex',
       'Household Head Marital Status',
       'Household Head Highest Grade Completed',
       'Household Head Job or Business Indicator', 'Household Head Occupation',
       'Household Head Class of Worker', 'Type of Household',
       'Type of Building/House', 'Type of Roof', 'Type of Walls',
       'Tenure Status', 'Toilet Facilities', 'Main Source of Water Supply'],
      dtype='object')

In [5]:
# Target column predicted by every model in this notebook.
response_variable = 'Total Household Income'

# Columns treated as continuous; these are standardised with StandardScaler.
# NOTE(review): 'Total Fish and  marine products Expenditure' contains a double
# space; presumably this matches the raw CSV header -- confirm.
continuous_vars = ['Total Food Expenditure', 
                   'Bread and Cereals Expenditure', 'Total Rice Expenditure',
                   'Meat Expenditure', 'Total Fish and  marine products Expenditure',
                   'Fruit Expenditure', 'Vegetables Expenditure',
                   'Restaurant and hotels Expenditure', 'Alcoholic Beverages Expenditure',
                   'Tobacco Expenditure', 'Clothing, Footwear and Other Wear Expenditure',
                   'Housing and water Expenditure', 'Imputed House Rental Value',
                   'Medical Care Expenditure', 'Transportation Expenditure','Communication Expenditure', 'Education Expenditure',
                   'Miscellaneous Goods and Services Expenditure', 'Special Occasions Expenditure', 'Crop Farming and Gardening expenses',
                   'Total Income from Entrepreneurial Acitivites', 'House Floor Area']

# Unordered categorical columns; these are dummy-coded by dummyify().
nominal_vars = ['Agricultural Household indicator', 'Region', 'Main Source of Income', 
                'Household Head Sex','Household Head Marital Status',
                'Household Head Highest Grade Completed',
                'Household Head Job or Business Indicator', 'Household Head Occupation',
                'Household Head Class of Worker', 'Type of Household',
                'Type of Building/House', 'Type of Roof', 'Type of Walls',
                'Tenure Status', 'Toilet Facilities', 'Main Source of Water Supply']

# Subset of the continuous columns that interact_all() multiplies with the
# dummy columns derived from interact_nominal_vars.
interact_continuous_vars = ['Total Food Expenditure', 'Vegetables Expenditure',
                   'Alcoholic Beverages Expenditure','Tobacco Expenditure',
                   'Housing and water Expenditure', 'Imputed House Rental Value',
                   'Medical Care Expenditure', 'Education Expenditure',
                   'Crop Farming and Gardening expenses',
                   'Total Income from Entrepreneurial Acitivites', 'House Floor Area']

# Subset of the nominal columns whose dummy columns take part in interactions.
interact_nominal_vars = ['Agricultural Household indicator', 'Main Source of Income', 
                'Household Head Sex','Household Head Marital Status',
                'Household Head Class of Worker',
                'Type of Building/House',
                'Tenure Status']

# Kept alongside the nominal columns but never passed to dummyify(), so it is
# used as-is.
binary_vars =  ['Electricity']

# Count / age style columns; these are rank-hot encoded by rank_hot_encode().
ordinal_vars = ['Household Head Age','Number of bedrooms','House Age','Number of Television', 'Number of CD/VCD/DVD',
                'Total Number of Family members','Number of Component/Stereo set', 'Number of Refrigerator/Freezer',
                'Number of Washing Machine', 'Number of Airconditioner',
                'Number of Car, Jeep, Van', 'Number of Landline/wireless telephones',
                'Number of Cellular phone', 'Number of Personal Computer',
                'Number of Stove with Oven/Gas Range', 'Number of Motorized Banca',
                'Number of Motorcycle/Tricycle','Members with age less than 5 year old','Members with age 5 - 17 years old','Total number of family members employed'] 

In [6]:
#Split the columns into continuous, categorical
# Training split: one frame per variable group, plus the response Series.
train_continuous = train_data[continuous_vars]
train_categorical = train_data[nominal_vars + binary_vars]
train_ordinal = train_data[ordinal_vars]
train_y = train_data[response_variable]

# Test split: same feature groups; no response is extracted for it.
test_continuous = test_data[continuous_vars]
test_categorical = test_data[nominal_vars + binary_vars]
test_ordinal = test_data[ordinal_vars]


# Sanity check on the number of columns in each group.
print('Continuous Set has ' + str(len(train_continuous.columns)) + ' columns')
print('Categorical Set has ' + str(len(train_categorical.columns)) + ' columns')
print('Ordinal Set has ' + str(len(train_ordinal.columns)) + ' columns')

Continuous Set has 22 columns
Categorical Set has 17 columns
Ordinal Set has 20 columns


### Continuous Variables Neural Network

In [7]:
# Scale the continuous variables
# The feature scaler is fitted on the training split only and then reused for
# the test split so both are standardised with the same mean / variance.
# The source index is carried over so later joins with the encoded categorical
# frames align row-for-row.
xScaler = StandardScaler()
train_continuous_scaled = pd.DataFrame(xScaler.fit_transform(train_continuous),
                                       columns=continuous_vars,
                                       index=train_continuous.index)
test_continuous_scaled = pd.DataFrame(xScaler.transform(test_continuous),
                                      columns=continuous_vars,
                                      index=test_continuous.index)

# StandardScaler expects 2-D input, so the response Series is promoted to a
# one-column frame before fitting (a bare 1-D Series is rejected by current
# scikit-learn releases).
yScaler = StandardScaler()
train_y_scaled = pd.DataFrame(yScaler.fit_transform(train_y.to_frame()),
                              index=train_y.index)



In [None]:
# Configure the Neural Network
# Input width equals the number of scaled continuous columns; one "tanh" layer
# of size 30 followed by a single "identity" output layer created with bias=False.
model1 = NeuralNetwork(len(train_continuous_scaled.columns))
model1.add_layer(30, "tanh").add_layer(1, "identity", bias = False)

In [None]:
# Train the Neural Network
# `.values` replaces the deprecated `DataFrame.as_matrix()`; both inputs are
# cast to float and handed over as plain NumPy arrays.
model1.fit(train_continuous_scaled.astype(float).values,
           train_y_scaled.astype(float).values,
           eta = 0.0001, epochs = 10)

In [None]:
# Predict on the scaled test features, map the predictions back to the original
# income scale with yScaler, and write them out via to_submission().
to_submission(yScaler.inverse_transform(model1.predict(xScaler.transform(test_continuous))))

In [None]:
# Estimate model1's RMSE in original income units over repeated random splits,
# training for 25 epochs on each split.
crossval_rmse(model1, train_continuous_scaled, train_y_scaled, yScaler, ep=25)

### Categorical Variable Neural Network

In [15]:
# Dummy-code the nominal columns and rank-hot encode the ordinal columns.
# NOTE: train and test are encoded independently, so a level that occurs in
# only one split yields a column in only that frame (see the differing column
# counts printed by the next cell).
train_categorical_dummied = dummyify(train_categorical, nominal_vars)
test_categorical_dummied = dummyify(test_categorical, nominal_vars)

train_ordinal_encoded = rank_hot_encode(train_ordinal, ordinal_vars)
test_ordinal_encoded = rank_hot_encode(test_ordinal, ordinal_vars)

In [16]:
# Combine the dummy-coded nominal/binary features with the rank-hot ordinal
# features (index-aligned join) and report the resulting widths.
all_categorical = train_categorical_dummied.join(train_ordinal_encoded)
all_categorical_test = test_categorical_dummied.join(test_ordinal_encoded)
print("# Columns for train data =", len(all_categorical.columns))
print("# Columns for test data =", len(all_categorical_test.columns))

# Columns for train data = 798
# Columns for test data = 674


In [None]:
# Configure the Neural Network
# Input width equals the number of encoded categorical columns; one "tanh"
# layer of size 64 followed by a single "identity" output layer with bias=False.
model2 = NeuralNetwork(len(all_categorical.columns))
model2.add_layer(64, "tanh").add_layer(1, "identity", bias = False)

In [None]:
# Train the Neural Network
# Fit the categorical-only network (model2) on the encoded categorical
# features it was sized for. The previous call targeted the `nn` module and an
# undefined `train_categorical_scaled` frame.
model2.fit(all_categorical, train_y_scaled, eta = 0.0001, epochs = 10)

In [None]:
# Estimate model2's RMSE in original income units, 10 epochs per split.
crossval_rmse(model2, all_categorical, train_y_scaled, yScaler, ep=10)

### All Variables Neural Network

In [17]:
# Put the scaled continuous features and the encoded categorical features side
# by side (index-aligned join), then check whether train and test ended up with
# the same number of columns.
train_all_scaled = train_continuous_scaled.join(all_categorical)
test_all_scaled = test_continuous_scaled.join(all_categorical_test)
print("# Columns for train data =", len(train_all_scaled.columns))
print("# Columns for test data =", len(test_all_scaled.columns))
len(test_all_scaled.columns) == len(train_all_scaled.columns)

# Columns for train data = 820
# Columns for test data = 696


False

In [None]:
# Configure the Neural Network
# Three "relu" layers of sizes 64, 32 and 16, then a single "identity" output
# layer created with bias=False; input width is the full training feature count.
model3 = NeuralNetwork(len(train_all_scaled.columns))
model3.add_layer(64, "relu").add_layer(32, "relu").add_layer(16, "relu").add_layer(1, "identity", bias = False)

In [None]:
# Train the Neural Network
# NOTE(review): eta / epochs / l2_value are passed straight to
# NeuralNetwork.fit; presumably learning rate, number of passes and L2 penalty
# strength -- confirm against nn.py.
model3.fit(train_all_scaled, train_y_scaled, eta = 0.0001, epochs = 40, l2_value=5)

In [None]:
# Estimate model3's RMSE in original income units over 3 random splits,
# 10 epochs per split.
crossval_rmse(model3, train_all_scaled, train_y_scaled, yScaler, ep=10, cv=3)

### Any Features Neural Network

In [18]:
# Dummy-code only the nominal columns selected for interactions; the resulting
# column names tell interact_all() which dummy columns to multiply.
interact_categorical_df_train = dummyify(train_categorical[interact_nominal_vars], interact_nominal_vars)
interact_categorical_df_test = dummyify(test_categorical[interact_nominal_vars], interact_nominal_vars)

In [19]:
# Adds interaction variables
# transform_x() appends one continuous x dummy product column for every pair of
# interaction variables; the widths are printed to compare train against test.
train_all = transform_x(train_all_scaled, interact_categorical_df_train)
test_all = transform_x(test_all_scaled, interact_categorical_df_test)
print("# Columns for train all data =", len(train_all.columns))
print("# Columns for test all data =", len(test_all.columns))

# Columns for train all data = 1128
# Columns for test all data = 982


In [20]:
# Find the common columns
# Intersect the train and test column names, then sort so the resulting column
# order is deterministic and identical for both frames.
common_cols = list(set(train_all.columns) & set(test_all.columns))
common_cols.sort()
len(common_cols)

970

In [21]:
# Shrink the train and test datasets to only the common columns
# Columns that exist in only one of the two frames are discarded here.
train_all_final = train_all[common_cols]
test_all_final = test_all[common_cols]
len(train_all_final.columns) == len(test_all_final.columns)

True

In [22]:
# Configure the Neural Network
# Same 64-32-16 "relu" stack as model3, but sized for the columns shared by
# train and test so the fitted model can also be applied to the test frame.
model4 = NeuralNetwork(len(train_all_final.columns))
model4.add_layer(64, "relu").add_layer(32, "relu").add_layer(16, "relu").add_layer(1, "identity", bias = False)

<nn.NeuralNetwork at 0x7f95e9c49278>

In [25]:
# Train the Neural Network
# NOTE(review): eta / epochs / l2_value are passed straight to
# NeuralNetwork.fit; presumably learning rate, number of passes and L2 penalty
# strength -- confirm against nn.py.
model4.fit(train_all_final, train_y_scaled, eta = 0.00001, epochs = 20, l2_value=5)

Inputs have been converted. Training starting now.
Epoch 0 	Training cost: 0.205895395466
Epoch 0 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.0% 
Epoch 1 	Training cost: 0.205894497471
Epoch 1 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.0% 
Epoch 2 	Training cost: 0.205751382998
Epoch 2 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.0% 
Epoch 3 	Training cost: 0.205531732904
Epoch 3 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.0% 
Epoch 4 	Training cost: 0.205536400891
Epoch 4 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.0% 
Epoch 5 	Training cost: 0.204621817578
Epoch 5 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.0% 
E

KeyboardInterrupt: 

In [None]:
def optimize_NN( 
                neurons = [50,100],
                epoches = [20],
                etas = [.01],
                activation_functions = ['relu']):
    """
    DESCRIPTION:
        * Exhaustive grid search over single-hidden-layer networks.
          Every combination is scored with crossval_rmse (cv = 4) on the
          notebook-level train_all_final / train_y_scaled frames and the
          combination with the lowest RMSE is kept.

    PARAMS:
        * neurons               --> Hidden layer sizes to try.
        * epoches               --> Epoch counts to try.
        * etas                  --> Values forwarded to crossval_rmse as `et`.
        * activation_functions  --> Hidden layer activation names to try.

    RETURNS:
        [neuron, activation_function, epoch, eta, rmse] for the best
        combination, or an empty list when nothing scored below infinity.
    """
    # Flatten the grid up front; the iteration order matches nested loops over
    # neurons -> activation functions -> epochs -> etas.
    grid = [(size, activation, n_epochs, rate)
            for size in neurons
            for activation in activation_functions
            for n_epochs in epoches
            for rate in etas]

    best_rmse = np.inf
    best_setting = []

    for size, activation, n_epochs, rate in grid:
        print([size, activation, n_epochs, rate])

        # A fresh network is built for every candidate setting.
        candidate = NeuralNetwork(len(train_all_final.columns)).add_layer(size, activation).add_layer(1, "identity", bias = False)
        score = crossval_rmse(candidate, train_all_final, train_y_scaled, yScaler, ep=n_epochs, cv = 4, et = rate)
        print(score)

        if not score < best_rmse:
            continue
        best_rmse = score
        best_setting = [size, activation, n_epochs, rate, score]

    return best_setting

In [None]:
# Run the grid search with its default grid; the cell displays the returned
# [neuron, activation_function, epoch, eta, rmse] list of the best setting.
optimize_NN()

In [26]:
# Estimate model4's RMSE in original income units over 4 random splits.
# NOTE(review): model4 was already fitted in an earlier cell and the same
# instance is passed in here; whether NeuralNetwork.fit restarts from fresh
# weights is not visible in this notebook -- confirm before trusting the score.
crossval_rmse(model4, train_all_final, train_y_scaled, yScaler, ep=10, cv = 4, et = 0.001)

Training on fold  0
Inputs have been converted. Training starting now.
Epoch 0 	Training cost: 0.22148049221
Epoch 0 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.0% 
Epoch 1 	Training cost: 0.221381831282
Epoch 1 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.0% 
Epoch 2 	Training cost: 0.222874468988
Epoch 2 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.0% 
Epoch 3 	Training cost: 0.217777994219
Epoch 3 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.0% 
Epoch 4 	Training cost: 0.217754473167
Epoch 4 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.0% 
Epoch 5 	Training cost: 0.218400392662
Epoch 5 |████████████████████████████████████████████████████████████████████████████████████████████

135120.68345387612

In [None]:
# Predict with the Neural Network against the test data.
# model4 is the network fitted on the train/test common columns, so it is the
# model whose input width matches test_all_final. The previous call targeted
# the `nn` module rather than a fitted model.
predicted_y_scaled_test = model4.predict(test_all_final)
# Undo the response scaling. The scaler works on 2-D input, so the predictions
# are shaped as one column for the inverse transform and flattened afterwards.
predicted_y_test = yScaler.inverse_transform(predicted_y_scaled_test.reshape(-1, 1)).ravel()

### Functions

In [9]:
def interact_all(df, interact_categorical_df, continuous_vars=None):
    """
    DESCRIPTION:
        * Adds one interaction column per (continuous, categorical dummy) pair.
          Each new column is the element-wise product of the two source columns
          of df and is named "interatct_<continuous>_<categorical>".
          The input frame is left untouched; a new frame is returned.

    PARAMS:
        * df                       --> Pandas DataFrame holding both the continuous columns
                                       and the dummy columns to multiply.
        * interact_categorical_df  --> Pandas DataFrame whose column NAMES select the dummy
                                       columns of df to interact with (its values are not read).
        * continuous_vars          --> Names of the continuous columns to interact.
                                       Defaults to the notebook-level interact_continuous_vars.

    RETURNS:
        Copy of df with the interaction columns appended.
    """
    if continuous_vars is None:
        # Keep the original behaviour for existing callers.
        continuous_vars = interact_continuous_vars

    # Work on a copy so that re-running a cell never changes the caller's frame
    # (the original wrote the new columns straight into the argument).
    result = df.copy()
    for continuous in continuous_vars:
        for categorical in interact_categorical_df.columns:
            interation_name = "interatct_" + continuous + "_" + categorical
            result[interation_name] = df[continuous] * df[categorical]

    return result

In [10]:
def transform_x(df, interact_categorical_df):
    """
    DESCRIPTION:
        * Feature-engineering entry point. At present it only delegates to
          interact_all to append the interaction columns.

    PARAMS:
        * df                       --> Pandas DataFrame of model features.
        * interact_categorical_df  --> Pandas DataFrame whose columns name the dummies to interact.

    RETURNS:
        Whatever interact_all returns for the same arguments.
    """
    return interact_all(df, interact_categorical_df)

In [11]:
def dummyify(df, 
             cols_to_dummy = None
            ):
    """
    DESCRIPTION:
        * Converts categorical variables in a Pandas DataFrame to dummy/indicator variables.
          For every column the first dummy level is dropped to act as the modeling baseline,
          and the original categorical column is removed.
          Dummy columns are named "<column>_<level>" and appended after the remaining
          columns, in the order given by cols_to_dummy.
          The input frame is not modified.

    PARAMS:
        * df             --> Pandas DataFrame containing data.
        * cols_to_dummy  --> List of categorical column names to convert. Required.

    RETURNS:
        Pandas DataFrame with the categorical variables converted to dummy variables.

    RAISES:
        ValueError when cols_to_dummy is missing or empty.
    """

    if not cols_to_dummy:
        raise ValueError("Please provide column to dummy")

    encoded = []
    for col in cols_to_dummy:
        dummies = pd.get_dummies( df[col] )

        # Prefix with the source column so levels of different variables cannot clash
        # (levels may be non-string, hence the str()).
        dummies.columns = [col + '_' + str(level) for level in dummies.columns]

        # The first level is the baseline; slicing also copes with a column that
        # produced no dummy levels at all.
        encoded.append( dummies.iloc[:, 1:] )

    # One concat at the end instead of re-concatenating the whole frame per column.
    remaining = df.drop( list(cols_to_dummy), axis=1 )
    return pd.concat( [remaining] + encoded, axis=1 )

In [12]:
def rank_hot_encode(df, 
                    cols_to_encode = None
                   ):
    """
    DESCRIPTION:
        * Rank-hot (thermometer) encodes ordinal variables in a Pandas DataFrame.
          For every level except the lowest one a column "<column>_<level>" is created
          that is 1 when the row's value is at or above that level and 0 otherwise,
          so a row holding the lowest level (or a missing value) is all zeros.
          Drops the original ordinal column. The input frame is not modified.

    PARAMS:
        * df              --> Pandas DataFrame containing data (any index).
        * cols_to_encode  --> List of ordinal column names to encode.
                              When omitted or empty, df is returned unchanged.

    RETURNS:
        Pandas DataFrame with the ordinal variables replaced by rank-hot columns,
        appended after the remaining columns in the order of cols_to_encode.
    """
    cols_to_encode = list(cols_to_encode) if cols_to_encode is not None else []
    if not cols_to_encode:
        return df

    encoded = []
    for col_to_encode in cols_to_encode:
        # One-hot with the lowest level dropped; columns come out in sorted level order.
        dummied = pd.get_dummies(df[col_to_encode], drop_first = True)
        one_hot = dummied.values.astype(int)

        # A right-to-left cumulative sum turns the single 1 of each row into a run of
        # 1s covering that level and every lower one. This replaces the per-row Python
        # loop, whose writes went to a row object that may be a copy and which assumed
        # a 0..n-1 index.
        rank_hot = np.cumsum(one_hot[:, ::-1], axis=1)[:, ::-1]

        new_cols = [col_to_encode + '_' + str(old_col) for old_col in dummied.columns]
        encoded.append(pd.DataFrame(rank_hot, index=dummied.index, columns=new_cols))

    remaining = df.drop( cols_to_encode, axis=1 )
    return pd.concat( [remaining] + encoded, axis=1 )

In [13]:
def crossval_rmse(nn,
                  x,
                  y,
                  y_scaler,
                  cv = 5,
                  ep = 30,
                  et = 0.0001):
    """
    DESCRIPTION:
        * Scores a network with repeated random train/test splits (ShuffleSplit,
          random_state=0). On every split the network is fitted on the training rows,
          the held-out rows are predicted, predictions and targets are mapped back to
          the original scale with y_scaler, and the mean squared error is recorded.

    PARAMS:
        * nn        --> Model object exposing fit(x, y, eta=, epochs=, l2_value=) and predict(x).
        * x         --> Pandas DataFrame of features.
        * y         --> Pandas DataFrame holding the scaled response, row-aligned with x.
        * y_scaler  --> Fitted scaler whose inverse_transform restores the response scale.
        * cv        --> Number of splits; each split holds out a 1/cv share of the rows.
        * ep        --> Passed to nn.fit as `epochs`.
        * et        --> Passed to nn.fit as `eta`.

    RETURNS:
        Square root of the average per-split MSE, in the original response units.
    """
    folds = ShuffleSplit(n_splits=cv, test_size=1.0/cv, random_state=0)

    mse = []
    # NOTE(review): the same `nn` instance is fitted on every split. Unless
    # NeuralNetwork.fit re-initialises its weights, later splits start from a model
    # that has already seen their held-out rows -- confirm against nn.py.
    for i, (train_idx, test_idx) in enumerate(folds.split(x)):
        print("Training on fold " , i)

        # ShuffleSplit yields POSITIONS, so select with iloc; label-based .loc only
        # happened to work while the frames carried a default 0..n-1 index.
        train_x = x.iloc[train_idx]
        train_y = y.iloc[train_idx]
        test_x = x.iloc[test_idx]
        test_y = y_scaler.inverse_transform(y.iloc[test_idx])

        # `.values` replaces the deprecated DataFrame.as_matrix().
        nn.fit(train_x.astype(float).values, pd.DataFrame(train_y).astype(float).values, eta = et, epochs = ep, l2_value=5)

        pred = y_scaler.inverse_transform(nn.predict(test_x))
        mse.append(mean_squared_error(test_y, pred))

    return np.sqrt(np.average(mse))

In [14]:
def to_submission(pred, path='submission_NN.csv'):
    """
    DESCRIPTION:
        * Writes a two-column submission CSV ('Index', 'Total Household Income')
          pairing the 'Index' column of the notebook-level test_data with the
          given predictions. test_data itself is not modified.

    PARAMS:
        * pred  --> Predictions in the original income scale, one per row of test_data.
                    A column vector of shape (n, 1) is flattened automatically.
        * path  --> Destination CSV file. Defaults to 'submission_NN.csv'.

    RETURNS:
        None. The result is the file written to `path`.
    """
    # Build a separate frame instead of adding the prediction column to the shared
    # test_data (the original aliased test_data and mutated it in place).
    submission = test_data[['Index']].copy()
    submission['Total Household Income'] = np.ravel(pred)
    submission.to_csv(path, index=False)
    return