In [50]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# Viz
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Feature Engineering
from sklearn.preprocessing import StandardScaler

# Model Storage
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone `joblib` package and only fall back to the
# vendored copy on old installations. The bound name (`joblib`) is unchanged.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NN
import keras as k
from keras.models import Sequential
from keras.layers import Dense, Activation

##
from nn import NeuralNetwork
import nn 

In [51]:
# Read both splits from disk, then report the row count of each one.
train_path = "/data/project2/train.csv"
test_path = "/data/project2/test.csv"

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

for label, frame in (("Train Data Rows: ", train_data), ("Test Data Rows: ", test_data)):
    print(label, len(frame))

Train Data Rows:  33235
Test Data Rows:  8309


In [52]:
# Show the first two training rows as a quick check of the parsed columns.
train_data.iloc[:2]

Unnamed: 0,Index,Region,Total Food Expenditure,Main Source of Income,Agricultural Household indicator,Bread and Cereals Expenditure,Total Rice Expenditure,Meat Expenditure,Total Fish and marine products Expenditure,Fruit Expenditure,Vegetables Expenditure,Restaurant and hotels Expenditure,Alcoholic Beverages Expenditure,Tobacco Expenditure,"Clothing, Footwear and Other Wear Expenditure",Housing and water Expenditure,Imputed House Rental Value,Medical Care Expenditure,Transportation Expenditure,Communication Expenditure,Education Expenditure,Miscellaneous Goods and Services Expenditure,Special Occasions Expenditure,Crop Farming and Gardening expenses,Total Income from Entrepreneurial Acitivites,Household Head Sex,Household Head Age,Household Head Marital Status,Household Head Highest Grade Completed,Household Head Job or Business Indicator,Household Head Occupation,Household Head Class of Worker,Type of Household,Total Number of Family members,Members with age less than 5 year old,Members with age 5 - 17 years old,Total number of family members employed,Type of Building/House,Type of Roof,Type of Walls,House Floor Area,House Age,Number of bedrooms,Tenure Status,Toilet Facilities,Electricity,Main Source of Water Supply,Number of Television,Number of CD/VCD/DVD,Number of Component/Stereo set,Number of Refrigerator/Freezer,Number of Washing Machine,Number of Airconditioner,"Number of Car, Jeep, Van",Number of Landline/wireless telephones,Number of Cellular phone,Number of Personal Computer,Number of Stove with Oven/Gas Range,Number of Motorized Banca,Number of Motorcycle/Tricycle,Total Household Income
0,22617,CAR,81940,Enterpreneurial Activities,1,44171,40336,9053,4499,5245,6625,2410,145,0,1769,12300,3600,544,2700,636,0,5466,5150,22300,46100,Male,63,Married,Elementary Graduate,With Job/Business,Carpenters and joiners,Worked for private establishment,Extended Family,6,2,1,2,Single house,"Strong material(galvanized,iron,al,tile,concre...",Strong,342,30,2,Own or owner-like possession of house and lot,"Water-sealed, sewer septic tank, used exclusiv...",1,"Protected spring, river, stream, etc",0,0,0,0,1,0,0,0,2,0,0,0,0,115835
1,21389,V - Bicol Region,26176,Other sources of Income,0,14477,13067,722,3707,755,1560,260,280,235,1725,5502,3600,813,228,138,0,4722,0,0,5460,Male,73,Married,No Grade Completed,With Job/Business,Inland and coastal waters fishermen,Self-employed wihout any employee,Single Family,2,0,0,1,Single house,"Light material (cogon,nipa,anahaw)",Light,20,3,0,"Own house, rent-free lot with consent of owner","Water-sealed, sewer septic tank, shared with o...",0,"Own use, tubed/piped deep well",0,0,0,0,0,0,0,0,0,0,0,0,0,44339


In [53]:
# List the columns pandas parsed as `object` dtype (the text-valued fields).
# The filter was previously called `numerics`, which said the opposite of what
# it selects, so both names now describe what they actually hold.
object_dtypes = ['object']

train_object_cols = train_data.select_dtypes(include=object_dtypes)
train_object_cols.columns

Index(['Region', 'Main Source of Income', 'Household Head Sex',
       'Household Head Marital Status',
       'Household Head Highest Grade Completed',
       'Household Head Job or Business Indicator', 'Household Head Occupation',
       'Household Head Class of Worker', 'Type of Household',
       'Type of Building/House', 'Type of Roof', 'Type of Walls',
       'Tenure Status', 'Toilet Facilities', 'Main Source of Water Supply'],
      dtype='object')

In [54]:
# Column the model is asked to predict.
response_variable = 'Total Household Income'

# Monetary amounts and floor area, treated as continuous features.
# NOTE: the double space in 'Total Fish and  marine products Expenditure' is
# deliberate -- it has to match the column label in the CSV exactly.
continuous_vars = [
    'Total Food Expenditure',
    'Bread and Cereals Expenditure',
    'Total Rice Expenditure',
    'Meat Expenditure',
    'Total Fish and  marine products Expenditure',
    'Fruit Expenditure',
    'Vegetables Expenditure',
    'Restaurant and hotels Expenditure',
    'Alcoholic Beverages Expenditure',
    'Tobacco Expenditure',
    'Clothing, Footwear and Other Wear Expenditure',
    'Housing and water Expenditure',
    'Imputed House Rental Value',
    'Medical Care Expenditure',
    'Transportation Expenditure',
    'Communication Expenditure',
    'Education Expenditure',
    'Miscellaneous Goods and Services Expenditure',
    'Special Occasions Expenditure',
    'Crop Farming and Gardening expenses',
    'Total Income from Entrepreneurial Acitivites',
    'House Floor Area',
]

# Unordered category labels (plus the agricultural-household flag).
nominal_vars = [
    'Agricultural Household indicator',
    'Region',
    'Main Source of Income',
    'Household Head Sex',
    'Household Head Marital Status',
    'Household Head Highest Grade Completed',
    'Household Head Job or Business Indicator',
    'Household Head Occupation',
    'Household Head Class of Worker',
    'Type of Household',
    'Type of Building/House',
    'Type of Roof',
    'Type of Walls',
    'Tenure Status',
    'Toilet Facilities',
    'Main Source of Water Supply',
]

# Flags kept in their own group.
binary_vars = [
    'Electricity',
]

# Ages and counts, grouped here as ordinal features.
ordinal_vars = [
    'Household Head Age',
    'Number of bedrooms',
    'House Age',
    'Number of Television',
    'Number of CD/VCD/DVD',
    'Total Number of Family members',
    'Number of Component/Stereo set',
    'Number of Refrigerator/Freezer',
    'Number of Washing Machine',
    'Number of Airconditioner',
    'Number of Car, Jeep, Van',
    'Number of Landline/wireless telephones',
    'Number of Cellular phone',
    'Number of Personal Computer',
    'Number of Stove with Oven/Gas Range',
    'Number of Motorized Banca',
    'Number of Motorcycle/Tricycle',
    'Members with age less than 5 year old',
    'Members with age 5 - 17 years old',
    'Total number of family members employed',
]

In [55]:
# Slice the training frame into its feature families and the target column.
train_continuous = train_data[continuous_vars]
# "Categorical" here is everything that is not continuous:
# nominal + binary + ordinal columns, in that order.
categorical_vars = nominal_vars + binary_vars + ordinal_vars
train_categorical = train_data[categorical_vars]
train_ordinal = train_data[ordinal_vars]
train_y = train_data[response_variable]

# Only the continuous features are taken from the test split in this cell.
test_continuous = test_data[continuous_vars]

n_continuous = train_continuous.shape[1]
print('Continuous Set has ' + str(n_continuous) + ' columns')

Continuous Set has 22 columns


In [47]:
# Standardise the continuous features. The scaler is fitted on the training
# split only and then re-used, unchanged, to transform the test split.
# Inputs are cast to float up front so the scaler never sees integer columns.
xScaler = StandardScaler()
train_continuous_scaled = pd.DataFrame(xScaler.fit_transform(train_continuous.astype(float)))
test_continuous_scaled = pd.DataFrame(xScaler.transform(test_continuous.astype(float)))

# StandardScaler requires a 2-D (n_samples, n_features) input. `train_y` is a
# 1-D Series, so it is turned into a one-column frame first; passing the
# Series directly raises "Expected 2D array, got 1D array" on current
# scikit-learn releases.
yScaler = StandardScaler()
train_y_scaled = pd.DataFrame(yScaler.fit_transform(train_y.to_frame().astype(float)))



In [48]:
# Build the network. The input width is read from the scaled feature frame
# instead of being hard-coded, so it follows any change to `continuous_vars`.
# NOTE(review): binding the instance to `nn` shadows the `nn` module imported
# at the top of the notebook; re-running the import cell rebinds `nn` to the
# module and breaks the cells below. Consider renaming across the notebook.
# The chained add_layer calls presumably stack layers of 30, 20 and 10 units
# followed by a single "identity" output without bias -- confirm against nn.py.
nn = NeuralNetwork(train_continuous_scaled.shape[1])
nn.add_layer(30).add_layer(20).add_layer(10).add_layer(1, "identity", bias = False)

<nn.NeuralNetwork at 0x7f11f79897b8>

In [49]:
# Train on the standardised features and target as plain float arrays.
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0;
# `train_y_scaled` is already a DataFrame, so it is not re-wrapped.
# NOTE(review): the recorded run of this cell ended in
# "OverflowError: Python int too large to convert to C long". That is raised
# from inside NeuralNetwork.fit, whose code is not part of this notebook, so it
# still needs to be investigated in nn.py.
nn.fit(
    train_continuous_scaled.values.astype(float),
    train_y_scaled.values.astype(float),
    eta = 0.01,
    epochs = 10,
)

OverflowError: Python int too large to convert to C long

In [28]:
# Predict on the standardised test features, passing the same kind of input
# (a plain float array) that nn.fit received rather than a DataFrame.
# NOTE(review): the network was fitted against `train_y_scaled`, so these
# predictions are presumably in standardised units; apply
# yScaler.inverse_transform before reading them as incomes -- confirm.
x = nn.predict(test_continuous_scaled.values.astype(float))

In [30]:
# Dump the predictions returned by nn.predict for a quick visual check.
print(x)

[[0]
 [0]
 [0]
 ..., 
 [0]
 [0]
 [0]]
