# Filipino Housing Regression
## DATA 401

In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# Models
from sklearn.linear_model import LinearRegression, LassoCV

# Model Selection
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Model Storage
from sklearn.externals import joblib

In [5]:
# Load the pre-split training and hold-out sets from the shared project directory.
train_data = pd.read_csv("/data/project2/train.csv")
test_data = pd.read_csv("/data/project2/test.csv")

# Report the row count of each split as a quick sanity check on the load.
print("Train Data Rows: ", train_data.shape[0])
print("Test Data Rows: ", test_data.shape[0])

Train Data Rows:  33235
Test Data Rows:  8309


In [4]:
# Preview the first five training rows to inspect the available columns.
train_data.head(n=5)

Unnamed: 0,Index,Region,Total Food Expenditure,Main Source of Income,Agricultural Household indicator,Bread and Cereals Expenditure,Total Rice Expenditure,Meat Expenditure,Total Fish and marine products Expenditure,Fruit Expenditure,Vegetables Expenditure,Restaurant and hotels Expenditure,Alcoholic Beverages Expenditure,Tobacco Expenditure,"Clothing, Footwear and Other Wear Expenditure",Housing and water Expenditure,Imputed House Rental Value,Medical Care Expenditure,Transportation Expenditure,Communication Expenditure,Education Expenditure,Miscellaneous Goods and Services Expenditure,Special Occasions Expenditure,Crop Farming and Gardening expenses,Total Income from Entrepreneurial Acitivites,Household Head Sex,Household Head Age,Household Head Marital Status,Household Head Highest Grade Completed,Household Head Job or Business Indicator,Household Head Occupation,Household Head Class of Worker,Type of Household,Total Number of Family members,Members with age less than 5 year old,Members with age 5 - 17 years old,Total number of family members employed,Type of Building/House,Type of Roof,Type of Walls,House Floor Area,House Age,Number of bedrooms,Tenure Status,Toilet Facilities,Electricity,Main Source of Water Supply,Number of Television,Number of CD/VCD/DVD,Number of Component/Stereo set,Number of Refrigerator/Freezer,Number of Washing Machine,Number of Airconditioner,"Number of Car, Jeep, Van",Number of Landline/wireless telephones,Number of Cellular phone,Number of Personal Computer,Number of Stove with Oven/Gas Range,Number of Motorized Banca,Number of Motorcycle/Tricycle,Total Household Income
0,22617,CAR,81940,Enterpreneurial Activities,1,44171,40336,9053,4499,5245,6625,2410,145,0,1769,12300,3600,544,2700,636,0,5466,5150,22300,46100,Male,63,Married,Elementary Graduate,With Job/Business,Carpenters and joiners,Worked for private establishment,Extended Family,6,2,1,2,Single house,"Strong material(galvanized,iron,al,tile,concre...",Strong,342,30,2,Own or owner-like possession of house and lot,"Water-sealed, sewer septic tank, used exclusiv...",1,"Protected spring, river, stream, etc",0,0,0,0,1,0,0,0,2,0,0,0,0,115835
1,21389,V - Bicol Region,26176,Other sources of Income,0,14477,13067,722,3707,755,1560,260,280,235,1725,5502,3600,813,228,138,0,4722,0,0,5460,Male,73,Married,No Grade Completed,With Job/Business,Inland and coastal waters fishermen,Self-employed wihout any employee,Single Family,2,0,0,1,Single house,"Light material (cogon,nipa,anahaw)",Light,20,3,0,"Own house, rent-free lot with consent of owner","Water-sealed, sewer septic tank, shared with o...",0,"Own use, tubed/piped deep well",0,0,0,0,0,0,0,0,0,0,0,0,0,44339
2,25275,III - Central Luzon,139920,Enterpreneurial Activities,0,31053,21720,12005,10415,1090,6355,65642,210,2715,4050,29520,12000,1734,24846,2004,5460,7578,4500,0,167900,Male,48,Married,High School Graduate,With Job/Business,General managers/managing proprietors in trans...,Self-employed wihout any employee,Single Family,5,0,4,0,Single house,"Strong material(galvanized,iron,al,tile,concre...",Strong,75,22,0,Own or owner-like possession of house and lot,"Water-sealed, sewer septic tank, used exclusiv...",1,"Own use, faucet, community water system",1,0,0,0,0,0,0,0,1,0,0,0,1,223010
3,25298,III - Central Luzon,108931,Wage/Salaries,0,41374,29008,12250,11840,1455,3551,22000,1350,9050,4970,38160,18000,3988,10638,3888,1850,17178,3680,0,0,Male,39,Married,Second Year High School,With Job/Business,Welders and flamecutters,Worked for private establishment,Extended Family,8,0,5,3,Single house,"Strong material(galvanized,iron,al,tile,concre...",Strong,180,10,1,Own or owner-like possession of house and lot,"Water-sealed, sewer septic tank, used exclusiv...",1,"Own use, faucet, community water system",1,0,0,0,1,0,0,0,2,0,1,0,0,224252
4,7780,V - Bicol Region,61539,Other sources of Income,1,23758,19656,2496,12446,2448,5680,2540,0,8320,3935,21222,12000,12738,8478,648,500,3300,0,50790,50790,Male,44,Married,High School Graduate,With Job/Business,Farmhands and laborers,Worked for private establishment,Single Family,5,1,2,1,Single house,"Strong material(galvanized,iron,al,tile,concre...",Strong,48,1,2,"Own house, rent-free lot with consent of owner","Water-sealed, sewer septic tank, used exclusiv...",1,"Shared, tubed/piped deep well",1,1,0,0,1,0,0,0,1,0,0,0,1,125863


## Feature Engineering

## Model Selection

## Prediction and Output

## Functions

In [None]:
def dummyify(df, 
             col_to_dummy = '',
             col_to_drop = ''):
    """
    DESCRIPTION:
        * Converts a categorical variable in a Pandas DataFrame to dummy/indicator variables.
          Drops one of the dummy columns to create our baseline for modeling. 
          Drops original column containing categorical variable.
          The input DataFrame is not modified; a new DataFrame is returned.
        
    PARAMS:
        * df            --> Pandas DataFrame containing data.
        * col_to_dummy  --> The name of the categorical variable to convert to a dummy variable.
        * col_to_drop   --> The name of one of the dummy variable columns to drop. 
                            Must be one of the levels of col_to_dummy.
                            Defaults to the first dummy column generated from col_to_dummy.
        
    RAISES:
        * ValueError    --> If col_to_dummy is not provided.
        * KeyError      --> If col_to_dummy is not a column of df, or if col_to_drop is not
                            one of the dummy columns generated from col_to_dummy.
        
    RETURNS:
        Pandas DataFrame with the categorical variable converted to dummy variables.
        Column order is: the remaining original columns, followed by the kept dummy columns.
    """
    
    if col_to_dummy == '':
        raise ValueError("Please provide column to dummy")
        
    dummies = pd.get_dummies( df[col_to_dummy] )
    if col_to_drop == '':
        col_to_drop = dummies.columns[0]
    elif col_to_drop not in dummies.columns:
        # Without this check a typo (or the name of an unrelated feature column)
        # would silently remove a real column from the data instead of a dummy.
        raise KeyError("%r is not a level of column %r" % (col_to_drop, col_to_dummy))
        
    # Drop the baseline from the dummies and the source column from the data
    # *before* concatenating. Dropping by label after the concat would also
    # remove any pre-existing column that happens to share a name with the
    # baseline level (e.g. a dummy created earlier from another variable).
    dummies = dummies.drop( [col_to_drop], axis=1 )
    remaining = df.drop( [col_to_dummy], axis=1 )
    
    return pd.concat( [remaining, dummies], axis=1 )

In [None]:
def expand_var(df, 
               variable_names = [],
               n_expansion = 3):
    """
    DESCRIPTION:
        * Expands quantitative variable(s) in a Pandas DataFrame up to the nth power.
          Creates a new column named "<variable>^<power>" for each power from 2 to
          n_expansion (inclusive) of each variable.
          NOTE: the new columns are added to df in place; the same object is returned.
        
    PARAMS:
        * df              --> Pandas DataFrame containing variable(s) to expand.
        * variable_names  --> A sequence specifying the name(s) of the quantitative variable(s)
                              to expand. A single column name given as a string is also accepted.
        * n_expansion     --> The degree to which we will expand our variable(s) up to.
                              Values below 2 add no columns.
        
    RAISES:
        * ValueError      --> If no variable name is specified.
        * KeyError        --> If a specified variable is not a column of df.
        
    RETURNS:
        Pandas DataFrame with the variable expanded to the specified term.
    """
    
    # Normalise the input to a fresh list: this never touches the shared default
    # list, treats a bare string as one column name rather than a sequence of
    # characters, and makes the emptiness check work for any sequence type.
    if variable_names is None:
        variable_names = []
    elif isinstance(variable_names, str):
        variable_names = [variable_names] if variable_names else []
    else:
        variable_names = list(variable_names)
    
    if len(variable_names) == 0:
        raise ValueError("Please specify variable_name to expand")
    
    for variable in variable_names:
        for i in range(2, n_expansion+1): 
            # Build the name with str.format rather than the % operator so that
            # column names containing a literal '%' are not parsed as format codes.
            expanded_name = "{}^{}".format(variable, i)
            df[expanded_name] = df[variable] ** i
        
    return df