In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
income = pd.read_csv('Family Income and Expenditure.csv')

In [3]:
income.head()

Unnamed: 0,Total Household Income,Region,Total Food Expenditure,Main Source of Income,Agricultural Household indicator,Bread and Cereals Expenditure,Total Rice Expenditure,Meat Expenditure,Total Fish and marine products Expenditure,Fruit Expenditure,...,Number of Refrigerator/Freezer,Number of Washing Machine,Number of Airconditioner,"Number of Car, Jeep, Van",Number of Landline/wireless telephones,Number of Cellular phone,Number of Personal Computer,Number of Stove with Oven/Gas Range,Number of Motorized Banca,Number of Motorcycle/Tricycle
0,480332,CAR,117848,Wage/Salaries,0,42140,38300,24676,16806,3325,...,1,1,0,0,0,2,1,0,0,1
1,198235,CAR,67766,Wage/Salaries,0,17329,13008,17434,11073,2035,...,0,1,0,0,0,3,1,0,0,2
2,82785,CAR,61609,Wage/Salaries,1,34182,32001,7783,2590,1730,...,0,0,0,0,0,0,0,0,0,0
3,107589,CAR,78189,Wage/Salaries,0,34030,28659,10914,10812,690,...,0,0,0,0,0,1,0,0,0,0
4,189322,CAR,94625,Wage/Salaries,0,34820,30167,18391,11309,1395,...,1,0,0,0,0,3,0,0,0,1


In [4]:
income.columns

Index(['Total Household Income', 'Region', 'Total Food Expenditure',
       'Main Source of Income', 'Agricultural Household indicator',
       'Bread and Cereals Expenditure', 'Total Rice Expenditure',
       'Meat Expenditure', 'Total Fish and  marine products Expenditure',
       'Fruit Expenditure', 'Vegetables Expenditure',
       'Restaurant and hotels Expenditure', 'Alcoholic Beverages Expenditure',
       'Tobacco Expenditure', 'Clothing, Footwear and Other Wear Expenditure',
       'Housing and water Expenditure', 'Imputed House Rental Value',
       'Medical Care Expenditure', 'Transportation Expenditure',
       'Communication Expenditure', 'Education Expenditure',
       'Miscellaneous Goods and Services Expenditure',
       'Special Occasions Expenditure', 'Crop Farming and Gardening expenses',
       'Total Income from Entrepreneurial Acitivites', 'Household Head Sex',
       'Household Head Age', 'Household Head Marital Status',
       'Household Head Highest Grade Compl

In [5]:
y = income['Total Household Income']

In [6]:
features = ['Region','Total Food Expenditure','Main Source of Income','Household Head Sex','Household Head Age', 'Household Head Marital Status',
       'Household Head Highest Grade Completed', 'Household Head Occupation','Type of Household',
       'Total Number of Family members','Total number of family members employed']

In [7]:
X = income[features]

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [11]:
categorical_col = [cname for cname in X_train.columns if X_train[cname].nunique()<10 and X_train[cname].dtype == 'object']

In [12]:
numerical_col = [cname for cname in X_train.columns if X_train[cname].dtype in ['float64','float32']]

In [13]:
categorical_transform = Pipeline(steps = [
    ('impute', SimpleImputer(strategy = 'most_frequent')),
    ('one_hot', OneHotEncoder(handle_unknown = 'ignore', sparse = False))
])

In [14]:
numerical_transform = SimpleImputer(strategy = 'constant')

In [20]:
preprocessing = ColumnTransformer(transformers=[
    ('cat', categorical_transform, categorical_col),
    ('num', numerical_transform ,numerical_col)
])

In [21]:
from sklearn.ensemble import RandomForestRegressor

In [22]:
from sklearn.metrics import mean_absolute_error

In [25]:
def score(n, X_train, X_test, y_train, y_test):
    model = RandomForestRegressor(n_estimators=n, random_state=42)
    my_pip = Pipeline(steps = [('prepros', preprocessing),('model', model)])
    my_pip.fit(X_train, y_train)
    pred = my_pip.predict(X_test)
    mae = mean_absolute_error(pred, y_test)
    return mae

In [26]:
for n in [5,10,50,100,500]:
    my_score = score(n, X_train, X_test, y_train, y_test)
    print(n, ' : ', my_score)

5  :  151519.13711864836
10  :  151470.304679283
50  :  151567.33721419045
100  :  151559.94334557548
500  :  151549.1181045742
