In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from scipy import stats
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn import linear_model
from sklearn.linear_model import SGDRegressor
from category_encoders import TargetEncoder
from sklearn.feature_selection import SelectKBest, f_regression
import xgboost
from xgboost import XGBRegressor

In [2]:
instance = None
target = 'Total Yearly Income [EUR]'
training_columns = [
    'Year of Record',
    'Age',
    'Gender',
    'Country',
    'Size of City',
    'Housing Situation',
    'University Degree',
    'Crime Level in the City of Employement',
    'Work Experience in Current Job [years]',
    'Satisfation with employer',
    'Yearly Income in addition to Salary (e.g. Rental Income)',
    'Wears Glasses',
    'Hair Color',
    'Profession',
    'Body Height [cm]',
    target
]

In [3]:
def read_data():
    training_df = pd.read_csv('./tcd-ml-comp-201920-income-pred-group/tcd-ml-1920-group-income-train.csv')
    test_df = pd.read_csv('./tcd-ml-comp-201920-income-pred-group/tcd-ml-1920-group-income-test.csv')
    instance = test_df['Instance']
    return training_df, test_df

In [4]:
def preprocessing(df):
    df = df[training_columns]
    preprocess_gender(df)
    preprocess_year_of_record(df)
    preprocess_age(df)
    preprocess_university_degree(df)
    preprocess_housing_situation(df)
    preprocess_work_experience(df)
    preprocess_satisfaction_with_employer(df)
    preprocess_extra_income(df)
    preprocess_crime_level(df)
    preprocess_haircolor(df)
    preprocess_profession(df)
    drop_wears_glasses(df)
    return df  

In [5]:
def preprocess_profession(df):
    df['Profession'].fillna('unknown', inplace=True)

In [6]:
def preprocess_gender(df):
    df['Gender'].replace('0', 'unknown', inplace = True)
    df['Gender'].fillna('unknown', inplace=True)

In [7]:
def preprocess_year_of_record(df):
    df['Year of Record'].fillna(df['Year of Record'].median(), inplace=True)

In [8]:
def preprocess_age(df):
    df['Age'].fillna(df['Age'].median(), inplace=True)

In [9]:
def preprocess_university_degree(df):
    df["University Degree"].replace('0', 'No', inplace=True)
    df['University Degree'].fillna('unknown', inplace=True)

In [10]:
def preprocess_housing_situation(df):
    df["Housing Situation"].replace('0', 'unknown', inplace=True)
    df["Housing Situation"].replace(0, 'unknown', inplace=True)
    df["Housing Situation"].replace('nA', 'unknown', inplace=True)
    df['Housing Situation'].fillna('unknown', inplace=True)

In [11]:
def preprocess_work_experience(df):
    df['Work Experience in Current Job [years]'].replace('#NUM!', '0', inplace=True)
    work_ex = [float(x) for x in df['Work Experience in Current Job [years]']]
    df['Work Experience in Current Job [years]'] = work_ex
    df['Work Experience in Current Job [years]'].fillna(df['Work Experience in Current Job [years]'].median(), inplace=True)

In [12]:
def preprocess_satisfaction_with_employer(df):
    df['Satisfation with employer'].fillna('unknown', inplace=True)

In [13]:
def preprocess_extra_income(df):
    extra = [x.replace(' EUR', '') for x in df['Yearly Income in addition to Salary (e.g. Rental Income)']]
    numerical_extra = [float(x) for x in extra]
    df['Yearly Income in addition to Salary (e.g. Rental Income)'] = numerical_extra
    df['Yearly Income in addition to Salary (e.g. Rental Income)'].fillna(df['Yearly Income in addition to Salary (e.g. Rental Income)'].median(), inplace=True)

In [14]:
def preprocess_crime_level(df):
    df['Crime Level in the City of Employement'].fillna(df['Crime Level in the City of Employement'].median(), inplace=True)

In [15]:
def preprocess_haircolor(df):
    df['Hair Color'].fillna('unknown', inplace=True)
    df.drop(['Hair Color'], axis=1, inplace=True)

In [16]:
def drop_wears_glasses(df):
    df.drop(['Wears Glasses'], axis=1, inplace=True)

In [17]:
def target_encode(train, test, columns):
    enc = TargetEncoder(columns)
    enc.fit(train, test)
    train = enc.transform(train)
    test = enc.transform(test)

# Read Training and Test Data

In [18]:
training_df, test_df = read_data()

  if (yield from self.run_code(code, result)):
  if (yield from self.run_code(code, result)):


# Preprocess the data

In [19]:
training_data_preprocessed = preprocessing(training_df)
test_data_preprocessed = preprocessing(test_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [20]:
training_data_preprocessed.head()

Unnamed: 0,Year of Record,Age,Gender,Country,Size of City,Housing Situation,University Degree,Crime Level in the City of Employement,Work Experience in Current Job [years],Satisfation with employer,Yearly Income in addition to Salary (e.g. Rental Income),Profession,Body Height [cm],Total Yearly Income [EUR]
0,1940.0,45,other,Afghanistan,25179,unknown,No,33,17.0,Unhappy,0.0,group head,182,6182.05
1,1940.0,17,female,Afghanistan,2278204,unknown,No,25,4.9,Unhappy,0.0,heavy vehicle and mobile equipment service tec...,172,6819.69
2,1940.0,48,female,Afghanistan,822134,unknown,Bachelor,34,21.0,Unhappy,0.0,sorter,144,8663.53
3,1940.0,42,female,Albania,59477,unknown,No,70,18.0,Average,0.0,quality control senior engineer,152,2400.64
4,1940.0,15,other,Albania,23494,unknown,Master,51,8.0,Happy,0.0,logistician,180,2816.18


In [21]:
test_data_preprocessed.head()

Unnamed: 0,Year of Record,Age,Gender,Country,Size of City,Housing Situation,University Degree,Crime Level in the City of Employement,Work Experience in Current Job [years],Satisfation with employer,Yearly Income in addition to Salary (e.g. Rental Income),Profession,Body Height [cm],Total Yearly Income [EUR]
0,1994.0,23,other,Serbia,734369,Small House,Bachelor,23,12.0,Average,0.0,quality assurance specialist,151,
1,1964.0,44,unknown,Austria,897352,Castle,Master,16,20.0,Average,0.0,student data analyst,181,
2,1974.0,21,unknown,Serbia,766,Large House,Bachelor,22,11.0,Average,0.0,project manager,179,
3,1997.0,24,male,Sierra Leone,1150488,Large Apartment,Bachelor,41,14.0,Average,0.0,staff engineer / architect,201,
4,1949.0,42,male,Ecuador,98532,unknown,No,22,17.0,Happy,0.0,machinist,163,


In [22]:
test_data_preprocessed.head()

Unnamed: 0,Year of Record,Age,Gender,Country,Size of City,Housing Situation,University Degree,Crime Level in the City of Employement,Work Experience in Current Job [years],Satisfation with employer,Yearly Income in addition to Salary (e.g. Rental Income),Profession,Body Height [cm],Total Yearly Income [EUR]
0,1994.0,23,other,Serbia,734369,Small House,Bachelor,23,12.0,Average,0.0,quality assurance specialist,151,
1,1964.0,44,unknown,Austria,897352,Castle,Master,16,20.0,Average,0.0,student data analyst,181,
2,1974.0,21,unknown,Serbia,766,Large House,Bachelor,22,11.0,Average,0.0,project manager,179,
3,1997.0,24,male,Sierra Leone,1150488,Large Apartment,Bachelor,41,14.0,Average,0.0,staff engineer / architect,201,
4,1949.0,42,male,Ecuador,98532,unknown,No,22,17.0,Happy,0.0,machinist,163,


In [23]:
y = training_data_preprocessed[target]
training_data_preprocessed.drop(target, axis=1, inplace=True)
test_data_preprocessed.drop(target, axis=1, inplace=True)

# Target Encoding

In [24]:
enc = TargetEncoder(cols=['Gender', 'Country', 'Profession', 'University Degree', 'Housing Situation', 'Satisfation with employer'])

In [25]:
enc.fit(training_data_preprocessed, y)

TargetEncoder(cols=['Gender', 'Country', 'Profession', 'University Degree', 'Housing Situation', 'Satisfation with employer'],
       drop_invariant=False, handle_missing='value',
       handle_unknown='value', min_samples_leaf=1, return_df=True,
       smoothing=1.0, verbose=0)

In [26]:
training_data_preprocessed = enc.transform(training_data_preprocessed)
test_data_preprocessed = enc.transform(test_data_preprocessed)

In [27]:
training_data_preprocessed.head()

Unnamed: 0,Year of Record,Age,Gender,Country,Size of City,Housing Situation,University Degree,Crime Level in the City of Employement,Work Experience in Current Job [years],Satisfation with employer,Yearly Income in addition to Salary (e.g. Rental Income),Profession,Body Height [cm]
0,1940.0,45,71399.961738,191828.953279,25179,9923.185537,60670.03207,33,17.0,157393.908986,0.0,55301.387315,182
1,1940.0,17,50288.77523,191828.953279,2278204,9923.185537,60670.03207,25,4.9,157393.908986,0.0,62616.40527,172
2,1940.0,48,50288.77523,191828.953279,822134,9923.185537,68112.302736,34,21.0,157393.908986,0.0,70670.102123,144
3,1940.0,42,50288.77523,135701.550567,59477,9923.185537,60670.03207,70,18.0,65225.792792,0.0,63171.291534,152
4,1940.0,15,71399.961738,135701.550567,23494,9923.185537,76785.113519,51,8.0,68667.004068,0.0,66590.907652,180


In [28]:
test_data_preprocessed.head()

Unnamed: 0,Year of Record,Age,Gender,Country,Size of City,Housing Situation,University Degree,Crime Level in the City of Employement,Work Experience in Current Job [years],Satisfation with employer,Yearly Income in addition to Salary (e.g. Rental Income),Profession,Body Height [cm]
0,1994.0,23,71399.961738,55455.05505,734369,77083.612777,68112.302736,23,12.0,65225.792792,0.0,62898.90573,151
1,1964.0,44,68594.057012,56109.05623,897352,15257.786113,76785.113519,16,20.0,65225.792792,0.0,86835.876816,181
2,1974.0,21,68594.057012,55455.05505,766,24359.294031,68112.302736,22,11.0,65225.792792,0.0,60978.797749,179
3,1997.0,24,68722.119911,54394.105047,1150488,152213.276291,68112.302736,41,14.0,65225.792792,0.0,79762.44098,201
4,1949.0,42,68722.119911,126564.649291,98532,9923.185537,60670.03207,22,17.0,68667.004068,0.0,70288.980982,163


# Training the Model

In [29]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(training_data_preprocessed, y, test_size=0.2, random_state=0)

In [40]:
reg = GradientBoostingRegressor(n_estimators=1500)

In [41]:
y_train_log = np.log(Ytrain)

In [None]:
reg.fit(Xtrain, y_train_log)

In [33]:
y_prediction = np.exp(reg.predict(Xtest))

In [34]:
y_pred = np.exp(reg.predict(Xtest))

In [35]:
y_pred

array([145709.89557578,  13197.19570631,  27506.69775102, ...,
         1134.22985715,  20427.99440345,  63108.24550342])

In [36]:
Ytest

875680     183449.86
1046905     10789.73
646861      25216.55
704385      37150.19
798051     182954.48
800217      63879.17
267711       1786.26
363947       5292.42
1018795    234854.69
980270     340943.29
883731     134002.71
85501        8097.24
705535     126357.30
886840      85100.20
454781      96340.44
166120       2618.65
326339      23813.50
1037137    342888.64
640013      35159.16
469166      31246.98
169550       5005.68
297097       7571.57
523773      61588.33
988379     168903.21
642801      40807.38
954923     229085.02
777243     155606.37
383661      19294.82
197564       4986.80
957493     335807.05
             ...    
201389       1589.86
55498        1051.92
336667       3263.25
671305      50741.09
772617      47067.45
614817     143986.36
8603          590.42
59121         931.90
356070      16627.22
744864      55056.78
165403      46545.55
948746      78853.09
409467      28705.94
395204       7253.35
268223       1641.27
850702      92745.17
1006022    34

In [37]:
Y_actual_prediction = np.exp(reg.predict(test_data_preprocessed))

In [38]:
submission_df = pd.DataFrame({'Instance': instance, 'Total Yearly Income [EUR]': Y_actual_prediction})

In [None]:
instance

In [39]:
submission_df.to_csv('Submission_latest_3.csv', index=False)