In [1]:
%load_ext autoreload
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
all_data = pd.read_csv('train.csv.zip')
all_data.head(2)

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0


In [3]:
#Removing duplicate or constant columns as per https://www.kaggle.com/yohanb/categorical-features-encoding-xgb-0-554
columns_to_remove = ['X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347', 
                     'X382', 'X232', 'X279', 'X35', 'X37', 'X39', 'X302', 'X113', 'X134', 'X147', 'X222', 
                     'X102', 'X214', 'X239', 'X76', 'X324', 'X248', 'X253', 'X385', 'X172', 'X216', 'X213', 
                     'X84', 'X244', 'X122', 'X243', 'X320', 'X245', 'X94', 'X242', 'X199', 'X119', 'X227', 
                     'X146', 'X226', 'X326', 'X360', 'X262', 'X266', 'X247', 'X254', 'X364', 'X365', 'X296', 'X299',
                     'X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']
new_columns = [col for col in all_data.columns if col not in columns_to_remove]
data1 = all_data[new_columns]

In [4]:
data1.shape

(4209, 321)

## Baseline linear regression without categorical features

In [5]:
X = data1.iloc[:,10:].values
y = data1.iloc[:,1].values

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2834)

In [7]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score, mean_squared_error
model = LinearRegression()
model.fit(X_train, y_train)
preds = model.predict(X_test)
print('Train R^2: ', model.score(X_train, y_train))
print('Test R^2: ', r2_score(y_test, preds))
print('Test MSE: ', mean_squared_error(y_test, preds))


Train R^2:  0.5843533315823948
Test R^2:  -6.160078533586281e+22
Test MSE:  9.528489855981273e+24


## Label encoding of all columns


In [8]:
from sklearn.preprocessing import RobustScaler, LabelEncoder, OrdinalEncoder
test_data = pd.read_csv('test.csv.zip')
combined = pd.concat([all_data, test_data], axis=0, sort=False)

cat_column_names = ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']

label_encoders = {}

for col in cat_column_names:
    label_encoder = LabelEncoder()
    label_encoder.fit(combined[col])
    label_encoders[col] = label_encoder


## Mean encoding of columns
Now we will do a naive mean encoding of the categorical columns X0-X8

In [9]:
data2 = data1.copy()

In [10]:
import sys,os, pathlib
current = pathlib.Path(os.getcwd())
base = current.parent.parent
catenc = base.joinpath('categorical-encoding')
sys.path.append(str(catenc))

In [11]:
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.target_encoder import TargetEncoder

cat_column_names = ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']

for col in cat_column_names:
    data2[col] = label_encoders[col].transform(data2[col])


X = data2.iloc[:,2:].values
y = data2.iloc[:,1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2834)

mean_enc_columns = [data2.columns.get_loc(c) for c in data2.columns if c in cat_column_names]


m_encoder = LeaveOneOutEncoder(cols=mean_enc_columns)
m_encoder.fit(X_train, y_train)
X_train = m_encoder.transform(X_train)
X_test = m_encoder.transform(X_test)

scaler = RobustScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

model = Ridge(alpha = 20) #Best alpha we could get via hyperparameter tuning
model.fit(X_train, y_train)
preds = model.predict(X_test)
print('Train R^2: ', model.score(X_train, y_train))
print('Test R^2: ', r2_score(y_test, preds))
print('Test MSE: ', mean_squared_error(y_test, preds))

Train R^2:  0.5753189141790043
Test R^2:  0.5875966381290332
Test MSE:  63.79109014170886


OK much better. 

Let's add some non-linearity

In [12]:
from sklearn.preprocessing import PolynomialFeatures
new_features = PolynomialFeatures(degree=3,include_bias=False).fit_transform(X[:,mean_enc_columns])
X1 = np.hstack([X[:, :-8], new_features])
X1.shape

(4209, 475)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=2834)

m_encoder = LeaveOneOutEncoder(cols=mean_enc_columns)
m_encoder.fit(X_train, y_train)
X_train = m_encoder.transform(X_train)
X_test = m_encoder.transform(X_test)

scaler = RobustScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

model = Ridge(alpha=55)
model.fit(X_train, y_train)
preds = model.predict(X_test)
print('Train R^2: ', model.score(X_train, y_train))
print('Test R^2: ', r2_score(y_test, preds))
print('Test MSE: ', mean_squared_error(y_test, preds))

Train R^2:  0.5726433948084718
Test R^2:  0.5690688979007397
Test MSE:  66.65698517627705


The result is actually worse. Is it because we ignored the standard deviation?

We will try to achieve the best result by using random forest

In [14]:
from sklearn.ensemble import RandomForestRegressor

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2834)


#m_encoder = LeaveOneOutEncoder(cols=mean_enc_columns)
m_encoder = TargetEncoder(cols=mean_enc_columns, smoothing=1E-2)
m_encoder.fit(X_train, y_train)
X_train = m_encoder.transform(X_train)
X_test = m_encoder.transform(X_test)


model = RandomForestRegressor(n_estimators=300, max_depth=5, random_state=2834, n_jobs=-1) 
model.fit(X_train, y_train)
preds = model.predict(X_test)

print('Train R^2: ', model.score(X_train, y_train))
print('Test R^2: ', r2_score(y_test, preds))
print('Test MSE: ', mean_squared_error(y_test, preds))

Train R^2:  0.6028315218920661
Test R^2:  0.6168638369298609
Test MSE:  59.26400164168092


In [38]:
#generate submission
kaggle1 = test_data[new_columns[2:]].copy()

for col in cat_column_names:
    kaggle1[col] = label_encoders[col].transform(kaggle1[col])
    
X_kaggle = kaggle1.values

In [31]:
X_kaggle_tran = m_encoder.transform(X_kaggle)
preds_kaggle = model.predict(X_kaggle_tran)
preds_kaggle_df = pd.DataFrame({'ID': test_data.ID, 'y': preds_kaggle, })
preds_kaggle_df.head(2)
preds_kaggle_df.to_csv('te_submission.csv', index=False)

ValueError: array length 105225 does not match index length 4209

This is much better than Ridge regression, but without limiting the max depth it overfits tremendously

In [17]:
from category_encoders.multiple_imputation import MultipleImputationEncoder

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2834)


m_encoder = MultipleImputationEncoder(cols=mean_enc_columns, smoothing=1E-2, n_draws=25, random_state=2834)
m_encoder.fit(X_train, y_train)
X_train = m_encoder.transform(X_train)
X_test = m_encoder.transform(X_test)
y_train = m_encoder.expand_y(y_train)


model = RandomForestRegressor(n_estimators=300, max_depth=5, random_state=2834, n_jobs=-1) 
model.fit(X_train, y_train)
preds = model.predict(X_test)
preds = m_encoder.average_y(preds)

print('Train R^2: ', model.score(X_train, y_train))
print('Test R^2: ', r2_score(y_test, preds))
print('Test MSE: ', mean_squared_error(y_test, preds))

Train R^2:  0.5890384290055496
Test R^2:  0.6125592934368611
Test MSE:  59.929834045992806


I do not see much difference. We need another example where we can prove better effectiveness of this algorithm.

In [18]:
X_kaggle_tran = m_encoder.transform(X_kaggle)
preds_kaggle = model.predict(X_kaggle_tran)
preds_kaggle = m_encoder.average_y(preds_kaggle)
preds_kaggle_df = pd.DataFrame({'ID': test_data.ID, 'y': preds_kaggle, })
preds_kaggle_df.head(2)
preds_kaggle_df.to_csv('im_submission.csv', index=False)

In [27]:
#%autoreload 2

from sklearn.ensemble import RandomForestRegressor
from category_encoders.hot_deck import HotDeckEncoder

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2834)

print('Encoding...')
m_encoder = HotDeckEncoder(cols=mean_enc_columns, smoothing=1E-2, n_draws=25, random_state=2834)
m_encoder.fit(X_train, y_train)
X_train = m_encoder.transform(X_train)
X_test = m_encoder.transform(X_test)
y_train = m_encoder.expand_y(y_train)

print('Training the model...')
model = RandomForestRegressor(n_estimators=300, max_depth=5, random_state=2834, n_jobs=-1) 
model.fit(X_train, y_train)
preds = model.predict(X_test)
preds = m_encoder.average_y(preds)

print('Train R^2: ', model.score(X_train, y_train))
print('Test R^2: ', r2_score(y_test, preds))
print('Test MSE: ', mean_squared_error(y_test, preds))

Encoding...
Training the model...
Train R^2:  0.588942390149034
Test R^2:  0.6125171454499816
Test MSE:  59.93635355160043


In [39]:
X_kaggle_tran = m_encoder.transform(X_kaggle)
preds_kaggle = model.predict(X_kaggle_tran)
preds_kaggle_avg = m_encoder.average_y(preds_kaggle)
preds_kaggle_df = pd.DataFrame({'ID': test_data.ID, 'y': preds_kaggle_avg, })
preds_kaggle_df.head(2)
preds_kaggle_df.to_csv('hd_submission.csv', index=False)

Got slightly worse results than Normal distribution based cat encoding