In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA

from xgboost import XGBRegressor

from scipy.stats import randint as sp_randint

from keras.models import Sequential
from keras.layers import Dense

In [2]:
# Read the training CSV (path is relative to the working directory) and display the frame.
train = pd.read_csv(filepath_or_buffer='train.csv')
train

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8405,107.39,ak,s,as,c,d,aa,d,q,...,1,0,0,0,0,0,0,0,0,0
4205,8406,108.77,j,o,t,d,d,aa,h,h,...,0,1,0,0,0,0,0,0,0,0
4206,8412,109.22,ak,v,r,a,d,aa,g,e,...,0,0,1,0,0,0,0,0,0,0
4207,8415,87.48,al,r,e,f,d,aa,l,u,...,0,0,0,0,0,0,0,0,0,0


This dataset contains an anonymized set of variables, each representing a custom feature in a Mercedes car. For example, a variable could be 4WD, added air suspension, or a head-up display.

The ground truth is labeled ‘y’ and represents the time (in seconds) that a car with the given combination of features took to pass testing.

In [3]:
# Structural overview of the frame: number of rows and columns, dtype counts, memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train.describe()

Unnamed: 0,ID,y,X10,X11,X12,X13,X14,X15,X16,X17,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
count,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,...,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0
mean,4205.960798,100.669318,0.013305,0.0,0.075077,0.057971,0.42813,0.000475,0.002613,0.007603,...,0.318841,0.057258,0.314802,0.02067,0.009503,0.008078,0.007603,0.001663,0.000475,0.001426
std,2437.608688,12.679381,0.11459,0.0,0.263547,0.233716,0.494867,0.021796,0.051061,0.086872,...,0.466082,0.232363,0.464492,0.142294,0.097033,0.089524,0.086872,0.040752,0.021796,0.037734
min,0.0,72.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2095.0,90.82,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4220.0,99.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6314.0,109.01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8417.0,265.32,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# The target is the 'y' column; the features are every column except the row
# identifier and the target itself.
y = train['y']
X = train.drop(columns=['ID', 'y'])

# Hold out 20% of the rows for the final evaluation. The fixed seed keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Partition the feature columns by dtype so that each group can be routed to its
# own preprocessing branch.
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object']
cols_num = X.select_dtypes(include=numeric_dtypes).columns
cols_cat = X.select_dtypes(include=categorical_dtypes).columns

# NOTE(review): no later cell in this notebook reads trans_num; the numeric
# pipeline defined in the next cell creates its own StandardScaler.
trans_num = StandardScaler()

In [7]:
# Numeric branch: standardise the columns, then keep as many principal components
# as are needed to explain 95% of the variance.
transformer_numerical = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
])

# Categorical branch: one-hot encode every level. No reference level is dropped:
# with handle_unknown='ignore' a category unseen during fit is encoded as all
# zeros, and with drop='first' that row would be indistinguishable from the
# dropped level (scikit-learn < 1.0 also rejects that combination outright).
transformer_categorical = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its branch; columns in neither group are
# dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(transformers=[
    ('numerical', transformer_numerical, cols_num),
    ('categorical', transformer_categorical, cols_cat),
])

In [8]:
# Two-step pipeline: shared preprocessing followed by a 'model' slot. The slot is
# left as None here; each search space in the tuning cell supplies the estimator.
pipe = Pipeline(steps = [('preprocessor', preprocessor), ('model', None)])

In [None]:
# Seed shared by the search and by every candidate model so that a re-run samples
# the same candidates and produces the same fits and ranking. The value matches
# the random_state used for the train/test split.
SEED = 42

# One dict per model family. RandomizedSearchCV first picks one of the dicts,
# then samples every entry of it: lists/arrays uniformly, scipy distributions
# through their rvs() method.
param_grid = [
    # Random forest.
    {
        'model__n_estimators' : sp_randint(200, 500),
        'model__max_depth' : np.arange(1, 11, 1),
        'model__min_samples_split' : np.arange(2, 10, 1),
        'model__min_samples_leaf' : sp_randint(1, 12),
        'model__ccp_alpha' : [0, 0.0001, 0.001, 0.01, 0.1, 5, 10, 50, 100, 500, 1000],
        'model': [RandomForestRegressor(random_state=SEED)]
    },
    # Linear model trained with SGD; the numeric scaler is tuned as well.
    {
        'preprocessor__numerical__scaler' : [StandardScaler(), MinMaxScaler()],
        'model__learning_rate' : ['constant', 'optimal', 'invscaling', 'adaptive'],
        # alpha=0 is not offered: SGDRegressor raises a ValueError when
        # learning_rate='optimal' is combined with alpha == 0, so such
        # candidates would only be recorded as failed fits.
        'model__alpha' : [0.0001, 0.001, 0.01, 0.1, 5, 10, 50, 100, 500, 1000],
        'model': [SGDRegressor(random_state=SEED)]
    },
    # Gradient-boosted trees.
    {
        'model__n_estimators' : sp_randint(10, 100),
        'model__max_depth': np.arange(2, 9),
        'model__learning_rate': np.arange(0.01, 0.3, 0.01),
        'model__colsample_bytree': np.arange(0.1, 1, 0.1),
        'model__gamma': np.arange(0, 20, 2),
        'model': [XGBRegressor(random_state=SEED)]
    }

]

# 100 sampled candidates, ranked by cross-validated R^2. With the default
# refit=True the best candidate is refit on all of X_train after the search.
grid = RandomizedSearchCV(pipe, param_grid, n_iter=100, scoring='r2', random_state=SEED)
grid.fit(X_train, y_train)



In [None]:
# Widen the column display so each candidate's full 'params' dict stays readable.
pd.set_option('display.max_colwidth', 300)

# Candidates ordered best-first, reduced to the columns needed to compare them.
report_columns = ['params', 'mean_fit_time', 'mean_test_score']
cv_results = pd.DataFrame(grid.cv_results_)
cv_results.sort_values(by='rank_test_score')[report_columns]

In [None]:
# Show the pipeline (preprocessing + model) that the search selected as best.
grid.best_estimator_

In [None]:
# Score the selected pipeline on the held-out split. RandomizedSearchCV uses
# refit=True by default, so best_estimator_ is already fitted on all of X_train;
# fitting it again here would only repeat that work and, for an unseeded model,
# replace the selected fit with a different random draw.
r2_score(y_test, grid.best_estimator_.predict(X_test))
# Presumably R^2 values noted from earlier runs:
#0.5901821334115114
#0.5963850181058674

In [None]:
model = Sequential()
model.add(Dense(12, input_shape=(563,), activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [None]:
X = pd.get_dummies(X)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
_, accuracy = model.evaluate(X, y)

In [None]:
print('Accuracy: %.2f' % (accuracy*100))

In [None]:
# Read the test CSV (path is relative to the working directory) and display the frame.
test = pd.read_csv(filepath_or_buffer='test.csv')
test

In [None]:
# Rebind X to the test-set features: every column of `test` except the row identifier.
X = test.drop(columns=['ID'])

In [None]:
# Predict the target for every test row with the selected pipeline and pair each
# prediction with the row's ID.
test_predictions = grid.best_estimator_.predict(X)
results = pd.DataFrame({'ID' : test['ID'], 'y' : test_predictions})
results

In [None]:
# Export is currently disabled; uncomment the line below to write `results` to
# submission.csv without the index column.
#results.to_csv('submission.csv', index=False)