In [1]:
import pandas as pd
import numpy as np

# Read in  Data

Data taken from [Kaggle Tabular Playground Series](https://www.kaggle.com/c/tabular-playground-series-jan-2021)

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train.head()

Unnamed: 0,id,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,target
0,1,0.67039,0.8113,0.643968,0.291791,0.284117,0.855953,0.8907,0.285542,0.558245,0.779418,0.921832,0.866772,0.878733,0.305411,7.243043
1,3,0.388053,0.621104,0.686102,0.501149,0.64379,0.449805,0.510824,0.580748,0.418335,0.432632,0.439872,0.434971,0.369957,0.369484,8.203331
2,4,0.83495,0.227436,0.301584,0.293408,0.606839,0.829175,0.506143,0.558771,0.587603,0.823312,0.567007,0.677708,0.882938,0.303047,7.776091
3,5,0.820708,0.160155,0.546887,0.726104,0.282444,0.785108,0.752758,0.823267,0.574466,0.580843,0.769594,0.818143,0.914281,0.279528,6.957716
4,8,0.935278,0.421235,0.303801,0.880214,0.66561,0.830131,0.487113,0.604157,0.874658,0.863427,0.983575,0.900464,0.935918,0.435772,7.951046


In [4]:
test.head()

Unnamed: 0,id,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14
0,0,0.3536,0.73878,0.600939,0.293377,0.285691,0.458006,0.620704,0.422249,0.369203,0.435727,0.55054,0.699134,0.286864,0.364515
1,2,0.907222,0.189756,0.215531,0.869915,0.301333,0.528958,0.390351,0.521112,0.794779,0.79858,0.446475,0.449037,0.916964,0.513002
2,6,0.179287,0.355353,0.623972,0.437812,0.282476,0.320826,0.386789,0.776422,0.222268,0.229102,0.211913,0.222651,0.327164,0.827941
3,7,0.359385,0.181049,0.551368,0.206386,0.280763,0.482076,0.506677,0.362793,0.379737,0.345686,0.445276,0.518485,0.299028,0.598166
4,10,0.335791,0.682607,0.676481,0.219465,0.282861,0.581721,0.748639,0.350158,0.448915,0.506878,0.817721,0.805895,0.790591,0.249275


In [5]:
print("Training data missing values" + "\n" + str(pd.isnull(train).sum()))
print("\n")
print("Testing data missing values" + "\n" + str(pd.isnull(test).sum()))

Training data missing values
id        0
cont1     0
cont2     0
cont3     0
cont4     0
cont5     0
cont6     0
cont7     0
cont8     0
cont9     0
cont10    0
cont11    0
cont12    0
cont13    0
cont14    0
target    0
dtype: int64


Testing data missing values
id        0
cont1     0
cont2     0
cont3     0
cont4     0
cont5     0
cont6     0
cont7     0
cont8     0
cont9     0
cont10    0
cont11    0
cont12    0
cont13    0
cont14    0
dtype: int64


In [6]:
test_id = test.iloc[:, 0]

In [7]:
test_id

0              0
1              2
2              6
3              7
4             10
           ...  
199995    499984
199996    499985
199997    499987
199998    499988
199999    499990
Name: id, Length: 200000, dtype: int64

# Modeling

In [8]:
X_train = train.loc[:, "cont1":"cont14"]
y_train = train.loc[:, "target"]

X_test = test.loc[:, "cont1":"cont14"]

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

## Linear Regression

In [10]:
linear_model = LinearRegression()

linear_model.fit(X_train, y_train)

linear_model_predictions = linear_model.predict(X_test)

In [11]:
linear_model_results = pd.DataFrame({"id": test_id, "target": linear_model_predictions})

In [12]:
linear_model_results.to_csv("linear_model_submission.csv")

## Random Forest

In [13]:
random_forest = RandomForestRegressor(max_leaf_nodes = 100, random_state = 42)

random_forest.fit(X_train, y_train)

random_forest_predictions = random_forest.predict(X_test)

In [14]:
random_forest_results = pd.DataFrame({"id": test_id, "target": random_forest_predictions})

In [15]:
random_forest_results.to_csv("random_forest_submission.csv")

## XGBoost

In [16]:
xgboost = XGBRegressor(n_estimators = 100, learning_rate = 0.1, random_state = 42)

xgboost.fit(X_train, y_train)

xgboost_predictions = xgboost.predict(X_test)

In [17]:
xgboost_results = pd.DataFrame({"id": test_id, "target": xgboost_predictions})

In [18]:
xgboost_results.to_csv("xgboost_submission.csv")

# More In Depth Modeling

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [20]:
pipeline = Pipeline(steps = [
    ("standardize", StandardScaler()),
    ("xgboost", XGBRegressor())
])

In [21]:
params = {
    "xgboost__n_estimators": [100, 250],
    "xgboost__learning_rate": [0.01, 0.1]
}

In [22]:
search = GridSearchCV(pipeline, param_grid = params, n_jobs = -1, cv = 3, scoring = "neg_mean_squared_error")

In [23]:
search.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('standardize', StandardScaler()),
                                       ('xgboost',
                                        XGBRegressor(base_score=None,
                                                     booster=None,
                                                     colsample_bylevel=None,
                                                     colsample_bynode=None,
                                                     colsample_bytree=None,
                                                     gamma=None, gpu_id=None,
                                                     importance_type='gain',
                                                     interaction_constraints=None,
                                                     learning_rate=None,
                                                     max_delta_step=None,
                                                     max_depth=None,
                                    

In [24]:
print("The best MSE is " + str(-1 * search.best_score_))

The best MSE is 0.4919723401521385


In [25]:
search.best_estimator_

Pipeline(steps=[('standardize', StandardScaler()),
                ('xgboost',
                 XGBRegressor(base_score=0.5, booster='gbtree',
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1, gamma=0, gpu_id=-1,
                              importance_type='gain',
                              interaction_constraints='', learning_rate=0.1,
                              max_delta_step=0, max_depth=6, min_child_weight=1,
                              missing=nan, monotone_constraints='()',
                              n_estimators=250, n_jobs=16, num_parallel_tree=1,
                              random_state=0, reg_alpha=0, reg_lambda=1,
                              scale_pos_weight=1, subsample=1,
                              tree_method='exact', validate_parameters=1,
                              verbosity=None))])

In [26]:
xgboost_preds_cv = search.predict(X_test)
print(xgboost_preds_cv)

[7.9287033 7.8512063 7.9419    ... 8.149549  8.050108  7.9474397]


In [27]:
xgboost_cv_results = pd.DataFrame({"id": test_id, "target": xgboost_preds_cv})

In [28]:
xgboost_cv_results.to_csv("xgboost_cv_submission.csv")

## Neural Network

In [29]:
from sklearn.neural_network import MLPRegressor

In [30]:
pipeline = Pipeline(steps = [
    ("standardize", StandardScaler()),
    ("nn", MLPRegressor())
])

In [31]:
params = {
    "nn__activation": ["relu"],
    "nn__solver": ["sgd", "adam"],
    "nn__learning_rate": ["constant", "adaptive"],
    "nn__max_iter": [150, 100]
}

In [32]:
clf = GridSearchCV(pipeline, param_grid = params, cv = 3, scoring = "neg_mean_squared_error" )

In [40]:
clf.fit(X_train, y_train)



GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('standardize', StandardScaler()),
                                       ('nn', MLPRegressor())]),
             param_grid={'nn__activation': ['relu'],
                         'nn__learning_rate': ['constant', 'adaptive'],
                         'nn__max_iter': [150, 100],
                         'nn__solver': ['sgd', 'adam']},
             scoring='neg_mean_squared_error')

In [41]:
nn_predictions = clf.predict(X_test)
print(nn_predictions)

[7.99609859 7.82963742 8.04627313 ... 8.04903388 8.00253154 7.81815114]


In [42]:
nn_results = pd.DataFrame({"id": test_id, "target": nn_predictions})

In [43]:
nn_results.to_csv("nn_submission.csv")

# Average Predictions of NN, XGBoost with GridSearch, and RandomForest

In [44]:
average_predictions = (nn_predictions + xgboost_preds_cv  + random_forest_predictions) / 3
print(average_predictions)

[8.02215063 7.84047624 7.93074235 ... 8.0725205  8.01841533 7.87351931]


In [45]:
averaged_results = pd.DataFrame({"id": test_id, "target": average_predictions})

In [46]:
averaged_results.to_csv("averaged_model_submission.csv")