In [82]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.cluster import DBSCAN
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn import preprocessing
import xgboost as xgb
from sklearn.ensemble import VotingRegressor
from sklearn import linear_model
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
from sklearn import linear_model
import warnings  
warnings.filterwarnings('ignore')



In [83]:
# Step 0: Load the proper csvs

xtrain = pd.read_csv("X_train.csv")
xtrain = xtrain.iloc[:, 1:]
xtest = pd.read_csv("X_test.csv")
xtest = xtest.iloc[:, 1:]
ytrain = pd.read_csv("Y_train.csv")

In [84]:
# Step 1: Treat missing values
# Treat missing values as column medians. Important (apparently) to use the medians from the training set in the test set

xtrain = xtrain.fillna(xtrain.median())
xtest = xtest.fillna(xtrain.median())

In [85]:
# Step 2: Scale the training and test data

scaler = preprocessing.StandardScaler()

xtrain_scaled = scaler.fit_transform(xtrain)
xtrain = pd.DataFrame(xtrain_scaled, columns = xtrain.columns)
xtest_scaled = scaler.fit_transform(xtest)
xtest = pd.DataFrame(xtest_scaled, columns = xtest.columns)

In [86]:
# Step 3: Remove unnecessary id column from ytrain 

ytrain1 = ytrain.loc[:, "y"]
ytrain2 = pd.DataFrame(data = ytrain1.values, columns= ['y'])
ytrain = ytrain2

In [87]:
# reshuffling data

xtrain['y'] = ytrain
xtrain = xtrain.sample(frac=1).reset_index(drop=True)
ytrain = xtrain['y']
xtrain = xtrain.drop('y', axis = 1)

In [88]:
# Outlier detection with local outlier factor
from sklearn.neighbors import LocalOutlierFactor

clf = LocalOutlierFactor(n_neighbors=700, contamination=0.1)
outliers = clf.fit_predict(xtrain)

# Remove outliers from xtrain and ytrain


outliers = outliers == 1
print(np.count_nonzero(outliers))
xtrain_ensemble = xtrain[outliers]
ytrain_ensemble = ytrain[outliers]

1090


In [89]:
'''

#Feature selection  -- cross validated inside the loop below
# option 1  
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
k_best = 500
xtrain = SelectKBest(mutual_info_regression, k=k_best).fit_transform(xtrain, ytrain)


# option 2
sel = SelectFromModel(RandomForestClassifier(n_estimators = 200))
sel.fit(xtrain, ytrain)
selected_feat = xtrain.columns[(sel.get_support())]
xtrain = xtrain.loc[:,selected_feat]
xtest = xtest.loc[:,selected_feat]
'''


'\n\n#Feature selection  -- cross validated inside the loop below\n# option 1  \nfrom sklearn.feature_selection import SelectKBest\nfrom sklearn.feature_selection import mutual_info_regression\nk_best = 500\nxtrain = SelectKBest(mutual_info_regression, k=k_best).fit_transform(xtrain, ytrain)\n\n\n# option 2\nsel = SelectFromModel(RandomForestClassifier(n_estimators = 200))\nsel.fit(xtrain, ytrain)\nselected_feat = xtrain.columns[(sel.get_support())]\nxtrain = xtrain.loc[:,selected_feat]\nxtest = xtest.loc[:,selected_feat]\n'

In [None]:
# Cross validating ensemble

folds = 5

cv_mean_scores = []
cv_stds = []
estimators = np.array([200, 350, 400])
mutual_features = np.array([70, 150, 250, 350])
alphas = np.array([0.01, 0.03, 0.1, 0.3])
max_depth = np.array([20, 40, 60])

for n in estimators:
    for best in mutual_features:
        for a in alphas: 
            for depth in max_depth: 
                
                xtrain = SelectKBest(mutual_info_regression, k = best).fit_transform(xtrain_ensemble, ytrain_ensemble)
                r1 = RandomForestRegressor(n_estimators = n, random_state = 42, max_depth = depth, bootstrap = True)
                r2 = linear_model.Lasso(alpha= a)
                r3 = xgb.XGBRegressor(objective  = "reg:squarederror", random_state = 52, n_estimators=200)
                er = VotingRegressor([('rf', r1), ('lasso', r2), ('xgb', r3)])
                scores = cross_val_score(estimator = er, X = xtrain,
                                   y = ytrain_ensemble, scoring = 'r2', cv = folds)
                cv_mean_scores.append(np.mean(scores))   
                cv_stds.append(np.std(scores))
                print("Mean Validation score {}".format(np.mean(scores)))
                print("Std Validation score {}".format(np.std(scores)))
                
        


Mean Validation score 0.5356786531562483
Std Validation score 0.08528096198344745
Mean Validation score 0.5332099906225032
Std Validation score 0.08357174327351075
Mean Validation score 0.5336738546691011
Std Validation score 0.07826274176003374
Mean Validation score 0.5321438557847966
Std Validation score 0.0851585291162828
Mean Validation score 0.5342665677948756
Std Validation score 0.08185473810553937
Mean Validation score 0.5369356969756409
Std Validation score 0.0892526340588144
Mean Validation score 0.5380390935356321
Std Validation score 0.08788292397292127
Mean Validation score 0.5297738909899686
Std Validation score 0.07598787562151886
Mean Validation score 0.5302298095070652
Std Validation score 0.0862223834868987
Mean Validation score 0.5263735507693845
Std Validation score 0.0782306866907978
Mean Validation score 0.5298764247270504
Std Validation score 0.09277977945929676
Mean Validation score 0.5236942613778266
Std Validation score 0.07720090790747172
Mean Validation scor

In [None]:
fig = plt.figure()
ax = plt.axes()
ax.plot(cv_mean_scores)

fig = plt.figure()
ax = plt.axes()
ax.plot(cv_stds)

In [13]:
#ensemble model with parameters chosen by max CV above -- to be adjusted after complete run 
folds = 5

xtrain = SelectKBest(mutual_info_regression, k = 100).fit_transform(xtrain_ensemble, ytrain_ensemble)
r1 = RandomForestRegressor(n_estimators = 200, random_state = 42, max_depth = 30, bootstrap = True)
r2 = linear_model.Lasso(alpha= 0.01)
r3 = xgb.XGBRegressor(objective  = "reg:squarederror", random_state = 42, n_estimators=200)
er = VotingRegressor([('rf', r1), ('lasso', r2), ('xgb', r3)])
scores = cross_val_score(estimator = er, X = xtrain,
    y = ytrain_ensemble, scoring = 'r2', cv = folds)



In [14]:
er.fit(xtrain_ensemble, ytrain_ensemble)
predict = er.predict(X = xtest)
index = pd.read_csv("sample.csv")
index['y'] = predict

index.to_csv("ensemble.csv")