In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.cluster import DBSCAN
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn import preprocessing
import xgboost as xgb

In [None]:
# Step 0: Read the feature and target tables from disk.
# The first column of each feature file is sliced away with iloc[:, 1:]
# (presumably a row-id column -- confirm against the CSV headers).
# The target file is kept whole at this point.

xtrain = pd.read_csv("X_train.csv").iloc[:, 1:]
xtest = pd.read_csv("X_test.csv").iloc[:, 1:]
ytrain = pd.read_csv("Y_train.csv")

In [None]:
# Step 1: Impute missing values
# Each NaN is replaced by the median of its column. The medians come from the
# training set only and the very same values are reused for the test set, so
# that no statistic is ever estimated from test data.

train_medians = xtrain.median()
xtrain = xtrain.fillna(train_medians)
xtest = xtest.fillna(train_medians)

In [None]:
# Step 2: Scale the training and test data
# The scaler learns its per-column mean / standard deviation from the training
# set only. The test set is then transformed with those same statistics.
# Re-fitting the scaler on the test set (fit_transform) would standardise the
# two sets with different parameters, so a model trained on xtrain would see
# inconsistently scaled inputs at prediction time.

scaler = preprocessing.StandardScaler()

xtrain_scaled = scaler.fit_transform(xtrain)
xtrain = pd.DataFrame(xtrain_scaled, columns = xtrain.columns)
# transform only -- do NOT refit on the test data
xtest_scaled = scaler.transform(xtest)
xtest = pd.DataFrame(xtest_scaled, columns = xtest.columns)

In [None]:
# Step 3: Reduce ytrain to the target column only
# Only the "y" column is kept. Rebuilding the frame from the raw values drops
# every other column (such as the id) and gives the result a fresh default
# index.

ytrain1 = ytrain["y"]
ytrain2 = pd.DataFrame({"y": ytrain1.values})
ytrain = ytrain2

In [None]:
# Reshuffle the training rows
# Features and target are joined into one temporary frame so that both are
# permuted with the same row order, then split apart again.
# The shuffle is seeded (random_state=42, the seed used by the models in this
# notebook) so that a Restart & Run All reproduces the same row order and the
# same cross-validation folds. xtrain itself is not mutated with a temporary
# "y" column; assign() works on a copy.

shuffled = (
    xtrain
    .assign(y=ytrain)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
ytrain = shuffled['y']
xtrain = shuffled.drop(columns='y')

In [None]:
# Outlier detection with local outlier factor
from sklearn.neighbors import LocalOutlierFactor

clf = LocalOutlierFactor(contamination=0.1, n_neighbors=700)

# fit_predict returns one label per row; rows labelled 1 are the ones kept
# (per the scikit-learn docs, 1 marks inliers and -1 marks outliers).
# NOTE: despite its name, `outliers` ends up as the boolean KEEP mask.
outliers = clf.fit_predict(xtrain) == 1

# Number of rows that survive the filter
print(np.count_nonzero(outliers))

# Apply the same mask to features and target so they stay aligned
xtrain, ytrain = xtrain[outliers], ytrain[outliers]

In [None]:
# Disabled feature-selection experiments, kept for reference only.
# Everything between the triple quotes is a plain string literal: none of it is
# executed, and that includes the two `from sklearn.feature_selection import ...`
# lines inside it, which therefore do NOT bring SelectKBest or
# mutual_info_regression into scope.
# NOTE(review): option 2 wraps a RandomForestClassifier although the other
# models in this notebook are regressors -- confirm before re-enabling.
'''

#Feature selection  -- cross validated inside the loop below
# option 1  
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
k_best = 500
xtrain = SelectKBest(mutual_info_regression, k=k_best).fit_transform(xtrain, ytrain)


# option 2
sel = SelectFromModel(RandomForestClassifier(n_estimators = 200))
sel.fit(xtrain, ytrain)
selected_feat = xtrain.columns[(sel.get_support())]
xtrain = xtrain.loc[:,selected_feat]
xtest = xtest.loc[:,selected_feat]
'''


In [None]:
# Peek at the first five rows of the current training matrix. Wrapping in a
# DataFrame lets this work whether xtrain is a DataFrame or a bare array.
pd.DataFrame(data=xtrain).head(n=5)

In [None]:
# Cross-validated grid over the voting ensemble (random forest + lasso + xgboost)
#
# For every combination of the hyper-parameter arrays below, a pipeline of
# "mutual-information feature selection -> voting regressor" is scored with
# `folds`-fold cross-validation on R^2. Results are collected in three parallel
# lists: cv_params[i] describes the setting behind cv_mean_scores[i] and
# cv_stds[i].
#
# Why a Pipeline: the selector is re-fitted on the training part of each fold,
# so the held-out part never influences which features are chosen. Selecting
# features on the full training set before cross-validating leaks information
# and inflates the scores. It also means xtrain is NOT overwritten here, so
# every grid point starts from the same full feature matrix.
# Trade-off: the mutual-information scores are recomputed for every fold of
# every grid point, so this cell is slower than selecting once up front.
from functools import partial
from itertools import product

from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
# These two must be imported here: nothing else in the executed code of this
# notebook brings them into scope.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

folds = 10

cv_mean_scores = []
cv_stds = []
cv_params = []
estimators = np.array([100])
alphas = np.array([0.01])
mutual_features = np.array([50])
max_depth = np.array([5, 10])

# mutual_info_regression is randomised; pin its seed so that repeated runs pick
# the same features.
mi_score = partial(mutual_info_regression, random_state = 42)

# product() walks the grid in the same order as the equivalent nested loops
# (estimators outermost, max_depth innermost).
for n, best, a, depth in product(estimators, mutual_features, alphas, max_depth):
    r1 = RandomForestRegressor(n_estimators = int(n), random_state = 42,
                               max_depth = int(depth), bootstrap = True)
    r2 = linear_model.Lasso(alpha = float(a))
    r3 = xgb.XGBRegressor(objective = "reg:squarederror", random_state = 42, n_estimators = 200)
    er = VotingRegressor([('rf', r1), ('lasso', r2), ('xgb', r3)])

    # Selector + ensemble are cross-validated together as one estimator.
    model = Pipeline([
        ('select', SelectKBest(mi_score, k = int(best))),
        ('ensemble', er),
    ])
    scores = cross_val_score(estimator = model, X = xtrain,
                             y = ytrain, scoring = 'r2', cv = folds)

    cv_params.append({'n_estimators': int(n), 'k_best': int(best),
                      'alpha': float(a), 'max_depth': int(depth)})
    cv_mean_scores.append(np.mean(scores))
    cv_stds.append(np.std(scores))


print(cv_mean_scores)
print(cv_stds)

In [None]:
# Display the list of mean cross-validation scores collected by the grid cell
# (one entry per evaluated hyper-parameter combination).
cv_mean_scores