In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.cluster import DBSCAN
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn import preprocessing
import xgboost as xgb

In [10]:
# Step 0: Load the proper csvs

xtrain = pd.read_csv("X_train.csv")
xtrain = xtrain.iloc[:, 1:]
xtest = pd.read_csv("X_test.csv")
xtest = xtest.iloc[:, 1:]
ytrain = pd.read_csv("Y_train.csv")

In [11]:
# Step 1: Treat missing values
# Treat missing values as column medians. Important (apparently) to use the medians from the training set in the test set

xtrain = xtrain.fillna(xtrain.median())
xtest = xtest.fillna(xtrain.median())

In [12]:
# Step 2: Scale the training and test data

scaler = preprocessing.StandardScaler()

xtrain_scaled = scaler.fit_transform(xtrain)
xtrain = pd.DataFrame(xtrain_scaled, columns = xtrain.columns)
xtest_scaled = scaler.fit_transform(xtest)
xtest = pd.DataFrame(xtest_scaled, columns = xtest.columns)

In [13]:
# Step 3: Remove unnecessary id column from ytrain that just ***** things up

ytrain1 = ytrain.loc[:, "y"]
ytrain2 = pd.DataFrame(data = ytrain1.values, columns= ['y'])
ytrain = ytrain2

In [6]:
# Remove features with zero weight from Lasso

clf = Lasso(alpha=0.3)
clf.fit(xtrain, ytrain)

features = (clf.coef_ != 0)

xtrain = xtrain.loc[:, features]
xtest = xtest.loc[:, features]

In [15]:
# Run random forest to select features

sel = SelectFromModel(RandomForestClassifier(n_estimators = 100))
sel.fit(xtrain, ytrain)
sel.get_support()

print(np.count_nonzero(sel.get_support()))

selected_feat = xtrain.columns[(sel.get_support())]
xtrain = xtrain.loc[:,selected_feat]
xtest = xtest.loc[:,selected_feat]

  self.estimator_.fit(X, y, **fit_params)


187


In [16]:



# Do weird Andreas PCA thing

from sklearn import decomposition
import matplotlib.pyplot as plt
pca = decomposition.PCA(n_components=2)
principal_components = pca.fit_transform(xtrain)
xpca = pd.DataFrame(data = principal_components, columns = ['PC1', 'PC2'])

outliers = (xpca['PC1'] <= 9) & (xpca['PC1'] >= -8) & (xpca['PC2'] <= 7) & (xpca['PC2'] >= -7)
print(np.count_nonzero(outliers))
xtrain = xtrain[outliers]
ytrain = ytrain[outliers]

'''

# Outlier detection with local outlier factor
from sklearn.neighbors import LocalOutlierFactor

clf = LocalOutlierFactor(n_neighbors=700, contamination=0.08)
outliers = clf.fit_predict(xtrain)

# Remove outliers from xtrain and ytrain


outliers = outliers == 1
print(np.count_nonzero(outliers))
xtrain = xtrain[outliers]
ytrain = ytrain[outliers]

'''

1130


'\n\n# Outlier detection with local outlier factor\nfrom sklearn.neighbors import LocalOutlierFactor\n\nclf = LocalOutlierFactor(n_neighbors=700, contamination=0.08)\noutliers = clf.fit_predict(xtrain)\n\n# Remove outliers from xtrain and ytrain\n\n\noutliers = outliers == 1\nprint(np.count_nonzero(outliers))\nxtrain = xtrain[outliers]\nytrain = ytrain[outliers]\n\n'

In [17]:
# k-fold cross validation evaluation of xgboost model

from numpy import loadtxt
import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from scipy import stats
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, KFold


clf = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)



cv_means = []
cv_stds = []
for i in np.arange(10):
    scores = cross_val_score(estimator = clf,
                                 X = xtrain,
                                 y = ytrain,
                                 scoring = 'r2',
                                 cv = KFold(n_splits=5, shuffle = True))
    cv_means.append(np.mean(scores))
    cv_stds.append(np.std(scores))

print(cv_means)
print(cv_stds)
print(np.mean(cv_means))
print(np.mean(cv_stds))

[0.5843780393241074, 0.6051466512254278, 0.5792160005648814, 0.5884335370314853, 0.5796901589253641, 0.5812863890903672, 0.5939001075487136, 0.5952017494319561, 0.6037976861195885, 0.5999812160154778]
[0.03406034733961235, 0.02788155696743091, 0.028884326591303754, 0.019336394503352606, 0.04207427197152635, 0.05610323858298973, 0.03471751035931504, 0.03270662885253892, 0.0434266791798503, 0.04002556940167263]
0.5911031535277369
0.03592165237495926


LOF 10 folds with 10 iterations:
Mean of meanscores: 0.542
Mean of stdscores: 0.0778
Public score: 

Standard XGB 5 folds with 10 iterations:
Mean of meanscores: 0.514
Mean of stdscores: 0.0487
Public score: 0.656859408726

LOF XGB 5 folds with 10 iterations:
Mean of meanscores: 0.534
Mean of stdscores: 0.0491
Public Score: 0.596

PCA XGB 5 folds with 10 iterations:
Mean of meanscores: 0.541
Mean of stdscores: 0.0381
Public score: 0.613

Lasso feats XGB 5 folds with 10 iterations:
Mean of meanscores: 0.529
Mean of stdscores: 0.0489
Public score: 0.621

Lasso feats LOF XGB 5 folds with 10 iterations:
Mean of meanscores: 0.571
Mean of stdscores: 0.0287
Public score: 0.628

Lasso feats PCA XGB 5 folds with 10 iterations:
Mean of meanscores: 0.642
Mean of stdscores: 0.0307
Public score: 0.650

Random Forest (x3: 86 feats left) XGB 5 folds with 10 iterations:
Mean of meanscores: 0.504
Mean of stdscores: 0.0734
Public score: 

Random Forest (x3: 96 feats left) LOF XGB 5 folds with 10 iterations:
Mean of meanscores: 0.504
Mean of stdscores: 0.0605
Public score:

Random Forest (x3: 80 feats left) PCA XGB 5 folds with 10 iterations:
Mean of meanscores: 0.555
Mean of stdscores: 0.0479
Public score:

Random Forest (x2: 187 feats left) PCA XGB 5 folds with 10 iterations:
Mean of meanscores: 0.591
Mean of stdscores: 0.0359
Public score: 0.647

In [18]:
# Save to csv

clf = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
clf.fit(xtrain, ytrain)

y_pred = clf.predict(xtest)

index = pd.read_csv("sample.csv")
index['y'] = y_pred

index.to_csv("predictionsRF2PCAXGB.csv")