In [127]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV, KFold

### Preprocessing

In [95]:
data = pd.read_csv("imputed_data.csv")

In [96]:
data

Unnamed: 0.1,Unnamed: 0,id,x001,x002,x003,x004,x005,x006,x007,x008,...,x757,x758,x759,x760,x761,x762,x763,x764,x765,y
0,0,0.0,9.681860e+10,6991.15,7.76,0.00380,5.378811e+09,0.31,266117.20,934577.0,...,0.0007,2.972810e+08,0.13,5.0,5.0,2.0,8.5127,14.28,-0.750000,5.0
1,1,1.0,3.304810e+09,13914.43,5.37,0.00015,1.652405e+09,0.00,11927742.92,1798051.0,...,0.1136,3.320000e+12,0.08,661.0,0.0,350.0,1.5700,160.12,0.706143,1.0
2,2,2.0,3.218944e+10,3991.98,5.77,0.00010,2.476111e+09,0.00,774385.01,375738.0,...,0.0029,1.004748e+08,0.39,39.0,2.0,18.0,9.6800,25.06,-0.490000,11.0
3,3,3.0,1.288000e+10,15937.45,5.86,0.00020,2.146667e+09,0.00,6324375.16,1932094.0,...,0.0000,3.480000e+11,0.25,2.0,1.0,0.0,4.5316,117.76,1.640000,1.0
4,4,4.0,3.063412e+10,3621.00,7.52,0.00060,1.392460e+09,0.21,169860.29,474253.0,...,0.0005,1.095466e+08,0.11,11.0,1.0,3.0,16.2717,5.81,-0.420000,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5375,5375,5375.0,3.948791e+09,24563.46,6.73,0.00035,9.871977e+08,0.43,3303184.55,3154159.0,...,0.0000,1.586033e+08,0.05,0.0,0.0,0.0,2.7480,93.45,0.220000,4.0
5376,5376,5376.0,9.279017e+10,21572.94,6.96,0.00120,3.093006e+09,0.30,2649164.57,2934417.0,...,0.0003,3.608917e+07,0.01,6.0,4.0,4.0,23.6890,76.05,-0.900000,8.0
5377,5377,5377.0,2.700359e+10,23061.73,6.36,0.00065,3.857656e+09,0.35,1825306.07,2395841.0,...,0.0057,1.786891e+06,0.53,44.0,0.0,28.0,4.3710,80.30,-0.700000,21.0
5378,5378,5378.0,4.351107e+10,5739.04,7.80,0.00065,1.318517e+09,0.29,144103.12,715173.0,...,0.0001,1.940000e+11,0.29,3.0,2.0,2.0,24.6594,7.95,0.470000,13.0


In [130]:
sc = StandardScaler()
X = sc.fit_transform(data.drop(columns = ['Unnamed: 0', 'id', 'y']))
y_train = data.y

In [98]:
pca = PCA(n_components = 0.99, svd_solver = 'full')
X_pca = pca.fit_transform(X)

In [99]:
Lasso = LassoCV(cv = 5, random_state = 0, max_iter = 1000).fit(X_pca, y)

In [100]:
Lasso.alpha_

0.05745956533015913

In [101]:
np.where(Lasso.coef_ != 0)

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  72,  74,  75,  76,  77,  78,  80,
         81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,
         94,  96,  98, 100, 102, 105, 106, 108, 109, 111, 112, 113, 114,
        116, 117, 119, 120, 122, 123, 125, 126, 127, 128, 129, 130, 131,
        133, 134, 135, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146,
        147, 148, 149, 150, 152, 153, 154, 158, 159, 160, 161, 162, 163,
        164, 165, 166, 167, 168, 169, 170, 171, 173, 174, 175, 177, 178,
        179, 180, 181, 182, 183, 184, 185, 186, 188, 190, 191, 193, 194,
        196, 198, 199, 200, 201, 203, 204, 207, 208

In [110]:
X_pca_lasso = X_pca[:,np.where(Lasso.coef_ != 0)[0]]

In [122]:
X_pca_lasso.shape

(5380, 188)

### Support Vector Machine Regressor

In [111]:
X_test = pd.read_csv("imputed_test.csv")

In [114]:
X_test.drop(columns = ['Unnamed: 0', 'id'], inplace = True)

In [123]:
X_test = sc.transform(X_test)[:, np.where(Lasso.coef_ != 0)[0]]

In [124]:
from sklearn.svm import SVR

In [126]:
svr = SVR()

In [132]:
hyperparameter_grid = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['poly', 'rbf'],
    'degree': [2, 3, 4],
    'gamma':[0.001, 0.01, 0.1, 'scale', 'auto'] 
}

gscv = GridSearchCV(svr, hyperparameter_grid, scoring = 'neg_root_mean_squared_error',
                    cv = KFold(n_splits = 5, shuffle = True, random_state = 0), verbose = 2)

In [133]:
gscv.fit(X_pca_lasso, y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV] END .........C=0.01, degree=2, gamma=0.001, kernel=poly; total time=   3.0s
[CV] END .........C=0.01, degree=2, gamma=0.001, kernel=poly; total time=   2.8s
[CV] END .........C=0.01, degree=2, gamma=0.001, kernel=poly; total time=   2.8s
[CV] END .........C=0.01, degree=2, gamma=0.001, kernel=poly; total time=   2.7s
[CV] END .........C=0.01, degree=2, gamma=0.001, kernel=poly; total time=   2.8s
[CV] END ..........C=0.01, degree=2, gamma=0.001, kernel=rbf; total time=   3.6s
[CV] END ..........C=0.01, degree=2, gamma=0.001, kernel=rbf; total time=   3.8s
[CV] END ..........C=0.01, degree=2, gamma=0.001, kernel=rbf; total time=   3.6s
[CV] END ..........C=0.01, degree=2, gamma=0.001, kernel=rbf; total time=   3.9s
[CV] END ..........C=0.01, degree=2, gamma=0.001, kernel=rbf; total time=   3.6s
[CV] END ..........C=0.01, degree=2, gamma=0.01, kernel=poly; total time=   2.9s
[CV] END ..........C=0.01, degree=2, gamma=0.0

[CV] END .........C=0.01, degree=4, gamma=0.001, kernel=poly; total time=   2.8s
[CV] END .........C=0.01, degree=4, gamma=0.001, kernel=poly; total time=   2.9s
[CV] END .........C=0.01, degree=4, gamma=0.001, kernel=poly; total time=   2.9s
[CV] END .........C=0.01, degree=4, gamma=0.001, kernel=poly; total time=   2.9s
[CV] END ..........C=0.01, degree=4, gamma=0.001, kernel=rbf; total time=   3.5s
[CV] END ..........C=0.01, degree=4, gamma=0.001, kernel=rbf; total time=   3.6s
[CV] END ..........C=0.01, degree=4, gamma=0.001, kernel=rbf; total time=   3.5s
[CV] END ..........C=0.01, degree=4, gamma=0.001, kernel=rbf; total time=   3.5s
[CV] END ..........C=0.01, degree=4, gamma=0.001, kernel=rbf; total time=   3.5s
[CV] END ..........C=0.01, degree=4, gamma=0.01, kernel=poly; total time=   3.8s
[CV] END ..........C=0.01, degree=4, gamma=0.01, kernel=poly; total time=   3.9s
[CV] END ..........C=0.01, degree=4, gamma=0.01, kernel=poly; total time=   3.7s
[CV] END ..........C=0.01, d

[CV] END ..........C=0.1, degree=3, gamma=0.001, kernel=poly; total time=   3.3s
[CV] END ..........C=0.1, degree=3, gamma=0.001, kernel=poly; total time=   3.2s
[CV] END ...........C=0.1, degree=3, gamma=0.001, kernel=rbf; total time=   4.0s
[CV] END ...........C=0.1, degree=3, gamma=0.001, kernel=rbf; total time=   3.8s
[CV] END ...........C=0.1, degree=3, gamma=0.001, kernel=rbf; total time=   4.1s
[CV] END ...........C=0.1, degree=3, gamma=0.001, kernel=rbf; total time=   3.9s
[CV] END ...........C=0.1, degree=3, gamma=0.001, kernel=rbf; total time=   3.8s
[CV] END ...........C=0.1, degree=3, gamma=0.01, kernel=poly; total time=   4.8s
[CV] END ...........C=0.1, degree=3, gamma=0.01, kernel=poly; total time=   4.8s
[CV] END ...........C=0.1, degree=3, gamma=0.01, kernel=poly; total time=   4.9s
[CV] END ...........C=0.1, degree=3, gamma=0.01, kernel=poly; total time=   5.1s
[CV] END ...........C=0.1, degree=3, gamma=0.01, kernel=poly; total time=   4.5s
[CV] END ............C=0.1, 

[CV] END .............C=1, degree=2, gamma=0.001, kernel=rbf; total time=   3.6s
[CV] END .............C=1, degree=2, gamma=0.001, kernel=rbf; total time=   3.7s
[CV] END .............C=1, degree=2, gamma=0.001, kernel=rbf; total time=   3.6s
[CV] END .............C=1, degree=2, gamma=0.001, kernel=rbf; total time=   3.6s
[CV] END .............C=1, degree=2, gamma=0.001, kernel=rbf; total time=   3.7s
[CV] END .............C=1, degree=2, gamma=0.01, kernel=poly; total time=   6.9s
[CV] END .............C=1, degree=2, gamma=0.01, kernel=poly; total time=   7.0s
[CV] END .............C=1, degree=2, gamma=0.01, kernel=poly; total time=   6.3s
[CV] END .............C=1, degree=2, gamma=0.01, kernel=poly; total time=   6.2s
[CV] END .............C=1, degree=2, gamma=0.01, kernel=poly; total time=   6.7s
[CV] END ..............C=1, degree=2, gamma=0.01, kernel=rbf; total time=   3.6s
[CV] END ..............C=1, degree=2, gamma=0.01, kernel=rbf; total time=   3.7s
[CV] END ..............C=1, 

[CV] END .............C=1, degree=4, gamma=0.001, kernel=rbf; total time=   3.4s
[CV] END .............C=1, degree=4, gamma=0.001, kernel=rbf; total time=   3.5s
[CV] END .............C=1, degree=4, gamma=0.001, kernel=rbf; total time=   3.6s
[CV] END .............C=1, degree=4, gamma=0.01, kernel=poly; total time=   5.9s
[CV] END .............C=1, degree=4, gamma=0.01, kernel=poly; total time=   6.5s
[CV] END .............C=1, degree=4, gamma=0.01, kernel=poly; total time=   6.3s
[CV] END .............C=1, degree=4, gamma=0.01, kernel=poly; total time=   6.4s
[CV] END .............C=1, degree=4, gamma=0.01, kernel=poly; total time=   5.5s
[CV] END ..............C=1, degree=4, gamma=0.01, kernel=rbf; total time=   3.8s
[CV] END ..............C=1, degree=4, gamma=0.01, kernel=rbf; total time=   3.8s
[CV] END ..............C=1, degree=4, gamma=0.01, kernel=rbf; total time=   4.6s
[CV] END ..............C=1, degree=4, gamma=0.01, kernel=rbf; total time=   4.4s
[CV] END ..............C=1, 

[CV] END ............C=10, degree=3, gamma=0.001, kernel=rbf; total time=   4.3s
[CV] END ............C=10, degree=3, gamma=0.01, kernel=poly; total time=   6.9s
[CV] END ............C=10, degree=3, gamma=0.01, kernel=poly; total time=   6.3s
[CV] END ............C=10, degree=3, gamma=0.01, kernel=poly; total time=   6.6s
[CV] END ............C=10, degree=3, gamma=0.01, kernel=poly; total time=   6.4s
[CV] END ............C=10, degree=3, gamma=0.01, kernel=poly; total time=   6.3s
[CV] END .............C=10, degree=3, gamma=0.01, kernel=rbf; total time=   4.0s
[CV] END .............C=10, degree=3, gamma=0.01, kernel=rbf; total time=   4.1s
[CV] END .............C=10, degree=3, gamma=0.01, kernel=rbf; total time=   3.7s
[CV] END .............C=10, degree=3, gamma=0.01, kernel=rbf; total time=   4.0s
[CV] END .............C=10, degree=3, gamma=0.01, kernel=rbf; total time=   4.9s
[CV] END .............C=10, degree=3, gamma=0.1, kernel=poly; total time=   7.5s
[CV] END .............C=10, 

In [139]:
gscv.best_params_

{'C': 10, 'degree': 2, 'gamma': 0.001, 'kernel': 'rbf'}

In [136]:
svm_args = gscv.best_params_

In [137]:
svm_tuned = SVR(**svm_args)

In [138]:
svm_tuned.fit(X_pca_lasso, y_train)

In [142]:
ypred_svm = svm_tuned.predict(X_test)

In [160]:
ID = pd.read_csv("imputed_test.csv").id.astype(np.int32)

In [161]:
y_pred = pd.DataFrame(ypred_svm, index = ID)

In [162]:
y_pred.columns = ["y"]

In [164]:
y_pred.to_csv("SVM_Kaggle.csv")

Score: 11.3426