In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor, plot_tree
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import RFECV, VarianceThreshold
pd.options.display.max_columns = None

In [2]:
def wmae(y_true, y_pred):
    y_true_sum = sum(y_true)
    return sum([(y_true[i] / y_true_sum) * abs(y_true[i] - y_pred[i]) for i in range(len(y_true))])

scorer = make_scorer(wmae, greater_is_better=False)

In [3]:
def grid_select_topdown(estimator, X, y, cv=5, scoring=None, n_jobs=None):
    from sklearn.model_selection import GridSearchCV
    num_feature = len(X.columns)
    max_set = list(X.columns)
    gs = GridSearchCV(estimator, {}, cv=cv, scoring=scoring, n_jobs=n_jobs)
    gs.fit(X.loc[:, max_set], y.values)
    max_score = gs.best_score_
    
    while num_feature >= 1:
        print('num_features :', num_feature)
        score_list = []
        local_max = -1e+10
        
        for i in range(len(X.columns)):
            feature_set = max_set.copy()
            if X.columns[i] not in feature_set:
                continue
            feature_set.remove(X.columns[i])
            gs = GridSearchCV(estimator, {}, cv=cv, scoring=scoring, n_jobs=n_jobs, iid=False)
            gs.fit(X.loc[:, feature_set], y.values)
            score_list.append((gs.best_score_, X.columns[i]))
            
        local_max = max(score_list)
        if local_max[0] >= max_score:
            max_score = local_max[0]
            max_set.remove(local_max[1])
            num_feature -= 1
        else:
            break
    return max_set, max_score

In [74]:
def grid_select_bottomup(estimator, X, y, cv=5, scoring=None, n_jobs=None, bottom=[]):
    from sklearn.model_selection import GridSearchCV
    bottom = list(bottom)
    num_feature = len(bottom) + 1
    max_set = bottom
    if bottom:
        gs = GridSearchCV(estimator, {}, cv=cv, scoring=scoring, n_jobs=n_jobs, iid=False)
        gs.fit(X.loc[:, max_set], y.values)
        max_score = gs.best_score_
    else:
        max_score = -1e+10
    
    while num_feature <= len(X.columns):
        print('num_features :', num_feature)
        score_list = []
        local_max = -1e+10
        
        for i in range(len(X.columns)):
            feature_set = max_set.copy()
            if X.columns[i] in feature_set:
                continue
            feature_set.append(X.columns[i])
            gs = GridSearchCV(estimator, {}, cv=cv, scoring=scoring, n_jobs=n_jobs, iid=False)
            gs.fit(X.loc[:, feature_set], y.values)
            score_list.append((gs.best_score_, X.columns[i]))
            
        local_max = max(score_list)
        if local_max[0] > max_score:
            max_score = local_max[0]
            max_set.append(local_max[1])
            num_feature += 1
        else:
            break
    return max_set, max_score

In [5]:
data = pd.read_csv('qualityData.csv')
data.head()

Unnamed: 0,Time_Sequence,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36,x37,x38,x39,x40,x41,x42,x43,x44,x45,x46,x47,x48,x49,x50,x51,x52,x53,x54,x55,x56,x57,x58,x59,x60,x61,x62,x63,x64,x65,x66,x67,x68,x69,x70,x71,x72,x73,x74,x75,x76,x77,x78,x79,x80,x81,x82,x83,x84,x85,Y
0,1,0.028837,-0.649824,0.363124,-0.752145,-0.631316,-0.588815,-0.9042,-0.577024,0.715939,0.87363,-0.063285,1.386803,1.369457,-1.415962,0.705534,0.577331,0.577289,-1.406626,-1.175603,1.350135,1.394348,1.819398,-0.567154,-1.739398,-0.577338,-1.446718,0.707884,0.171637,-1.576567,0.123278,-0.204355,1.195427,1.065614,1.260149,-1.900306,-0.725292,1.705609,1.77436,0.996925,-0.969366,2.542208,1.619811,0.664243,0.863402,-0.789514,-0.093108,1.324868,0.129826,1.218616,-0.610184,-0.53957,0.949478,-1.390485,1.000363,0.121672,0.710032,-0.511271,0.87363,0.01319,-0.160156,0.171637,3.695098,3.756213,-0.631316,-0.588815,-1.562243,-1.446718,1.819398,0.996925,-0.567154,-1.406626,-1.415962,-0.696711,-0.577338,-1.739398,-0.725292,1.394348,1.350135,1.77436,1.741823,1.705609,-1.175603,0.705534,1.350135,0.625413,2.185269
1,2,0.053882,-0.649824,0.347376,-0.499595,-0.528668,-0.487181,-0.9042,-0.576967,0.702009,-0.04018,0.564461,1.386803,1.368857,-1.780203,0.810728,0.577331,0.577356,-1.212648,-2.588321,1.404929,1.27781,1.952918,-0.540794,-1.928242,-0.577302,-1.560255,0.686913,0.171566,-1.590414,0.1438,-0.181573,1.194008,1.065614,1.255343,-1.949902,-0.991793,1.705365,1.701051,1.38738,-1.066326,2.542208,1.647681,0.633059,0.81111,-0.629883,-0.093821,1.323981,0.147267,1.220986,-0.587714,1.752507,0.943352,-1.390505,1.000455,0.121567,0.70754,-0.513719,-0.04018,-1.44649,3.658466,0.171566,3.694262,3.752843,-0.528668,-0.487181,-1.562243,-1.560255,1.952918,1.38738,-0.540794,-1.212648,-1.780203,-0.007904,-0.577302,-1.928242,-0.991793,1.27781,1.404929,1.701051,1.469954,1.705365,-2.588321,0.810728,1.404929,0.050509,1.572313
2,3,0.065778,-0.649824,0.347376,-0.732795,-0.586186,-0.578574,-0.9042,-0.577844,0.70379,-0.423391,0.077638,1.386803,1.396297,-1.390842,0.747611,0.577331,0.57733,-1.298861,-1.371211,1.573875,1.428878,2.059735,-0.830752,-1.793354,-0.577324,-1.568989,0.659779,0.163045,-1.593408,0.120171,-0.190815,1.194717,1.065614,1.257438,-1.781278,-1.125044,1.705609,1.758069,1.069568,-0.96196,2.542208,1.643748,0.629903,0.836281,-0.626171,-0.092185,1.325755,0.143323,1.217369,-0.613808,-0.451269,0.973792,-1.390499,1.000494,0.121604,0.702555,-0.50607,-0.423391,-2.322298,-0.160156,0.163045,3.69468,3.754312,-0.586186,-0.578574,-1.562243,-1.568989,2.059735,1.069568,-0.830752,-1.298861,-1.390842,-0.421188,-0.577324,-1.793354,-1.125044,1.428878,1.573875,1.758069,1.741823,1.705609,-1.371211,0.747611,1.573875,0.625413,2.581797
3,4,0.65307,-1.332022,1.150375,-1.709346,0.47391,1.22086,-0.038151,1.636807,0.448839,-0.7393,-0.071174,1.733044,1.380402,-0.144791,0.402768,-0.03245,0.78149,4.302894,-0.019427,-0.733468,-1.007124,-0.489489,-0.632534,-0.288692,-0.595148,-0.271747,-0.007886,-0.077363,-2.923825,0.221586,1.769661,2.550277,-0.116679,1.523361,1.127379,-0.149942,1.474274,-0.796812,-0.646316,-1.089218,-0.694778,2.90921,-0.020052,2.715818,-1.213122,1.378516,-0.360431,0.037214,-0.538254,-1.810349,1.260306,2.158371,-4.169996,1.128912,-0.000454,1.127538,-0.879555,-0.275507,0.786035,1.529219,1.175247,-0.776958,-0.348555,-0.07932,-1.258747,-1.556757,1.031074,-0.424738,-0.544519,-0.097836,1.596644,-0.201047,-0.469044,-1.087967,-0.157052,0.345591,-0.833474,-1.135911,-0.813025,1.966284,1.263878,-0.059706,-0.511038,-0.885362,0.791103,2.758224
4,5,0.039497,-0.649824,0.326514,-0.361767,-0.602523,-0.636483,-0.9042,-0.576836,0.705902,-0.585141,0.082894,1.386803,1.396198,-1.567649,0.757861,0.577331,0.577316,-1.431495,-2.19655,1.409495,1.426222,1.772495,-0.548905,-1.63045,-0.577346,-1.442127,0.684349,0.158038,-1.581557,0.147419,-0.161116,1.192595,1.065614,1.260727,-1.817139,-1.257156,1.705609,1.936643,1.126611,-1.169466,2.542208,1.62621,0.656214,0.882524,-0.661867,-0.070925,1.327316,0.146976,1.217746,-0.603419,-0.554934,0.959895,-1.390538,1.000508,0.12175,0.744988,-0.509127,-0.585141,0.597061,-0.160156,0.158038,3.69343,3.756618,-0.602523,-0.636483,-1.424307,-1.442127,1.772495,1.126611,-0.548905,-1.431495,-1.567649,-0.834472,-0.577346,-1.63045,-1.257156,1.426222,1.409495,1.936643,1.741823,1.705609,-2.19655,0.757861,1.409495,0.625413,


In [96]:
is_test = data['Y'].isnull()
train = data[~is_test]
test = data[is_test]

In [98]:
best_score = 9999
best_i = -1
for i in range(3000):
    kfold = KFold(10, shuffle=True, random_state=i).split(train)
    means = []
    stds = []
    for j in kfold:
        means.append(data.iloc[j[1]]['Y'].describe()['mean'])
        stds.append(data.iloc[j[1]]['Y'].describe()['std'])
    score = np.array(means).std() + np.array(stds).std()
    if score < best_score:
        best_score = score
        best_i = i
print(best_i)
kfold = KFold(10, shuffle=True, random_state=best_i)

1242


In [99]:
X_train = train.drop('Y', axis=1)
y_train = train.loc[:, 'Y']
X_test = test.drop('Y', axis=1)
y_test = test.loc[:, 'Y']

In [106]:
gs_knn = GridSearchCV(KNeighborsRegressor(p=2), {'n_neighbors':[1,2,3,4,5,6,7], 'weights':['uniform', 'distance']}, scoring=scorer, n_jobs=-1, cv=kfold)
gs_knn.fit(X_train, y_train.values)
gs_knn.best_score_, gs_knn.best_params_

(-0.6391740000403979, {'n_neighbors': 1, 'weights': 'uniform'})

In [108]:
knn_set, knn_score = grid_select_topdown(KNeighborsRegressor(n_neighbors=1, weights='uniform', p=2), X_train, y_train, cv=kfold, scoring=scorer, n_jobs=-1)
knn_set, knn_score

num_features : 86
num_features : 85
num_features : 84
num_features : 83
num_features : 82
num_features : 81
num_features : 80
num_features : 79
num_features : 78
num_features : 77
num_features : 76
num_features : 75
num_features : 74
num_features : 73
num_features : 72
num_features : 71
num_features : 70
num_features : 69
num_features : 68
num_features : 67


(['Time_Sequence',
  'x1',
  'x2',
  'x3',
  'x4',
  'x6',
  'x7',
  'x9',
  'x10',
  'x11',
  'x12',
  'x13',
  'x14',
  'x15',
  'x16',
  'x17',
  'x18',
  'x19',
  'x20',
  'x21',
  'x22',
  'x23',
  'x24',
  'x25',
  'x26',
  'x28',
  'x29',
  'x32',
  'x33',
  'x34',
  'x35',
  'x36',
  'x37',
  'x38',
  'x41',
  'x42',
  'x44',
  'x45',
  'x47',
  'x49',
  'x50',
  'x51',
  'x52',
  'x53',
  'x54',
  'x56',
  'x58',
  'x61',
  'x62',
  'x65',
  'x67',
  'x68',
  'x70',
  'x72',
  'x73',
  'x74',
  'x75',
  'x76',
  'x77',
  'x78',
  'x79',
  'x80',
  'x81',
  'x82',
  'x83',
  'x84',
  'x85'],
 -0.5869556807772816)

In [109]:
knn_set, knn_score = grid_select_bottomup(KNeighborsRegressor(n_neighbors=1, weights='uniform', p=2), X_train, y_train, cv=kfold, scoring=scorer, n_jobs=-1, bottom=knn_set)
knn_set, knn_score

num_features : 68
num_features : 69


(['Time_Sequence',
  'x1',
  'x2',
  'x3',
  'x4',
  'x6',
  'x7',
  'x9',
  'x10',
  'x11',
  'x12',
  'x13',
  'x14',
  'x15',
  'x16',
  'x17',
  'x18',
  'x19',
  'x20',
  'x21',
  'x22',
  'x23',
  'x24',
  'x25',
  'x26',
  'x28',
  'x29',
  'x32',
  'x33',
  'x34',
  'x35',
  'x36',
  'x37',
  'x38',
  'x41',
  'x42',
  'x44',
  'x45',
  'x47',
  'x49',
  'x50',
  'x51',
  'x52',
  'x53',
  'x54',
  'x56',
  'x58',
  'x61',
  'x62',
  'x65',
  'x67',
  'x68',
  'x70',
  'x72',
  'x73',
  'x74',
  'x75',
  'x76',
  'x77',
  'x78',
  'x79',
  'x80',
  'x81',
  'x82',
  'x83',
  'x84',
  'x85',
  'x64'],
 -0.5868784115438085)

In [112]:
gs_knn = GridSearchCV(KNeighborsRegressor(p=2), {'n_neighbors':[1,2,3,4,5,6,7], 'weights':['uniform', 'distance']}, scoring=scorer, n_jobs=-1, cv=kfold)
gs_knn.fit(X_train.loc[:, knn_set], y_train.values)
gs_knn.best_score_, gs_knn.best_params_

(-0.5868479172124043, {'n_neighbors': 1, 'weights': 'uniform'})

In [113]:
knn = KNeighborsRegressor(n_neighbors=1, weights='uniform', p=2)
knn.fit(X_train.loc[:, knn_set], y_train)
wmae(y_train.values, knn.predict(X_train.loc[:, knn_set]))

0.0

In [115]:
knn.predict(X_test.loc[:, knn_set])

array([ 2.58179653,  3.33903699, 16.28358384,  1.46418484,  1.97583318,
        1.50017081,  2.7034235 ,  1.53615677,  1.24775737,  1.68044179,
        1.46418484,  1.24775737,  1.31989986,  1.10347237,  1.46418484,
        1.46418484,  1.57231332,  2.07714023,  1.39204235,  1.28374334,
        1.28374334,  1.39204235,  2.43476772,  1.24775737,  1.28374334,
        1.17561487,  1.24775737,  1.1396289 ,  1.57397914,  4.20440572,
        1.1396289 ,  1.21177139,  2.18526867,  1.28374334,  1.21177139,
        1.17561487,  1.46418484,  1.31989986,  1.31989986,  3.05624056,
        1.35588583,  1.46418484,  1.39204235,  1.39204235,  1.46418484,
        1.64445579,  1.24775737,  1.24775737,  1.24775737,  1.35588583,
        1.28134739,  1.31989986,  1.24775737,  1.24775737,  1.35588583,
        1.39204235,  3.158766  ,  1.28374334,  1.24775737,  1.21177139,
        1.21177139,  1.57231332,  1.53615677,  1.17561487,  1.17561487,
        1.42802834,  1.17561487,  0.44543362,  1.10347237,  1.35

In [121]:
X_test_ = X_test.copy()
X_test_['Y'] = knn.predict(X_test.loc[:, knn_set])
pred_ = X_test_.iloc[:,[0,-1]]

In [124]:
result = pd.read_csv('quality_result.csv')
result['Y'] = pred_['Y'].values
result.to_csv('quality_result.csv', index=False)