In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from fancyimpute import mice
from fancyimpute import KNN, NuclearNormMinimization, SoftImpute, IterativeImputer, BiScaler
from sklearn.preprocessing import Imputer
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import math
import sklearn.metrics as sklm
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
df_train_x = pd.read_csv('../data/train_new.csv')

df_train_y = df_train_x.iloc[:, -1]
df_train_x = df_train_x.iloc[:, :-1]

print(df_train_y.shape)
print(df_train_x.shape)

(1401,)
(1401, 16)


In [3]:
df_test = pd.read_csv('../data/test_x_new.csv')
df_test.shape

(616, 16)

# PCA

In [None]:
def pca_reduce(df):
    pca = PCA(n_components=10)
    pca = pca.fit(df)
    
    unit_vec = pca.components_ #得到投影之單位向量
    # print('單位向量:', unit_vec)
    
    reduced_data = pca.transform(df)
    print('降維後資料:', reduced_data.shape)
    
    return reduced_data

In [None]:
# df_train_x = pca_reduce(df_train_x)
# df_train_x.shape

In [None]:
# df_test = pca_reduce(df_test)
# df_test.shape

# Modeling

In [4]:
# Split train to 80% for training and 20% for validation
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(df_train_x, df_train_y.values, test_size=0.2)
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)

(1120, 16)
(1120,)
(281, 16)
(281,)


### Random Forest

In [5]:
# random forest
def rf_modeling(train_x, train_y):
    rf = RandomForestRegressor()
    param_grid = { 
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
    }

    grid_rf = GridSearchCV(rf, param_grid, n_jobs=-1, cv=5)
    forest_model = grid_rf.fit(train_x, train_y)
    
    return forest_model


forest_model = rf_modeling(train_x, train_y)
y_pred = forest_model.predict(test_x)

print('Root Mean Square Error = ' + str(math.sqrt(sklm.mean_squared_error(test_y, y_pred))))

Root Mean Square Error = 2.8300049508508383


In [6]:
rf_result = forest_model.predict(df_test)
rf_result.shape

(616,)

In [7]:
pd.Series(rf_result).to_csv('rf_result.csv', index=False)

### XGBoost

In [None]:
# XGBoost
def xgb_modeling(train_x, train_y):
    xgb_reg = XGBRegressor(n_estimators=100, max_depth=10,learning_rate=0.01,random_state=100,subsample=0.8)
    params = {'min_child_weight':[4,5], 'gamma':[i/10.0 for i in range(3,6)],  'subsample':[i/10.0 for i in range(6,11)],
              'colsample_bytree':[i/10.0 for i in range(6,11)], 'max_depth': [2,3,4]}
    grid = GridSearchCV(xgb_reg, params,scoring='neg_mean_squared_error',cv=5)
    
    xgb_model = grid.fit(train_x, train_y)
    return xgb_model


xgb_model = xgb_modeling(train_x, train_y)
y_pred = xgb_model.predict(test_x)

print('Root Mean Square Error = ' + str(math.sqrt(sklm.mean_squared_error(test_y, y_pred))))

In [None]:
xgb_result = xgb_model.predict(df_test)
xgb_result.shape

In [None]:
pd.Series(xgb_result).to_csv('xgb_result.csv', index=False)