<h1> Code for Lychee Yield Prediction </h1>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# multivariate linear regression with regularization
from sklearn.linear_model import LinearRegression, Ridge, Lasso
# support vector machine regression
from sklearn.svm import SVR
# neural network
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding
# normalization
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

import pickle
from calendar import monthrange

Using TensorFlow backend.


In [2]:
def clean_text_to_number(df):
    '''
    convert all text to 0
    '''
    cols = df.columns
    type_list = []
    for col in cols:
        print(col)
        try:
            df[col].astype(float)
        except:
            for i in range(df[col].shape[0]):
                if isinstance(df[col].iloc[i], str):
                    df[col].iloc[i] = 0
    return df

In [3]:
# import data frame
rain_df     = pd.read_excel('rain_amount_2003-2019.xlsx')
humid_df    = pd.read_excel('relative_humid_2003-2019.xlsx')
temp_df     = pd.read_excel('temp_2003-2019.xlsx')
area_df     = pd.read_excel('area_2003-2019.xls', sheet_name = 'Sheet1')
lychee_yield_df = pd.read_excel('lycheeproduct.xlsx')

# extract data
rain_df     = rain_df.iloc[5:-5, :]
humid_df    = humid_df.iloc[5:-5, :]
temp_df     = temp_df.iloc[5:-5, :]

# reset index
rain_df     = rain_df.reset_index().drop(columns=['index'])
humid_df    = humid_df.reset_index().drop(columns=['index'])
temp_df     = temp_df.reset_index().drop(columns=['index'])

# set column name
rain_df.columns     = ['days', 'location', 'date', '1', '4', '7', '10', '13', '16', '19', '22', 'total']
humid_df.columns    = ['days', 'location', 'date', '1', '4', '7', '10', '13', '16', '19', '22', 'mean']
temp_df.columns     = ['days', 'location', 'date', '1', '4', '7', '10', '13', '16', '19', '22', 'mean']
lychee_yield_df.columns = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', 'total', 'year']
area_df.columns     = ['year', 'district', 'code', 'province', 'allarea', 'yieldarea', 'yield', 'yieldperarea']
area_df['year']     = area_df['year'] - 543

# ดึงค่า date เก็บไว้ก่อน
all_datetime    = pd.to_datetime(rain_df['date'])

rain_df     = rain_df.drop(columns = ['location', 'days', 'date'])
humid_df    = humid_df.drop(columns =['location', 'days', 'date'])
temp_df     = temp_df.drop(columns =['location', 'days', 'date'])
area_df     = area_df.drop(columns = ['province'])

print('rain amount dataframe')
print(rain_df.shape)
print('humid dataframe')
print(humid_df.shape)
print('temperature dataframe')
print(temp_df.shape)
print('rain dataframe sample')
print(rain_df.head(5))
print('area dataframe sample')
print(area_df.head(5))

rain amount dataframe
(6083, 9)
humid dataframe
(6083, 9)
temperature dataframe
(6083, 9)
rain dataframe sample
     1    4    7   10 13    16    19   22 total
0    0    0    0    0  0     0     0  1.2   1.2
1  4.4  0.2    5  2.6  0  17.4     0    0  29.6
2    0    0    0    0  0     0     0    0     -
3    0    0    0    0  0     0     0    0     -
4    0  0.6  0.7  2.5  2   7.3  15.1  1.4  29.6
area dataframe sample
   year  district  code  allarea  yieldarea      yield  yieldperarea
0  1994         1    10    24562      16404   8366.040           510
1  1995         1    10    22053      15021   5798.106           386
2  1996         1    10    27955      18807   8068.203           429
3  1997         1    10    31203      23278  11429.498           491
4  1998         1    10    34200      23519   1481.697            63


In [4]:
# # get datetime from 3 hour data set
# all_year        = pd.DataFrame(all_datetime.dt.year)
# all_month       = pd.DataFrame(all_datetime.dt.month)
# all_day         = pd.DataFrame(all_datetime.dt.day)

# all_year.columns    = ['year']
# all_month.columns   = ['month']
# all_day.columns     = ['day']
# # concat to existing dataframe
# rain_df = pd.concat([all_year, all_month, all_day, rain_df], axis = 1)
# humid_df = pd.concat([all_year, all_month, all_day, humid_df], axis = 1)
# temp_df = pd.concat([all_year, all_month, all_day, temp_df], axis = 1)

# # clean text element and save in pickle form

# rain_df_clean = clean_text_to_number(rain_df)
# with open('rain_df_clean.pickle', 'wb') as f:
#     pickle.dump(rain_df_clean, f)
# humid_df_clean = clean_text_to_number(humid_df)
# with open('humid_df_clean.pickle', 'wb') as f:
#     pickle.dump(humid_df_clean, f)
# temp_df_clean = clean_text_to_number(temp_df)
# with open('temp_df_clean.pickle', 'wb') as f:
#     pickle.dump(temp_df_clean, f)

In [5]:
print(rain_df.head(5))
print(humid_df.head(5))
print(temp_df.head(5))
print(area_df.head(5))
print(lychee_yield_df.head(5))

1    4    7   10 13    16    19   22 total
0    0    0    0    0  0     0     0  1.2   1.2
1  4.4  0.2    5  2.6  0  17.4     0    0  29.6
2    0    0    0    0  0     0     0    0     -
3    0    0    0    0  0     0     0    0     -
4    0  0.6  0.7  2.5  2   7.3  15.1  1.4  29.6
    1   4   7  10  13  16  19  22 mean
0  93  95  95  87  75  73  89  93   88
1  94  95  94  94  87  91  94  95   93
2  95  95  95  87  70  60  92  94   86
3  96  95  94  81  70  62  81  85   83
4  88  93  93  91  85  95  95  95   92
      1     4     7    10    13    16    19    22  mean
0  19.2  18.5  18.2    21  23.5  24.3  22.1  21.3    21
1    21  20.9  20.8  21.2  23.5  22.5  21.2  20.5  21.5
2  19.5    20  19.5  21.3  24.6  27.5  22.2  19.5  21.8
3  19.4  19.6  19.8  21.7  24.2  26.3  23.4    22  22.1
4  21.1  20.5  20.3    20    21  19.5  19.3  19.5  20.2
   year  district  code  allarea  yieldarea      yield  yieldperarea
0  1994         1    10    24562      16404   8366.040           510
1  1995  

<h1> Load Data from pickle </h1> since clean dataframe take very long time. So after we clean it, we save dataframe in pickle form.

In [6]:
with open('rain_df_clean.pickle', 'rb') as f:
    rain_df_clean = pickle.load(f)
with open('humid_df_clean.pickle', 'rb') as f:
    humid_df_clean = pickle.load(f)
with open('temp_df_clean.pickle', 'rb') as f:
    temp_df_clean = pickle.load(f)

In [7]:
def groupby_col(self, col):
    '''
    return 
        1 keys 
        2 dictionary of sub_df ที่ keys คือแต่ละ element ใน col name
    '''

    output_dic = {}
    all_ele = sorted(list(set(self[col])))

    for ele in all_ele:
        sub_df = self.loc[self[col] == ele, :]
        output_dic[ele] = sub_df

    return all_ele, output_dic

def groupby_mean(self, col):
    '''
    return 
        1 keys 
        2 dictionary of sub_df ที่ keys คือแต่ละ element ใน col name
    '''

    output_dic = {}
    all_ele = sorted(list(set(self[col])))

    for ele in all_ele:
        sub_df = self.loc[self[col] == ele, :].mean(axis=0)
        output_dic[ele] = sub_df
    
    return all_ele, output_dic

def groupby_max(self, col):
    '''
    return 
        1 keys 
        2 dictionary of sub_df ที่ keys คือแต่ละ element ใน col name
    '''

    output_dic = {}
    all_ele = sorted(list(set(self[col])))

    for ele in all_ele:
        sub_df = self.loc[self[col] == ele, :].max(axis = 0)
        output_dic[ele] = sub_df
    
    return all_ele, output_dic

def groupby_min(self, col):
    '''
    return 
        1 keys 
        2 dictionary of sub_df ที่ keys คือแต่ละ element ใน col name
    '''

    output_dic = {}
    all_ele = sorted(list(set(self[col])))

    for ele in all_ele:
        sub_df = self.loc[self[col] == ele, :].min(axis = 0)
        output_dic[ele] = sub_df
    
    return all_ele, output_dic

def groupby_sum(self, col):
    '''
    return 
        1 keys 
        2 dictionary of sub_df ที่ keys คือแต่ละ element ใน col name
    '''

    output_dic = {}
    all_ele = sorted(list(set(self[col])))

    for ele in all_ele:
        sub_df = self.loc[self[col] == ele, :].sum(axis = 0)
        output_dic[ele] = sub_df
    
    return all_ele, output_dic

# set method to class
setattr(pd.core.frame.DataFrame, 'groupby_col', groupby_col)
setattr(pd.core.frame.DataFrame, 'groupby_mean', groupby_mean)
setattr(pd.core.frame.DataFrame, 'groupby_sum', groupby_sum)
setattr(pd.core.frame.DataFrame, 'groupby_max', groupby_max)
setattr(pd.core.frame.DataFrame, 'groupby_min', groupby_min)

<h1> Year Month Temp Humid Rain Yield Dayinmonth Area</h1>

In [8]:
## Temp 2004 - 2019
monthly_temp = np.array([1,1,1]).reshape(1,3)

all_year, sub_year_dic = temp_df_clean.groupby_col('year')

for year in all_year:
        all_month, sub_month_dic = sub_year_dic[year].groupby_mean('month')

        for month in all_month:
                # record year month and mean temp of each month
                daily_temp = np.array([year, month, sub_month_dic[month].iloc[-1]]).reshape(1,3)
                monthly_temp = np.append(monthly_temp, daily_temp, axis = 0)

monthly_temp = np.delete(monthly_temp, 0, axis = 0)
print(monthly_temp.shape)

(200, 3)


In [9]:
## Humid 2004 - 2019
monthly_humid = np.array([1,1,1]).reshape(1,3)

all_year, sub_year_dic = humid_df_clean.groupby_col('year')

for year in all_year:
        all_month, sub_month_dic = sub_year_dic[year].groupby_mean('month')

        for month in all_month:
                # record year month and mean temp of each month
                daily_humid = np.array([year, month, sub_month_dic[month].iloc[-1]]).reshape(1,3)
                monthly_humid = np.append(monthly_humid, daily_humid, axis = 0)

monthly_humid = np.delete(monthly_humid, 0, axis = 0)
print(monthly_humid.shape)

(200, 3)


In [10]:
## Rain 2004 - 2019
monthly_rain = np.array([1,1,1]).reshape(1,3)

all_year, sub_year_dic = rain_df_clean.groupby_col('year')

for year in all_year:
        all_month, sub_month_dic = sub_year_dic[year].groupby_mean('month')

        for month in all_month:
                # record year month and mean temp of each month
                daily_rain = np.array([year, month, sub_month_dic[month].iloc[-1]]).reshape(1,3)
                monthly_rain = np.append(monthly_rain, daily_rain, axis = 0)

monthly_rain = np.delete(monthly_rain, 0, axis = 0)
print(monthly_rain.shape)

(200, 3)


In [11]:
## Lychee yield 2004 - 2018
monthly_lychee = np.array([1,1,1]).reshape(1,3)

all_year, _ = rain_df_clean.groupby_col('year')

for year in all_year:
        all_month, _ = sub_year_dic[year].groupby_mean('month')

        for month in all_month:
                # record year month and lychee yield of each month
                monthly_lychee_temp = lychee_yield_df.loc[lychee_yield_df['year']==year,:]
                if monthly_lychee_temp.size != 0:
                    monthly_yield = np.array([year, month, monthly_lychee_temp.iloc[0, month-1]]).reshape(1,3)
                    monthly_lychee = np.append(monthly_lychee, monthly_yield, axis = 0)

monthly_lychee = np.delete(monthly_lychee, 0, axis = 0)
print(monthly_lychee.shape)

(180, 3)


In [12]:
## area yield 1994 - 2018
monthly_area = np.array([1,1,1]).reshape(1,3)

all_year, _ = rain_df_clean.groupby_col('year')

for year in all_year:
        all_month, _ = sub_year_dic[year].groupby_mean('month')

        for month in all_month:
                # record year month and lychee yield of each month
                monthly_area_temp = area_df.loc[area_df['year']==year,:]
                if monthly_area_temp.size != 0:
                    monthly_area_temp = np.array([year, month, monthly_area_temp.iloc[0, -1]]).reshape(1,3)
                    monthly_area = np.append(monthly_area, monthly_area_temp, axis = 0)

monthly_area = np.delete(monthly_area, 0, axis = 0)
print(monthly_area.shape)

(192, 3)


In [13]:
## month range 2004 - 2019
monthly_numday = np.array([1,1,1]).reshape(1,3)

all_year, _ = rain_df_clean.groupby_col('year')

for year in all_year:
        all_month, _ = sub_year_dic[year].groupby_mean('month')

        for month in all_month:
                # record year month and lychee yield of each month
                monthly_numday_temp = np.array([year, month, monthrange(year, month)[1]]).reshape(1,3)
                monthly_numday = np.append(monthly_numday,monthly_numday_temp, axis = 0)

monthly_numday = np.delete(monthly_numday, 0, axis = 0)
print(monthly_numday.shape)

(200, 3)


Intersect year of all data is 2004 - 2018

<h1> Method for getting X data </h1>

In [111]:
def rmse(y_true, y_pred):

    ind_ignorezero = (y_true != 0).reshape(-1,)
    error = (y_true - y_pred)
    se = error**2
    mse = np.mean(se)
    rmse = mse**0.5
    return rmse

In [15]:
def get_info_from(info, year_fr, month_fr, year_to, month_to):
    '''
    extract info since year and month from np array input
    but the first 2 row have to be year and month respectively
    '''

    year2month = info[:, 0]*12 + info[:,1]
    target_year2month = year_fr*12 + month_fr
    target_year2month_to = year_to*12 + month_to

    return info[np.logical_and(year2month >= target_year2month, year2month <= target_year2month_to)]

In [16]:
def get_X_thr(info, year_fr, month_fr, year_to, month_to):
    # input
    all_year = [i for i in range(2004, 2017)]
    # temp as a input
    X_temp_1 = get_info_from(info, year_fr, month_fr, year_to, month_to-1)
    X_temp_2 = get_info_from(info, year_fr+1, (month_fr+1)%12, year_to, month_to)  

    
    # only consider month 12, 1, 2, 3
    for i in [3, 4, 5, 6, 7, 8, 9, 10, 11]:
        X_temp_1 = X_temp_1[X_temp_1[:, 1] != i]
    for i in [ 4, 5, 6, 7, 8, 9, 10, 11, 12]:
        X_temp_2 = X_temp_2[X_temp_2[:, 1] != i]
    
    # get rid of year and month
    X_temp_1 = np.append(X_temp_1, X_temp_2[:, -1].reshape(-1,1), axis = 1)
    
    return X_temp_1

In [17]:
def get_X_area(info, year_fr, month_fr, year_to, month_to):
    # input
    all_year = [i for i in range(2004, 2017)]

    X_temp_1 = get_info_from(info, year_fr, month_fr, year_to, month_to-1)

    # only consider month 12, 1, 2, 3
    for i in [4, 5, 6, 7, 8, 9, 10, 11, 12]:
        X_temp_1 = X_temp_1[X_temp_1[:, 1] != i]    
    
    return X_temp_1

In [18]:
def get_X_dayinmonth(info, year_fr, month_fr, year_to, month_to):
    # input
    all_year = [i for i in range(2004, 2017)]

    X_temp_1 = get_info_from(info, year_fr, month_fr, year_to, month_to-1)

    # only consider month 12, 1, 2, 3
    for i in [1, 2, 3, 7, 8, 9, 10, 11, 12]:
        X_temp_1 = X_temp_1[X_temp_1[:, 1] != i]    
    
    return X_temp_1

In [19]:
def mae(y_true, y_pred):

    error = y_true - y_pred
    ae = np.abs(error)
    mae = np.mean(ae)

    return mae

Get y

In [20]:
Y_all = get_X_dayinmonth(monthly_lychee, 2005,1, 2018,12)

Y = Y_all[:, -1]
print(Y.shape)

(42,)


<h1> Feature selection </h1>
Year (of the target), Month (of the target), Avg.Temp (3 and 4 month before), Avg.Humid (3 and 4 month before),
Rain amount (3 and 4 month before), lychee yield per area (1 year before), number of the day in target month,
lychee yield (of target month but 1 year before)

In [26]:
X_monthly_temp  = get_X_thr(monthly_temp, 2004, 12, 2018, 3)
X_monthly_humid = get_X_thr(monthly_humid, 2004, 12, 2018, 3)
X_monthly_rain  = get_X_thr(monthly_rain, 2004, 12, 2018, 3)

X_monthly_area   = get_X_area(monthly_area, 2004,1, 2017,12)
X_monthly_month_range = get_X_dayinmonth(monthly_numday, 2004,1, 2017,12)
X_monthly_yield = get_X_dayinmonth(monthly_lychee, 2004,1, 2017,12)

print(X_monthly_temp.shape)
print(X_monthly_humid.shape)
print(X_monthly_rain.shape)
print(X_monthly_area.shape)
print(X_monthly_month_range.shape)
print(X_monthly_yield.shape)

# X
# Year (of the target), Month (of the target), Avg.Temp (3 and 4 month before), Avg.Humid (3 and 4 month before),
# Rain amount (3 and 4 month before), lychee yield per area (1 year before), 
# lychee yield (of target month but 1 year before)
X = np.concatenate([Y_all[:, 0:2], X_monthly_temp[:, 2:], X_monthly_humid[:,2:], X_monthly_rain[:, 2:], X_monthly_area[:, 2:], X_monthly_yield[:, 2:]], axis = 1)

print(X.shape)

(42, 4)
(42, 4)
(42, 4)
(42, 3)
(42, 3)
(42, 3)
(42, 10)


Seperate train set and test set

In [27]:
X_train, X_test, Y_train, Y_test =  train_test_split(X, Y, test_size=0.3)
# sss = StratifiedShuffleSplit(n_splits=5, test_size=0.27)
print('train size')
print(X_train.shape)
print('test size')
print(X_test.shape)

train size
(29, 10)
test size
(13, 10)


<h1> Ridge Model w/o Normalization </h1>

In [114]:
# vector for record error
train_mae = np.array([])
test_mae = np.array([])

train_rmse = np.array([])
test_rmse = np.array([])

# minmaxscaler

for i in range(100):

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

    # scale data
    reg_lr = Ridge(alpha = 0.5)
    reg_lr.fit(X_train, Y_train)

    Y_test_pred = reg_lr.predict(X_test)
    Y_test_pred[Y_test_pred<0] = 0

    Y_train_pred = reg_lr.predict(X_train)
    Y_train_pred[Y_train_pred<0] = 0

    train_mae = np.append(train_mae, mae(Y_train, Y_train_pred))
    test_mae = np.append(test_mae, mae(Y_test, Y_test_pred))

    train_rmse = np.append(train_rmse, rmse(Y_train, Y_train_pred))
    test_rmse = np.append(test_rmse, rmse(Y_test, Y_test_pred))    

print('Train set MAE {:.2f}'.format(np.mean(train_mae)))
print('Test set MAE {:.2f}'.format(np.mean(test_mae)))

print('Train set RMSE {:.2f}'.format(np.mean(train_rmse)))
print('Test set RMSE {:.2f}'.format(np.mean(test_rmse)))

Train set MAE 2233.27
Test set MAE 3469.76
Train set RMSE 3027.22
Test set RMSE 4866.62


<h1> Ridge Model w Normalization </h1>

In [124]:
# vector for record error
train_mae = np.array([])
test_mae = np.array([])

train_rmse = np.array([])
test_rmse = np.array([])

# minmaxscaler
scaler = MinMaxScaler()

for i in range(100):

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

    # scale data
    scaler.fit(X_train)
    # reg = LinearRegression()
    # reg_lr_norm = Lasso(alpha = 0.5)
    reg_lr_norm = Ridge(alpha = 0.5)
    reg_lr_norm.fit(scaler.transform(X_train), Y_train)

    Y_test_pred = reg_lr_norm.predict(scaler.transform(X_test))
    Y_test_pred[Y_test_pred<0] = 0

    Y_train_pred = reg_lr_norm.predict(scaler.transform(X_train))
    Y_train_pred[Y_train_pred<0] = 0

    train_mae = np.append(train_mae, mae(Y_train, Y_train_pred))
    test_mae = np.append(test_mae, mae(Y_test, Y_test_pred))

    train_rmse = np.append(train_rmse, rmse(Y_train, Y_train_pred))
    test_rmse = np.append(test_rmse, rmse(Y_test, Y_test_pred))    

print('Train set MAE {:.2f}'.format(np.mean(train_mae)))
print('Test set MAE {:.2f}'.format(np.mean(test_mae)))

print('Train set RMSE {:.2f}'.format(np.mean(train_rmse)))
print('Test set RMSE {:.2f}'.format(np.mean(test_rmse)))

Train set MAE 2361.28
Test set MAE 3319.21
Train set RMSE 3234.11
Test set RMSE 4551.77


In [125]:
# show coef of each feature
feature_list = ['Year (of the target)',' Month (of the target)', 'Avg.Temp (3 and 4 month before)',' Avg.Humid (3 and 4 month before)', 'Rain amount (3 and 4 month before)', 'lychee yield per area (1 year before)','lychee yield (of target month but 1 year before)']
for coef, feature in zip(reg_lr_norm.coef_, feature_list):
    print('lambda {:.2f} feature : {:s}'.format(coef, feature))

lambda -2835.61 feature : Year (of the target)
lambda 3929.55 feature :  Month (of the target)
lambda -7367.51 feature : Avg.Temp (3 and 4 month before)
lambda -3908.33 feature :  Avg.Humid (3 and 4 month before)
lambda 112.08 feature : Rain amount (3 and 4 month before)
lambda -1146.56 feature : lychee yield per area (1 year before)
lambda -24.64 feature : lychee yield (of target month but 1 year before)


<h1> Support Vector Regression w/o Normalization </h1>

In [119]:
# vector for record error
train_mae = np.array([])
test_mae = np.array([])

train_rmse = np.array([])
test_rmse = np.array([])

# minmaxscaler

for i in range(50):

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

    # scale data
    reg_svr = SVR(kernel = 'linear' ,gamma = 'auto', C=1)
    reg_svr.fit(X_train, Y_train)

    Y_test_pred = reg_svr.predict(X_test)
    Y_test_pred[Y_test_pred<0] = 0

    Y_train_pred = reg_svr.predict(X_train)
    Y_train_pred[Y_train_pred<0] = 0

    train_mae = np.append(train_mae, mae(Y_train, Y_train_pred))
    test_mae = np.append(test_mae, mae(Y_test, Y_test_pred))

    train_rmse = np.append(train_rmse, rmse(Y_train, Y_train_pred))
    test_rmse = np.append(test_rmse, rmse(Y_test, Y_test_pred))    

print('Train set MAE {:.2f}'.format(np.mean(train_mae)))
print('Test set MAE {:.2f}'.format(np.mean(test_mae)))

print('Train set RMSE {:.2f}'.format(np.mean(train_rmse)))
print('Test set RMSE {:.2f}'.format(np.mean(test_rmse)))

Train set MAE 2307.27
Test set MAE 2663.03
Train set RMSE 4175.42
Test set RMSE 4479.70


In [120]:
Y_all_test = reg_svr.predict(X)
Y_all_test[Y_all_test<0] = 0
for year, month, y_true, y_pred in zip(Y_all[:,0], Y_all[:,1], Y, Y_all_test):
    print('year {:.0f} month {:.0f} true : {:.0f}\tpred : {:.0f}'.format(year, month, y_true, y_pred))

year 2005 month 4 true : 15463	pred : 852
year 2005 month 5 true : 2352	pred : 11779
year 2005 month 6 true : 3	pred : 4404
year 2006 month 4 true : 17044	pred : 12662
year 2006 month 5 true : 2592	pred : 2587
year 2006 month 6 true : 4	pred : 877
year 2007 month 4 true : 0	pred : 14035
year 2007 month 5 true : 16125	pred : 2968
year 2007 month 6 true : 3861	pred : 1066
year 2008 month 4 true : 21	pred : 544
year 2008 month 5 true : 13448	pred : 13133
year 2008 month 6 true : 586	pred : 3717
year 2009 month 4 true : 5430	pred : 641
year 2009 month 5 true : 11197	pred : 11188
year 2009 month 6 true : 1625	pred : 1260
year 2010 month 4 true : 0	pred : 4882
year 2010 month 5 true : 6476	pred : 9589
year 2010 month 6 true : 2775	pred : 2205
year 2011 month 4 true : 0	pred : 429
year 2011 month 5 true : 2867	pred : 5682
year 2011 month 6 true : 5236	pred : 2765
year 2012 month 4 true : 155	pred : 418
year 2012 month 5 true : 6486	pred : 2815
year 2012 month 6 true : 4683	pred : 4682
year 20

<h1> Support Vector Regression w Normalization </h1>

In [121]:
# vector for record error
train_mae = np.array([])
test_mae = np.array([])

train_rmse = np.array([])
test_rmse = np.array([])

# minmaxscaler
scaler = StandardScaler()

for i in range(50):

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

    # scale data
    scaler.fit(X_train)
    reg_svr_norm = SVR(kernel = 'poly' ,gamma = 'scale', C=1)
    reg_svr_norm.fit(scaler.transform(X_train), Y_train)

    Y_test_pred = reg_svr_norm.predict(scaler.transform(X_test))
    Y_test_pred[Y_test_pred<0] = 0

    Y_train_pred = reg_svr_norm.predict(scaler.transform(X_train))
    Y_train_pred[Y_train_pred<0] = 0

    train_mae = np.append(train_mae, mae(Y_train, Y_train_pred))
    test_mae = np.append(test_mae, mae(Y_test, Y_test_pred))

    train_rmse = np.append(train_rmse, rmse(Y_train, Y_train_pred))
    test_rmse = np.append(test_rmse, rmse(Y_test, Y_test_pred))    

print('Train set MAE {:.2f}'.format(np.mean(train_mae)))
print('Test set MAE {:.2f}'.format(np.mean(test_mae)))

print('Train set RMSE {:.2f}'.format(np.mean(train_rmse)))
print('Test set RMSE {:.2f}'.format(np.mean(test_rmse)))

Train set MAE 3163.94
Test set MAE 3500.16
Train set RMSE 4775.26
Test set RMSE 4881.38


In [122]:
Y_all_test = reg_svr.predict(X)
Y_all_test[Y_all_test<0] = 0
for year, month, y_true, y_pred in zip(Y_all[:,0], Y_all[:,1], Y, Y_all_test):
    print('year {:.0f} month {:.0f} true : {:.0f}\tpred : {:.0f}'.format(year, month, y_true, y_pred))

year 2005 month 4 true : 15463	pred : 852
year 2005 month 5 true : 2352	pred : 11779
year 2005 month 6 true : 3	pred : 4404
year 2006 month 4 true : 17044	pred : 12662
year 2006 month 5 true : 2592	pred : 2587
year 2006 month 6 true : 4	pred : 877
year 2007 month 4 true : 0	pred : 14035
year 2007 month 5 true : 16125	pred : 2968
year 2007 month 6 true : 3861	pred : 1066
year 2008 month 4 true : 21	pred : 544
year 2008 month 5 true : 13448	pred : 13133
year 2008 month 6 true : 586	pred : 3717
year 2009 month 4 true : 5430	pred : 641
year 2009 month 5 true : 11197	pred : 11188
year 2009 month 6 true : 1625	pred : 1260
year 2010 month 4 true : 0	pred : 4882
year 2010 month 5 true : 6476	pred : 9589
year 2010 month 6 true : 2775	pred : 2205
year 2011 month 4 true : 0	pred : 429
year 2011 month 5 true : 2867	pred : 5682
year 2011 month 6 true : 5236	pred : 2765
year 2012 month 4 true : 155	pred : 418
year 2012 month 5 true : 6486	pred : 2815
year 2012 month 6 true : 4683	pred : 4682
year 20