In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from time import time
import matplotlib.pyplot as plt

In [2]:
model = SVR(kernel='rbf',C=1,epsilon=0.1)

In [3]:
def return_data(fold,month,with_scaling):
    train_input = pd.read_csv('../data/time_feature'+'/fold'+str(fold)+'/train_data_'+month+'.csv.gz')
    test_input = pd.read_csv('../data/time_feature'+'/fold'+str(fold)+'/test_data_'+month+'.csv.gz')
    test_output = np.array(test_input['PM25_Concentration'])
    train_output = np.array(train_input['PM25_Concentration'])
    train_input= train_input.drop(['station_id','PM25_Concentration','time','filled'],axis=1)
    try:
        test_input= test_input.drop(['PM25_Concentration','station_id','time','filled'],axis=1)
    except:
        test_input= test_input.drop(['station_id','time','filled'],axis=1)
#     test_output= test_output.drop(['time'],axis=1)
    if with_scaling:
        scaler = MinMaxScaler().fit(train_input)
        train_input = pd.DataFrame(scaler.transform(train_input),columns=list(train_input.columns))
        test_input = pd.DataFrame(scaler.transform(test_input),columns=list(test_input.columns))
    return train_input,train_output,test_input,test_output

def run_model(model,train_input,train_output,test_input,test_output,ret_output):
    model.fit(np.array(train_input), train_output)
    test_pred = model.predict(np.array(test_input))
    err = mean_squared_error(test_pred, test_output, squared=False)
    if ret_output:
        return err,test_pred
    else:
        return err

In [4]:
for fold in [0,1,2]:
    train_input,train_output,test_input,test_output = return_data(fold=fold,month='mar',with_scaling=True)
#     train_input = train_input.loc[:,['delta_t','humidity']]
#     test_input = test_input.loc[:,['delta_t','humidity']]
    print("Fold: ",fold)
    print("Data received")
    init = time()
    rmse,test_pred = run_model(model,train_input,train_output,test_input,test_output,True)
    print("RMSE: ",rmse)
    print("Time taken: ",time()-init)

Fold:  0
Data received
RMSE:  57.75882365586619
Time taken:  19.846774578094482
Fold:  1
Data received
RMSE:  49.872698111040734
Time taken:  20.010626077651978
Fold:  2
Data received
RMSE:  56.40413890083982
Time taken:  19.454524040222168


In [9]:
# kernel = ['rbf']
# degree = [2,3,5]
# gamma = ['scale','auto']
C = [0.1,0.5,1,2,3]
epsilon = [0.1,0.5,1]

random_grid = {'C': C,
               'epsilon': epsilon}

model = GridSearchCV(SVR(),random_grid)

In [10]:
for fold in [0]:
    train_input,train_output,test_input,test_output = return_data(fold=fold,month='mar',with_scaling=True)
#     train_input = train_input.loc[:,['delta_t','humidity']]
#     test_input = test_input.loc[:,['delta_t','humidity']]
    print("Fold: ",fold)
    print("Data received")
    init = time()
    rmse,test_pred = run_model(model,train_input,train_output,test_input,test_output,True)
    print("RMSE: ",rmse)
    print("Time taken: ",time()-init)
    model.get_params()

Fold:  0
Data received
RMSE:  54.59397595237452
Time taken:  794.7578637599945


In [11]:
model.get_params()

{'cv': None,
 'error_score': nan,
 'estimator__C': 1.0,
 'estimator__cache_size': 200,
 'estimator__coef0': 0.0,
 'estimator__degree': 3,
 'estimator__epsilon': 0.1,
 'estimator__gamma': 'scale',
 'estimator__kernel': 'rbf',
 'estimator__max_iter': -1,
 'estimator__shrinking': True,
 'estimator__tol': 0.001,
 'estimator__verbose': False,
 'estimator': SVR(),
 'n_jobs': None,
 'param_grid': {'C': [0.1, 0.5, 1, 2, 3], 'epsilon': [0.1, 0.5, 1]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': None,
 'verbose': 0}

In [39]:
model = LinearRegression()

In [40]:
for fold in [0,1,2]:
    train_input,train_output,test_input,test_output = return_data(fold=fold,month='mar',with_scaling=True)
#     train_input = train_input.loc[:,['delta_t','humidity']]
#     test_input = test_input.loc[:,['delta_t','humidity']]
    print("Fold: ",fold)
    print("Data received")
    init = time()
    rmse,test_pred = run_model(model,train_input,train_output,test_input,test_output,True)
    print("RMSE: ",rmse)
    print("Time taken: ",time()-init)

Fold:  0
Data received
RMSE:  54.64386656662314
Time taken:  0.015205144882202148
Fold:  1
Data received
RMSE:  49.1655519496443
Time taken:  0.01169586181640625
Fold:  2
Data received
RMSE:  53.17154581782867
Time taken:  0.011575698852539062
