In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from time import time
import matplotlib.pyplot as plt

### Time feature data

In [2]:
# Shared estimator that the cells below refit on each fold. The seed is pinned
# because gradient boosting permutes candidate features at every split, so
# without it repeated runs are not guaranteed to give the same scores.
model = GradientBoostingRegressor(random_state=42)

In [3]:
def return_data(fold,month,with_scaling,data_dir='../data/time_feature'):
    """Load one fold's train/test CSVs and split them into features and target.

    Parameters
    ----------
    fold : int or str
        Fold identifier; files are read from ``<data_dir>/fold<fold>/``.
    month : str
        Month tag used in the file names ``train_data_<month>.csv.gz`` and
        ``test_data_<month>.csv.gz``.
    with_scaling : bool
        When true, a ``MinMaxScaler`` is fitted on the training features only
        and applied to both splits. The returned feature sets are then NumPy
        arrays instead of DataFrames.
    data_dir : str, optional
        Directory containing the ``fold<k>`` sub-directories. Defaults to the
        location this function originally had hard-coded.

    Returns
    -------
    tuple
        ``(train_input, train_output, test_input, test_output)`` where the
        outputs are NumPy arrays of the ``PM25_Concentration`` column and the
        inputs are the remaining columns after the non-feature columns are
        dropped.

    Raises
    ------
    KeyError
        If either file lacks ``PM25_Concentration``, ``station_id``, ``time``
        or ``filled``.
    """
    target_col = 'PM25_Concentration'
    # Identifier / bookkeeping columns plus the target are not model features.
    non_feature_cols = ['station_id', target_col, 'time', 'filled']

    fold_dir = f'{data_dir}/fold{fold}'
    train_input = pd.read_csv(f'{fold_dir}/train_data_{month}.csv.gz')
    test_input = pd.read_csv(f'{fold_dir}/test_data_{month}.csv.gz')

    train_output = np.array(train_input[target_col])
    test_output = np.array(test_input[target_col])

    # The target has already been read from both frames above, so it is known
    # to be present; both frames can be stripped with the same column list and
    # no fallback path is needed.
    train_input = train_input.drop(non_feature_cols, axis=1)
    test_input = test_input.drop(non_feature_cols, axis=1)

    if with_scaling:
        # Fit on the training split only so no test statistics leak into it.
        scaler = MinMaxScaler().fit(train_input)
        train_input = scaler.transform(train_input)
        test_input = scaler.transform(test_input)
    return train_input,train_output,test_input,test_output

def run_model(model,train_input,train_output,test_input,test_output,ret_output):
    """Fit ``model`` on the training split and score it by RMSE on the test split.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    train_input, test_input : array-like
        Feature matrices; converted with ``np.array`` before being passed on.
    train_output : array-like
        Training targets, passed to ``fit`` unchanged.
    test_output : array-like
        True test targets used for scoring.
    ret_output : bool
        When true the predictions are returned alongside the error.

    Returns
    -------
    float or tuple
        The RMSE, or ``(rmse, test_pred)`` when ``ret_output`` is true.

    Raises
    ------
    ValueError
        If the predictions and the true targets do not have matching shapes.
    """
    model.fit(np.array(train_input), train_output)
    test_pred = model.predict(np.array(test_input))

    # RMSE is computed directly with NumPy: the ``squared=False`` flag of
    # ``mean_squared_error`` was deprecated in scikit-learn 1.4 and removed in
    # 1.6, so relying on it fails on current releases.
    y_true = np.asarray(test_output, dtype=float)
    y_pred = np.asarray(test_pred, dtype=float)
    # Treat 1-D targets as a single output column so (n,) and (n, 1) line up
    # instead of silently broadcasting to an (n, n) matrix.
    if y_true.ndim == 1:
        y_true = y_true.reshape(-1, 1)
    if y_pred.ndim == 1:
        y_pred = y_pred.reshape(-1, 1)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "test_output and predictions have inconsistent shapes: "
            f"{y_true.shape} vs {y_pred.shape}"
        )
    # Per-output RMSE, then a uniform average across outputs.
    err = np.mean(np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0)))

    if ret_output:
        return err,test_pred
    return err

### With scaling

In [4]:
for fold in range(3):
    # Load this fold's March split with min-max scaled features.
    train_input, train_output, test_input, test_output = return_data(
        fold=fold,
        month='mar',
        with_scaling=True,
    )
    print("Fold: ", fold)
    print("Data received")

    # Only the fit + predict step is timed; data loading is excluded.
    init = time()
    rmse, test_pred = run_model(
        model, train_input, train_output, test_input, test_output, True
    )
    print("RMSE: ", rmse)
    print("Time taken: ", time() - init)

    # Re-read the unprocessed test file so the predictions are stored next to
    # the original (unscaled, undropped) columns.
    test_input = pd.read_csv(f'../data/time_feature/fold{fold}/test_data_mar.csv.gz')
    test_input['prediction'] = test_pred
    # NOTE(review): the file is named "XGB" although the estimator used here is
    # scikit-learn's GradientBoostingRegressor; the name is kept as-is because
    # other code may read this path.
    test_input.to_csv(f'../data/results/results_mar/fold{fold}/XGB_scaled.csv.gz')

Fold:  0
Data received
RMSE:  34.06744919117983
Time taken:  1.358201265335083
Fold:  1
Data received
RMSE:  34.234847371424365
Time taken:  1.3401432037353516
Fold:  2
Data received
RMSE:  33.24908224425141
Time taken:  1.3460073471069336
