In [1]:
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import warnings
from IPython.display import clear_output
from multiprocessing import Pool
from time import time
warnings.filterwarnings('ignore')

In [2]:
# Sliding-window settings shared by the feature builders below:
#   time_window - number of consecutive time steps packed into one feature row
#   max_time    - exclusive upper bound on the window end index, so windows end
#                 at indices time_window .. max_time - 1
time_window, max_time = 24, 28

In [31]:
def get_train_features(train_data, window=None, horizon=None, timestamps=None):
    """Build a flat sliding-window feature table for the training stations.

    For every window end ``i`` in ``range(window, horizon)`` and every station
    (in order of first appearance in ``train_data``) one row is produced. The
    row holds, for each time-varying column, the values found at
    ``timestamps[i - window:i]``, followed by the station's latitude and
    longitude read at ``timestamps[0]``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame indexed by timestamp with ``station_id``, ``latitude`` and
        ``longitude`` columns; every other column is treated as time-varying.
        Assumes each station has exactly one row per timestamp -- TODO confirm.
    window : int, optional
        Number of time steps per window. Defaults to the notebook-level
        ``time_window``.
    horizon : int, optional
        Exclusive upper bound on the window end index. Defaults to the
        notebook-level ``max_time``.
    timestamps : index-like, optional
        Ordered timestamps used to slice the windows. Defaults to the
        notebook-level ``time_range``.

    Returns
    -------
    pandas.DataFrame
        One row per (window end, station), ordered by window end and then by
        station. Columns are ``<col>_<k>t`` for ``k`` in ``range(window)`` for
        each time-varying column, then ``latitude`` and ``longitude``.

    Raises
    ------
    KeyError
        If a station has no row for one of the requested timestamps.
    """
    # Fall back to the notebook-level settings so existing one-argument calls
    # keep working unchanged.
    window = time_window if window is None else window
    horizon = max_time if horizon is None else horizon
    timestamps = time_range if timestamps is None else timestamps

    static_cols = ['latitude', 'longitude']
    reserved = ['station_id'] + static_cols
    time_varying_cols = [c for c in train_data.columns if c not in reserved]

    # Filter each station once instead of once per (column, window) pair.
    station_frames = [
        train_data[train_data.station_id == station]
        for station in train_data.station_id.unique()
    ]
    window_ends = range(window, horizon)

    # DataFrame.append was removed in pandas 2.0 and is quadratic when used in
    # a loop; collect plain row dicts and build each frame in one go instead.
    blocks = []
    for col in time_varying_cols:
        names = [col + "_" + str(step) + "t" for step in range(window)]
        rows = [
            dict(zip(names, frame.loc[timestamps[end - window:end], col]))
            for end in window_ends
            for frame in station_frames
        ]
        blocks.append(pd.DataFrame(rows, columns=names))

    # Station coordinates, repeated once per window so that the rows line up
    # with the (window end, station) ordering used above.
    static_rows = [
        {col: frame.loc[timestamps[0], col] for col in static_cols}
        for _ in window_ends
        for frame in station_frames
    ]
    blocks.append(pd.DataFrame(static_rows, columns=static_cols))

    return pd.concat(blocks, axis=1)

In [35]:
def get_test_features(test_data, window=None, horizon=None, timestamps=None):
    """Build sliding-window features and PM2.5 targets for the test stations.

    For every window end ``i`` in ``range(window, horizon)`` and every station
    (in order of first appearance in ``test_data``) one feature row and one
    target value are produced. The feature row holds, for each time-varying
    column except ``PM25_Concentration``, the values found at
    ``timestamps[i - window:i]``, followed by the station's latitude and
    longitude read at ``timestamps[0]``. The target is the station's
    ``PM25_Concentration`` at ``timestamps[i - 1]`` (the last step of the
    window).

    Parameters
    ----------
    test_data : pandas.DataFrame
        Frame indexed by timestamp with ``station_id``, ``latitude``,
        ``longitude`` and ``PM25_Concentration`` columns; every other column
        is treated as a time-varying input.
        Assumes each station has exactly one row per timestamp -- TODO confirm.
    window : int, optional
        Number of time steps per window. Defaults to the notebook-level
        ``time_window``.
    horizon : int, optional
        Exclusive upper bound on the window end index. Defaults to the
        notebook-level ``max_time``.
    timestamps : index-like, optional
        Ordered timestamps used to slice the windows. Defaults to the
        notebook-level ``time_range`` (which the notebook derives from the
        training frame's index).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(features, targets)``. ``features`` has columns ``<col>_<k>t`` for
        ``k`` in ``range(window)`` for each input column, then ``latitude``
        and ``longitude``. ``targets`` has a single column labelled ``0``.
        Both are ordered by window end and then by station.

    Raises
    ------
    KeyError
        If a station has no row for one of the requested timestamps.
    """
    # Fall back to the notebook-level settings so existing one-argument calls
    # keep working unchanged.
    window = time_window if window is None else window
    horizon = max_time if horizon is None else horizon
    timestamps = time_range if timestamps is None else timestamps

    target_col = 'PM25_Concentration'
    static_cols = ['latitude', 'longitude']
    reserved = ['station_id', target_col] + static_cols
    time_varying_cols = [c for c in test_data.columns if c not in reserved]

    # Filter each station once instead of once per (column, window) pair.
    station_frames = [
        test_data[test_data.station_id == station]
        for station in test_data.station_id.unique()
    ]
    window_ends = range(window, horizon)

    # DataFrame.append was removed in pandas 2.0 and is quadratic when used in
    # a loop; collect plain row dicts and build each frame in one go instead.
    blocks = []
    for col in time_varying_cols:
        names = [col + "_" + str(step) + "t" for step in range(window)]
        rows = [
            dict(zip(names, frame.loc[timestamps[end - window:end], col]))
            for end in window_ends
            for frame in station_frames
        ]
        blocks.append(pd.DataFrame(rows, columns=names))

    # Station coordinates, repeated once per window so that the rows line up
    # with the (window end, station) ordering used above.
    static_rows = [
        {col: frame.loc[timestamps[0], col] for col in static_cols}
        for _ in window_ends
        for frame in station_frames
    ]
    blocks.append(pd.DataFrame(static_rows, columns=static_cols))

    # The targets do not depend on the static columns, so build them once
    # rather than once per static column.
    targets = [
        frame.loc[timestamps[end - 1], target_col]
        for end in window_ends
        for frame in station_frames
    ]
    test_output = pd.DataFrame(targets)

    return pd.concat(blocks, axis=1), test_output

### Output: rows = 20 stations × (`max_time` − `time_window`); columns = 2 (latitude, longitude) + 24 × 30 (`time_window` steps × time-varying features)

In [36]:
# One fold per run: looping over all folds in a single cell ran out of RAM.
fold = 0
print('fold', fold)

# Training split: parse the timestamps, index by them, order by (time, station).
train_data = (
    pd.read_csv('../data/processed/fold_' + str(fold) + '_train.csv.gz')
    .assign(time=lambda raw: pd.to_datetime(raw['time']))
    .set_index('time')
    .sort_values(['time', 'station_id'])
)

# Test split: identical preparation.
test_data = (
    pd.read_csv('../data/processed/fold_' + str(fold) + '_test.csv.gz')
    .assign(time=lambda raw: pd.to_datetime(raw['time']))
    .set_index('time')
    .sort_values(['time', 'station_id'])
)

# Ordered unique timestamps of the training split; both feature builders read
# this notebook-level name to slice their windows.
time_range = train_data.index.unique()

train_input = get_train_features(train_data)
test_input, test_output = get_test_features(test_data)

fold 0


In [37]:
# Show the test-fold feature table and its PM25_Concentration target column.
display(test_input,test_output)

Unnamed: 0,temperature_0t,temperature_1t,temperature_2t,temperature_3t,temperature_4t,temperature_5t,temperature_6t,temperature_7t,temperature_8t,temperature_9t,...,wind_direction_24.0_16t,wind_direction_24.0_17t,wind_direction_24.0_18t,wind_direction_24.0_19t,wind_direction_24.0_20t,wind_direction_24.0_21t,wind_direction_24.0_22t,wind_direction_24.0_23t,latitude,longitude
0,20.0,18.0,18.0,17.0,17.0,16.0,16.0,17.0,18.0,20.0,...,0,1,1,1,1,1,1,1,39.914409,116.184239
1,20.0,18.0,18.0,17.0,17.0,16.0,16.0,17.0,18.0,20.0,...,0,1,1,1,1,1,1,1,39.987313,116.287451
2,20.0,18.0,18.0,17.0,17.0,16.0,16.0,17.0,18.0,20.0,...,0,1,1,1,1,1,1,1,39.937119,116.460742
3,20.0,18.0,18.0,17.0,17.0,16.0,16.0,17.0,18.0,20.0,...,0,1,1,1,1,1,1,1,39.939554,116.483746
4,19.0,19.0,19.0,19.0,18.0,17.0,17.0,17.0,19.0,21.0,...,0,1,1,1,1,1,1,1,39.718147,116.406155
5,19.0,19.0,18.0,18.0,17.0,16.0,16.0,18.0,19.0,21.0,...,0,1,1,1,1,1,1,1,40.127,116.655
6,14.0,14.0,13.0,13.0,13.0,13.0,13.0,15.0,19.0,20.0,...,0,1,1,1,1,1,1,1,40.369999,116.831999
7,16.0,15.0,14.0,14.0,13.0,13.0,12.0,14.0,16.0,19.0,...,0,1,1,1,1,1,1,1,40.453,115.971999
8,16.0,15.0,14.0,14.0,13.0,13.0,12.0,14.0,16.0,19.0,...,0,1,1,1,1,1,1,1,40.365,115.988
9,16.0,16.0,15.0,15.0,15.0,15.0,15.0,16.0,19.0,21.0,...,0,1,1,1,1,1,1,1,40.1,117.12


Unnamed: 0,0
0,34.0
1,41.0
2,38.0
3,28.0
4,30.0
5,40.0
6,48.0
7,33.0
8,41.0
9,49.0


In [38]:
# Show the training-fold feature table (one row per window end and station).
display(train_input)

Unnamed: 0,PM25_Concentration_0t,PM25_Concentration_1t,PM25_Concentration_2t,PM25_Concentration_3t,PM25_Concentration_4t,PM25_Concentration_5t,PM25_Concentration_6t,PM25_Concentration_7t,PM25_Concentration_8t,PM25_Concentration_9t,...,wind_direction_24.0_16t,wind_direction_24.0_17t,wind_direction_24.0_18t,wind_direction_24.0_19t,wind_direction_24.0_20t,wind_direction_24.0_21t,wind_direction_24.0_22t,wind_direction_24.0_23t,latitude,longitude
0,138.0,124.0,127.0,129.0,119.0,127.0,124.0,120.0,123.0,147.0,...,0,1,1,1,1,1,1,1,40.090679,116.173553
1,89.0,85.0,88.0,100.0,109.0,120.0,127.0,114.0,121.0,133.0,...,0,1,1,1,1,1,1,1,40.003950,116.205310
2,98.0,107.0,115.0,123.0,129.0,138.0,146.0,144.0,151.0,168.0,...,0,1,1,1,1,1,1,1,39.815128,116.171150
3,109.0,101.0,102.0,108.0,115.0,123.0,142.0,146.0,130.0,158.0,...,0,1,1,1,1,1,1,1,39.742767,116.136045
4,88.0,105.0,114.0,118.0,130.0,140.0,142.0,150.0,139.0,158.0,...,0,1,1,1,1,1,1,1,39.982053,116.397400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,110.0,105.0,101.0,107.0,110.0,126.0,160.0,166.0,183.0,189.0,...,1,1,1,1,1,1,1,1,40.328000,116.628000
76,90.0,83.0,97.0,94.0,117.0,128.0,133.0,147.0,166.0,171.0,...,1,1,1,1,1,1,1,1,40.292000,116.220000
77,88.0,85.0,78.0,92.0,85.0,86.0,100.0,119.0,137.0,152.0,...,1,1,1,1,1,1,1,1,40.499000,116.911000
78,120.0,125.0,131.0,139.0,155.0,167.0,180.0,149.0,135.0,123.0,...,1,1,1,1,1,1,1,1,39.712000,116.783000
