In [2]:
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error
import warnings
from IPython.display import clear_output
from multiprocessing import Pool
from time import time
warnings.filterwarnings('ignore')

In [3]:
# Number of consecutive timestamps in each sliding window: the feature builders
# below slice `time_range[i:i+time_window]` and split the stacked rows into this
# many time steps.
# NOTE(review): presumably hours (24 = one day) - confirm against the data's sampling rate.
time_window = 24

### Functions

In [17]:
def get_train_features(i, data=None, times=None, stations=None, window=None):
    """Build one sliding-window sample per training station.

    For the window of `window` consecutive timestamps starting at position `i`,
    every station in `stations` is treated in turn as the "local" station and
    all remaining rows of the window as its neighbouring stations.

    Parameters
    ----------
    i : int
        Start position of the window inside `times`.
    data : pandas.DataFrame, optional
        Frame indexed by timestamp, ordered by (time, station_id), with the
        columns 'station_id', 'longitude', 'latitude', 'filled',
        'PM25_Concentration' plus any further feature columns.
        Defaults to the notebook global `train_data`.
    times : index-like of timestamps, optional
        Unique timestamps to slice the window from. Defaults to `time_range`.
    stations : iterable, optional
        Station ids to emit samples for. Defaults to `train_stations`.
    window : int, optional
        Number of timestamps per window. Defaults to `time_window`.

    Returns
    -------
    list of numpy.ndarray
        With S = number of stations, N = neighbouring stations per timestamp,
        F = columns left after dropping ids/coordinates/'filled', W = window:
        [0] neighbour features, shape (S, N, F, W)
        [1] neighbour (longitude, latitude) minus local, shape (S, N, 2)
        [2] local features without PM2.5, shape (S, F-1, W)
        [3] local PM2.5 at the last timestamp of the window, shape (S, 1)
        [4] local station id, shape (S, 1)

    Raises
    ------
    ValueError
        If the neighbour rows cannot be split into `window` equal parts
        (i.e. the stations are not all present at every timestamp).
    IndexError
        If a requested station has no rows inside the window.
    """
    # Fall back to the notebook globals so that
    # `Pool.map(get_train_features, range(...))` keeps working unchanged.
    data = train_data if data is None else data
    times = time_range if times is None else times
    stations = train_stations if stations is None else stations
    window = time_window if window is None else window

    window_df = data.loc[times[i:i + window]]

    # Loop-invariant projections of the window, computed once instead of per station.
    id_values = window_df['station_id'].to_numpy()
    metaq_values = window_df.drop(
        columns=['station_id', 'longitude', 'latitude', 'filled']).values
    met_values = window_df.drop(
        columns=['station_id', 'longitude', 'latitude', 'PM25_Concentration', 'filled']).values
    pm25_values = window_df['PM25_Concentration'].values
    # One coordinate row per station (first occurrence inside the window).
    coords_df = window_df.drop_duplicates('station_id')
    coord_values = coords_df[['longitude', 'latitude']].values

    station_metaq_data = []
    station_dist_data = []
    local_met_data = []
    local_aq_data = []
    local_stations = []

    for station in stations:
        is_local = (window_df['station_id'] == station).to_numpy(dtype=bool)
        coord_is_local = (coords_df['station_id'] == station).to_numpy(dtype=bool)

        # Station side: rows are time-major, so splitting into `window` chunks
        # gives (W, N, F); the two swaps reorder that to (N, F, W).
        neighbours = metaq_values[~is_local]
        station_metaq_data.append(
            np.array(np.split(neighbours, window, axis=0))
            .swapaxes(0, 1).swapaxes(1, 2)[np.newaxis, :])

        # Local side. np.asarray makes the reshape work for string ids as well
        # (a plain Python str has no .reshape).
        local_stations.append(np.asarray(id_values[is_local][-1]).reshape(-1, 1))
        local_met_data.append(met_values[is_local].swapaxes(0, 1)[np.newaxis, :])
        local_aq_data.append(np.asarray(pm25_values[is_local][-1]).reshape(-1, 1))

        # Coordinate offsets of every neighbour relative to the local station.
        station_dist_data.append(
            (coord_values[~coord_is_local] - coord_values[coord_is_local])[np.newaxis, :])

    return [np.concatenate(station_metaq_data),
            np.concatenate(station_dist_data),
            np.concatenate(local_met_data),
            np.concatenate(local_aq_data),
            np.concatenate(local_stations)]

def get_test_features(i, train=None, test=None, times=None, stations=None, window=None):
    """Build one sliding-window sample per test station.

    For the window of `window` consecutive timestamps starting at position `i`,
    every station in `stations` is the "local" station (rows taken from
    `test`), while all rows of `train` in the same window act as the
    neighbouring stations. The neighbour block is therefore identical for
    every test station of a given window.

    Parameters
    ----------
    i : int
        Start position of the window inside `times`.
    train, test : pandas.DataFrame, optional
        Frames indexed by timestamp, ordered by (time, station_id), with the
        columns 'station_id', 'longitude', 'latitude', 'filled',
        'PM25_Concentration' plus any further feature columns.
        Default to the notebook globals `train_data` / `test_data`.
    times : index-like of timestamps, optional
        Unique timestamps to slice the window from. Defaults to `time_range`.
    stations : iterable, optional
        Test station ids to emit samples for. Defaults to `test_stations`.
    window : int, optional
        Number of timestamps per window. Defaults to `time_window`.

    Returns
    -------
    list of numpy.ndarray
        With T = number of test stations, N = train stations per timestamp,
        F = columns left after dropping ids/coordinates/'filled', W = window:
        [0] neighbour features, shape (T, N, F, W)
        [1] neighbour (longitude, latitude) minus local, shape (T, N, 2)
        [2] local features without PM2.5, shape (T, F-1, W)
        [3] local PM2.5 at the last timestamp of the window, shape (T, 1)
        [4] local station id, shape (T, 1)

    Raises
    ------
    ValueError
        If the train rows cannot be split into `window` equal parts.
    IndexError
        If a requested test station has no rows inside the window.
    """
    # Fall back to the notebook globals so that
    # `Pool.map(get_test_features, range(...))` keeps working unchanged.
    train = train_data if train is None else train
    test = test_data if test is None else test
    times = time_range if times is None else times
    stations = test_stations if stations is None else stations
    window = time_window if window is None else window

    stamps = times[i:i + window]
    test_window = test.loc[stamps]
    train_window = train.loc[stamps]

    # Station side (shared by all test stations): rows are time-major, so the
    # split yields (W, N, F) and the two swaps reorder that to (N, F, W).
    neighbour_values = train_window.drop(
        columns=['station_id', 'longitude', 'latitude', 'filled']).values
    neighbour_block = (np.array(np.split(neighbour_values, window, axis=0))
                       .swapaxes(0, 1).swapaxes(1, 2)[np.newaxis, :])
    # One coordinate row per train station; hoisted because it does not depend
    # on the test station being processed.
    neighbour_coords = train_window.drop_duplicates('station_id')[['longitude', 'latitude']].values

    # Loop-invariant projections of the test window.
    id_values = test_window['station_id'].to_numpy()
    met_values = test_window.drop(
        columns=['station_id', 'longitude', 'latitude', 'PM25_Concentration', 'filled']).values
    pm25_values = test_window['PM25_Concentration'].values
    local_coords_df = test_window.drop_duplicates('station_id')
    local_coord_values = local_coords_df[['longitude', 'latitude']].values

    station_metaq_data = []
    station_dist_data = []
    local_met_data = []
    local_aq_data = []
    local_stations = []

    for station in stations:
        is_local = (test_window['station_id'] == station).to_numpy(dtype=bool)
        coord_is_local = (local_coords_df['station_id'] == station).to_numpy(dtype=bool)

        station_metaq_data.append(neighbour_block)

        # Local side. np.asarray makes the reshape work for string ids as well
        # (a plain Python str has no .reshape).
        local_stations.append(np.asarray(id_values[is_local][-1]).reshape(-1, 1))
        local_met_data.append(met_values[is_local].swapaxes(0, 1)[np.newaxis, :])
        local_aq_data.append(np.asarray(pm25_values[is_local][-1]).reshape(-1, 1))

        # Coordinate offsets of every train station relative to this test station.
        station_dist_data.append(
            (neighbour_coords - local_coord_values[coord_is_local])[np.newaxis, :])

    return [np.concatenate(station_metaq_data),
            np.concatenate(station_dist_data),
            np.concatenate(local_met_data),
            np.concatenate(local_aq_data),
            np.concatenate(local_stations)]

### Execute (note: only one month of data, March 2015, is processed)

In [20]:
init = time()
fold = 2  # not using for loop to avoid ram overflow
print('fold', fold)


def _load_fold_split(fold, split):
    """Read one split ('train' or 'test') of a fold, ordered by (time, station_id), March 2015 only."""
    frame = pd.read_csv('../data/processed/fold_' + str(fold) + '_' + split + '_mar.csv.gz')
    frame['time'] = pd.to_datetime(frame['time'])
    frame = frame.set_index('time').sort_values(['time', 'station_id'])
    # Explicit label slice; .copy() so the scaled columns assigned below are
    # written to an independent frame rather than to a slice of the loaded one.
    return frame.loc['2015-03-01':'2015-03-31'].copy()


train_data = _load_fold_split(fold, 'train')
test_data = _load_fold_split(fold, 'test')

# Scale the selected columns with statistics fitted on the training split only,
# then apply the same transform to the test split.
cols_to_scale = ['temperature', 'latitude', 'longitude', 'wind_speed', 'humidity']
scaler = RobustScaler().fit(train_data[cols_to_scale])
train_data[cols_to_scale] = scaler.transform(train_data[cols_to_scale])
test_data[cols_to_scale] = scaler.transform(test_data[cols_to_scale])

train_stations = train_data.station_id.unique()
test_stations = test_data.station_id.unique()

time_range = train_data.index.unique()

# Number of full windows; derived from `time_window` (previously a hard-coded
# 24) so it stays in sync with the window length used by the feature builders.
n_windows = len(time_range) - time_window + 1
if n_windows <= 0:
    raise ValueError('need at least %d timestamps to build a window, got %d'
                     % (time_window, len(time_range)))

# The worker functions read train_data / test_data / time_range / *_stations as
# module globals, so the pool is created only after all of them are assigned.
workers = Pool(26)
try:
    train_combo = workers.map(get_train_features, range(n_windows))
    print('train finished')
    test_combo = workers.map(get_test_features, range(n_windows))
    print('test finished')
finally:
    # Always release the worker processes, even if a map call raised.
    workers.close()
    workers.join()

# Each per-window result is [station_metaq_data, station_dist_data, local_met_data,
# local_aq_data, local_stations]. Only the station ids (element 4) are exported
# here; the other four arrays were disabled in this cell and can be written the
# same way under '../data/adain/fold_<fold>_<array name>_<split>'.
for combo_data, name in zip([train_combo, test_combo], ['train', 'test']):
    local_stations = np.concatenate([combo[4] for combo in combo_data])
    np.savez('../data/adain/fold_' + str(fold) + '_local_stationids_' + name, local_stations)

print((time() - init) / 60, 'minutes')

fold 2
train finished
test finished
0.32884647448857623 minutes
