# Import Files 

This notebook reads the train or test CSV and splits it into separate files, one per building and meter type.

In [2]:
import os
import os.path as op
from tqdm import tqdm

import numpy as np
import pandas as pd

In [3]:
# Directory where the input .csv (train.csv or test.csv) is located
in_path = 'ashrae-energy-prediction'

# Output is the path where the split csv files will be saved.
# Note: the trailing class_0 folder is needed for the DatasetFolder class to work but isn't meaningful.
# Exactly one out_path line should be active: the first targets the test split,
# the commented-out one below targets the train split.
out_path = 'ashrae-energy-prediction/dataloader_test_meteronly/csv/class_0'
#out_path = 'ashrae-energy-prediction/dataloader_train_meteronly/csv/class_0'

In [4]:
# Create the output directory (and any missing parents) if it doesn't exist.
# exist_ok=True makes this a single call that is safe to re-run, with no separate
# isdir check that could go stale between the check and the creation.

os.makedirs(out_path, exist_ok=True)

## Choose file to split

In [5]:
%%time

# Load the csv that will be split into one file per building/meter below.
# Exactly one read_csv line should be active; the commented-out one reads train.csv instead.
# NOTE(review): presumably the active out_path in the config cell should name the same split - confirm.
#df = pd.read_csv(op.join(in_path, 'train.csv'))
df = pd.read_csv(op.join(in_path, 'test.csv'))


# Show the loaded frame as a sanity check before splitting
display(df)

Unnamed: 0,row_id,building_id,meter,timestamp
0,0,0,0,2017-01-01 00:00:00
1,1,1,0,2017-01-01 00:00:00
2,2,2,0,2017-01-01 00:00:00
3,3,3,0,2017-01-01 00:00:00
4,4,4,0,2017-01-01 00:00:00
...,...,...,...,...
41697595,41697595,1444,0,2018-05-09 07:00:00
41697596,41697596,1445,0,2018-05-09 07:00:00
41697597,41697597,1446,0,2018-05-09 07:00:00
41697598,41697598,1447,0,2018-05-09 07:00:00


CPU times: user 12.8 s, sys: 3.01 s, total: 15.8 s
Wall time: 14.9 s


In [8]:
# Write each building_id/meter combination to its own csv file.
#
# The frame is partitioned with groupby rather than re-filtering the whole frame
# with a boolean mask once per building, so the rows are not rescanned for every
# building_id. sort=False yields groups in order of first appearance - the same
# order .unique() returns - so the running file number given to each
# building/meter pair is unchanged. Row order and the original index inside each
# group are preserved as well.
# Note: groupby skips rows whose building_id or meter is NaN, so no empty file is
# written for them.

by_building = df.groupby('building_id', sort=False)
pbar = tqdm(by_building, total=by_building.ngroups, miniters=10, leave=True, ncols=70)

cnt = 0                          # running file number across all buildings and meters
cols = ['timestamp', 'row_id']   # columns kept in every output file

for bid, df_bid in pbar:
    for mid, df_meter in df_bid.groupby('meter', sort=False):
        df_meter = df_meter[cols]
        # Left disabled: these lines operate on a meter_reading column, which is
        # not among the columns selected in cols above.
        #df_meter['meter_reading'] = np.expm1(df_meter['meter_reading'])
        #df_meter.set_index('timestamp', inplace =True)

        # index=True writes the original row index as the first csv column
        df_meter.to_csv(op.join(out_path, 'test_{:04d}.csv'.format(cnt)), index=True)
        cnt += 1


  0%|                                        | 0/1449 [00:00<?, ?it/s][A
  1%|▏                              | 10/1449 [00:00<02:20, 10.22it/s][A
  1%|▍                              | 20/1449 [00:01<02:20, 10.15it/s][A
  2%|▋                              | 30/1449 [00:02<02:19, 10.19it/s][A
  3%|▊                              | 40/1449 [00:03<02:17, 10.26it/s][A
  3%|█                              | 50/1449 [00:04<02:12, 10.53it/s][A
  4%|█▎                             | 60/1449 [00:05<02:12, 10.48it/s][A
  5%|█▍                             | 70/1449 [00:06<02:09, 10.63it/s][A
  6%|█▋                             | 80/1449 [00:07<02:10, 10.46it/s][A
  6%|█▉                             | 90/1449 [00:08<02:07, 10.63it/s][A
  7%|██                            | 100/1449 [00:09<02:13, 10.11it/s][A
  8%|██▎                           | 110/1449 [00:10<02:13, 10.03it/s][A
  8%|██▍                           | 120/1449 [00:11<02:15,  9.81it/s][A
  9%|██▋                           | 

In [9]:
# For every column of the last per-meter frame, print its name, its dtype and
# whether that dtype counts as numeric
for name, dtype in df_meter.dtypes.items():
    print(name, dtype, np.issubdtype(dtype, np.number))

timestamp object False
row_id int64 True


In [10]:
# One boolean per column of the full frame: True where the column's dtype is numeric
l = [np.issubdtype(df[column].dtype, np.number) for column in df.columns]

In [11]:
# Preview the first four rows of the last per-meter frame written by the loop
df_meter.head(4)

Unnamed: 0,timestamp,row_id
37198569,2017-01-01 01:00:00,37198569
37198819,2017-01-01 02:00:00,37198819
37199069,2017-01-01 03:00:00,37199069
37199319,2017-01-01 04:00:00,37199319
