# Concat 

In [1]:
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

# Plot defaults: 12x6 figures and the 'seaborn-whitegrid' matplotlib style.
sns.set(rc={'figure.figsize':(12,6)})
plt.style.use(['seaborn-whitegrid'])

# Silence warnings so they do not clutter the notebook output.
import warnings
warnings.simplefilter('ignore')

import os
import utils  # local module providing the read_csv helper used below

Our `read_csv` function, as defined in `utils.py`:

```python
    import pandas as pd

    def read_csv(filename):
        """Load one exported CSV into a DataFrame indexed by parsed timestamps.

        The first 8 lines of the file are skipped, the '# timestamp' header is
        renamed to 'timestamp', and that column is converted with
        ``pd.to_datetime`` before it becomes the index.
        """
        df = pd.read_csv(filename, skiprows=8)
        df.rename(columns={'# timestamp': 'timestamp'}, inplace=True)
        df['timestamp'] = pd.to_datetime(df['timestamp'])  # might not want to parse timestamps here
        df.set_index('timestamp', inplace=True)
        return df
```

In [2]:
# The two CSV exports that will be placed side by side.
filenames = [
    'data/measured_real_power.csv',
    'data/measured_reactive_power.csv',
]

Concatenate the two frames side by side (column-wise) with `pd.concat`.

In [3]:
# Read the first two files with the shared helper ...
df1, df2 = (utils.read_csv(filenames[i]) for i in (0, 1))

# ... and join them column-wise, i.e. stacked side by side on the shared index.
df = pd.concat([df1, df2], axis='columns')

In [4]:
df1.shape, df2.shape, df.shape

((12961, 20), (12961, 20), (12961, 40))

In [5]:
# Number of distinct column labels; a value below df.shape[1] means labels are duplicated.
len(set(df.columns))

20

In [6]:
# Index.equals compares the labels in order and returns False (instead of raising)
# when the two frames have a different number of columns.
df1.columns.equals(df2.columns)

True

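Both files use exactly the same column names, so after concatenation we can no longer tell which column came from which file. We need to capture some additional information: here, we prefix every column name with the name of its source file.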

In [7]:
def read_csv(filename):
    """Load one exported CSV into a timestamp-indexed frame with file-prefixed columns.

    The first 8 lines of the file are skipped. The '# timestamp' column is
    renamed to 'timestamp', parsed with ``pd.to_datetime`` and used as the
    index. Every remaining column is renamed to ``"<prefix>:<column>"``,
    where ``<prefix>`` is the file's base name without its trailing '.csv',
    so frames read from different files keep distinct column names.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by 'timestamp' with prefixed column names.
    """
    df = pd.read_csv(filename, skiprows=8)
    df = df.rename(columns={'# timestamp': 'timestamp'})
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.set_index('timestamp')

    prefix = os.path.basename(filename)
    # Strip only a trailing '.csv'; str.replace would also delete '.csv'
    # occurring in the middle of the file name.
    if prefix.endswith('.csv'):
        prefix = prefix[:-len('.csv')]

    df.columns = ["{}:{}".format(prefix, x) for x in df.columns]
    return df

# Re-read every file with the prefixing reader, join column-wise,
# and count the distinct column labels again.
df = pd.concat(list(map(read_csv, filenames)), axis=1)
len(set(df.columns))

40

In [8]:
df.columns

Index(['measured_real_power:triplex_meter_0',
       'measured_real_power:triplex_meter_1',
       'measured_real_power:triplex_meter_2',
       'measured_real_power:triplex_meter_3',
       'measured_real_power:triplex_meter_4',
       'measured_real_power:triplex_meter_5',
       'measured_real_power:triplex_meter_6',
       'measured_real_power:triplex_meter_7',
       'measured_real_power:triplex_meter_8',
       'measured_real_power:triplex_meter_9',
       'measured_real_power:triplex_meter_10',
       'measured_real_power:triplex_meter_11',
       'measured_real_power:triplex_meter_12',
       'measured_real_power:triplex_meter_13',
       'measured_real_power:triplex_meter_14',
       'measured_real_power:triplex_meter_15',
       'measured_real_power:triplex_meter_16',
       'measured_real_power:triplex_meter_17',
       'measured_real_power:triplex_meter_18',
       'measured_real_power:triplex_meter_19',
       'measured_reactive_power:triplex_meter_0',
       'measured_rea

## Glob 

In [10]:
import glob

In [13]:
# Collect every CSV in the output directory.
# NOTE: glob returns matches in arbitrary, filesystem-dependent order; wrap the
# call in sorted() if a stable column order is needed across machines.
filenames = glob.glob('data/outputFiles/*.csv')
filenames

['data/outputFiles/triplex_meter:monthly_bill.csv',
 'data/outputFiles/triplex_meter:measured_real_power.csv',
 'data/outputFiles/house:outdoor_temperature.csv',
 'data/outputFiles/house:heating_setpoint.csv',
 'data/outputFiles/house:hvac_power.imag.csv',
 'data/outputFiles/house:incident_solar_radiation.csv',
 'data/outputFiles/house:air_temperature.csv',
 'data/outputFiles/house:mass_temperature.csv',
 'data/outputFiles/house:cooling_setpoint.csv',
 'data/outputFiles/triplex_meter:measured_reactive_power.csv',
 'data/outputFiles/house:hvac_power.real.csv',
 'data/outputFiles/triplex_meter:price.csv',
 'data/outputFiles/triplex_meter:monthly_energy.csv']

In [15]:
def read_dir(filenames):
    """Read each path in `filenames` with `read_csv` and join the frames column-wise."""
    frames = list(map(read_csv, filenames))
    return pd.concat(frames, axis=1)

read_dir(filenames).shape

(12961, 260)

This takes a while (about a minute, according to the profile below). Can we do better?

In [15]:
import cProfile

# Profile one complete read_dir call to see where the time is spent.
cProfile.run('read_dir(filenames)')

         118750698 function calls (118748645 primitive calls) in 64.428 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      384    0.001    0.000    0.001    0.000 <frozen importlib._bootstrap>:103(release)
      384    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:143(__init__)
      384    0.000    0.000    0.003    0.000 <frozen importlib._bootstrap>:147(__enter__)
      384    0.000    0.000    0.001    0.000 <frozen importlib._bootstrap>:151(__exit__)
      384    0.001    0.000    0.002    0.000 <frozen importlib._bootstrap>:157(_get_module_lock)
      384    0.000    0.000    0.001    0.000 <frozen importlib._bootstrap>:176(cb)
  264/120    0.000    0.000    0.023    0.000 <frozen importlib._bootstrap>:211(_call_with_frames_removed)
     1800    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:222(_verbose_message)
      120    0.000    0.000    0.000    0.000 <frozen importlib._boots

      144    0.000    0.000    0.000    0.000 parse.py:109(_coerce_args)
       72    0.000    0.000    0.001    0.000 parse.py:361(urlparse)
       72    0.000    0.000    0.000    0.000 parse.py:394(urlsplit)
        1    0.000    0.000    0.000    0.000 parse.py:83(clear_cache)
      144    0.000    0.000    0.000    0.000 parse.py:98(_noop)
       24    0.000    0.000    0.021    0.001 parsers.py:1120(_make_engine)
       24    0.000    0.000    0.679    0.028 parsers.py:1137(read)
       24    0.000    0.000    0.000    0.000 parsers.py:1162(_create_index)
       72    0.000    0.000    0.000    0.000 parsers.py:1176(_is_index_col)
       72    0.000    0.000    0.000    0.000 parsers.py:1180(_is_potential_multi_index)
      144    0.000    0.000    0.000    0.000 parsers.py:1195(<genexpr>)
       24    0.000    0.000    0.000    0.000 parsers.py:1274(_validate_usecols_arg)
       24    0.000    0.000    0.000    0.000 parsers.py:1329(_validate_parse_dates_arg)
       24    0.000 

In [19]:
%%writefile utils.py
import pandas as pd
import os

def read_csv(filename):
    """Load one exported CSV, indexed by its raw timestamp strings, with file-prefixed columns.

    The first 8 lines of the file are skipped. The '# timestamp' column is
    renamed to 'timestamp' and used as the index without being parsed.
    Every remaining column is renamed to ``"<prefix>:<column>"``, where
    ``<prefix>`` is the file's base name without its trailing '.csv'.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by the unparsed 'timestamp' values.
    """
    df = pd.read_csv(filename, skiprows=8)
    df = df.rename(columns={'# timestamp': 'timestamp'})
    # Timestamps are deliberately left as strings here; read_dir converts the
    # index of the combined frame in a single pass.
    df = df.set_index('timestamp')

    prefix = os.path.basename(filename)
    # Strip only a trailing '.csv'; str.replace would also delete '.csv'
    # occurring in the middle of the file name.
    if prefix.endswith('.csv'):
        prefix = prefix[:-len('.csv')]

    df.columns = ["{}:{}".format(prefix, x) for x in df.columns]
    return df

def read_dir(filenames):
    """Read every file with `read_csv`, join column-wise, then parse the index as datetimes."""
    combined = pd.concat(list(map(read_csv, filenames)), axis=1, sort=False)
    # Convert the timestamps once, on the index of the combined frame.
    combined.index = pd.to_datetime(combined.index)
    return combined

Overwriting utils.py


In [17]:
# The autoreload extension must be loaded before the `%autoreload 2` magic used
# in later cells is available on a fresh kernel.
%load_ext autoreload

In [21]:
%autoreload 2
import utils

# utils.py was rewritten above; with autoreload active this call uses the new read_dir.
df = utils.read_dir(filenames)
df.shape

(12961, 260)

In [22]:
df.head()

Unnamed: 0_level_0,triplex_meter:monthly_bill:triplex_meter_0,triplex_meter:monthly_bill:triplex_meter_1,triplex_meter:monthly_bill:triplex_meter_2,triplex_meter:monthly_bill:triplex_meter_3,triplex_meter:monthly_bill:triplex_meter_4,triplex_meter:monthly_bill:triplex_meter_5,triplex_meter:monthly_bill:triplex_meter_6,triplex_meter:monthly_bill:triplex_meter_7,triplex_meter:monthly_bill:triplex_meter_8,triplex_meter:monthly_bill:triplex_meter_9,...,triplex_meter:monthly_energy:triplex_meter_10,triplex_meter:monthly_energy:triplex_meter_11,triplex_meter:monthly_energy:triplex_meter_12,triplex_meter:monthly_energy:triplex_meter_13,triplex_meter:monthly_energy:triplex_meter_14,triplex_meter:monthly_energy:triplex_meter_15,triplex_meter:monthly_energy:triplex_meter_16,triplex_meter:monthly_energy:triplex_meter_17,triplex_meter:monthly_energy:triplex_meter_18,triplex_meter:monthly_energy:triplex_meter_19
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-07-01 00:00:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-07-01 00:01:00+00:00,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,...,0.006402,0.006402,0.006402,0.008858,0.008858,0.008858,0.008847,0.008847,0.004519,0.006395
2013-07-01 00:02:00+00:00,10.0006,10.0006,10.0006,10.0006,10.0006,10.0006,10.0006,10.0006,10.0007,10.0007,...,0.006402,0.006402,0.006402,0.008858,0.008858,0.008858,0.008847,0.008847,0.004519,0.006395
2013-07-01 00:03:00+00:00,10.0012,10.0012,10.0012,10.0013,10.0012,10.0012,10.0013,10.0013,10.0013,10.0013,...,0.012805,0.012805,0.012805,0.017716,0.017716,0.017716,0.017695,0.017695,0.009037,0.012789
2013-07-01 00:04:00+00:00,10.0018,10.0018,10.0018,10.0019,10.0018,10.0018,10.0019,10.0019,10.002,10.002,...,0.019207,0.019207,0.019207,0.026575,0.026575,0.026575,0.026542,0.026542,0.013556,0.019184


## The stacked method 

In [31]:
filename = 'data/measured_real_power.csv'

# Read the wide table, normalise the timestamp header, then melt it to long
# form: one row per (timestamp, original column) pair.
df = (
    pd.read_csv(filename, skiprows=8)
      .rename(columns={'# timestamp': 'timestamp'})
      .melt(id_vars='timestamp')
)
df.head()

Unnamed: 0,timestamp,variable,value
0,2013-07-01 00:00:00 UTC,triplex_meter_0,510.912
1,2013-07-01 00:01:00 UTC,triplex_meter_0,510.912
2,2013-07-01 00:02:00 UTC,triplex_meter_0,510.912
3,2013-07-01 00:03:00 UTC,triplex_meter_0,510.912
4,2013-07-01 00:04:00 UTC,triplex_meter_0,510.911


In [34]:
# Tag every row with the file's base name (minus '.csv') so the source of each
# value is still known once several files are stacked together.
prefix = os.path.basename(filename).replace('.csv','')
df['attribute'] = prefix
df.head()

Unnamed: 0,timestamp,variable,value,attribute
0,2013-07-01 00:00:00 UTC,triplex_meter_0,510.912,measured_real_power
1,2013-07-01 00:01:00 UTC,triplex_meter_0,510.912,measured_real_power
2,2013-07-01 00:02:00 UTC,triplex_meter_0,510.912,measured_real_power
3,2013-07-01 00:03:00 UTC,triplex_meter_0,510.912,measured_real_power
4,2013-07-01 00:04:00 UTC,triplex_meter_0,510.911,measured_real_power


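Every file read this way has the same four columns (`timestamp`, `variable`, `value`, `attribute`), so we can stack the frames on top of each other with `concat`.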

In [35]:
%%writefile utils.py
import pandas as pd
import os

def read_csv(filename):
    """Load one exported CSV in long form, tagged with the file it came from.

    The first 8 lines of the file are skipped and the '# timestamp' column is
    renamed to 'timestamp'. The table is then melted so that each row holds a
    'timestamp', a 'variable' (the original column name) and a 'value'. An
    'attribute' column is added containing the file's base name without its
    trailing '.csv'.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Long-form frame with columns 'timestamp', 'variable', 'value', 'attribute'.
    """
    df = pd.read_csv(filename, skiprows=8)
    df = df.rename(columns={'# timestamp': 'timestamp'})
    df = df.melt(id_vars='timestamp')

    prefix = os.path.basename(filename)
    # Strip only a trailing '.csv'; str.replace would also delete '.csv'
    # occurring in the middle of the file name.
    if prefix.endswith('.csv'):
        prefix = prefix[:-len('.csv')]

    df['attribute'] = prefix
    return df

def read_dir(filenames):
    """Read every file with `read_csv` and stack the long-form frames vertically."""
    long_frames = [read_csv(path) for path in filenames]
    # axis=0 stacks the frames on top of each other; timestamps are left unparsed.
    return pd.concat(long_frames, axis=0, sort=False)

Overwriting utils.py


In [36]:
%autoreload 2
import utils

# utils.py was rewritten above; with autoreload active this call uses the stacking read_dir.
df = utils.read_dir(filenames)
df.shape

(3369860, 4)

In [47]:
# Converting all ~3.4 million timestamp strings in one go takes a really long time....
#df.timestamp = pd.to_datetime(df.timestamp)

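Reshaping can help: pivot the long frame back to wide form so that each of the 12,961 timestamps appears only once, convert that much smaller index to datetimes, and then melt back to long form.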

In [46]:
# Pivot to wide form: one row per timestamp, one column per (variable, attribute) pair.
tmp = df.pivot_table(index='timestamp', columns=['variable', 'attribute'], values='value')
print(tmp.shape)
# Parse the timestamps on the pivoted frame's index. The long frame `df` only has
# an integer row index, so converting `df.index` would not touch the timestamps.
tmp.index = pd.to_datetime(tmp.index)
# Melt back to long form, now carrying parsed timestamps.
final = tmp.reset_index().melt(id_vars='timestamp')
final.shape

(12961, 260)


(3369860, 4)

In [48]:
%%writefile utils.py
import pandas as pd
import os

def read_csv(filename):
    """Load one exported CSV in long form, tagged with the file it came from.

    The first 8 lines of the file are skipped and the '# timestamp' column is
    renamed to 'timestamp'. The table is then melted so that each row holds a
    'timestamp', a 'variable' (the original column name) and a 'value'. An
    'attribute' column is added containing the file's base name without its
    trailing '.csv'.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Long-form frame with columns 'timestamp', 'variable', 'value', 'attribute'.
    """
    df = pd.read_csv(filename, skiprows=8)
    df = df.rename(columns={'# timestamp': 'timestamp'})
    df = df.melt(id_vars='timestamp')

    prefix = os.path.basename(filename)
    # Strip only a trailing '.csv'; str.replace would also delete '.csv'
    # occurring in the middle of the file name.
    if prefix.endswith('.csv'):
        prefix = prefix[:-len('.csv')]

    df['attribute'] = prefix
    return df

def read_dir(filenames):
    """Stack every file in long form and return it with parsed timestamps.

    The stacked frame is pivoted to wide form so the timestamp strings are
    converted once per distinct timestamp, then melted back to long form.
    """
    stacked = pd.concat([read_csv(path) for path in filenames], axis=0, sort=False)
    wide = stacked.pivot_table(
        index='timestamp',
        columns=['variable', 'attribute'],
        values='value',
    )
    # Parse the timestamps on the wide frame's index rather than on the long frame.
    wide.index = pd.to_datetime(wide.index)
    return wide.reset_index().melt(id_vars='timestamp')

Overwriting utils.py
