# Concat 

In [29]:
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

sns.set(rc={'figure.figsize': (12, 6)})
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names; use whichever name this install has.
plt.style.use(['seaborn-v0_8-whitegrid'
               if 'seaborn-v0_8-whitegrid' in plt.style.available
               else 'seaborn-whitegrid'])

import warnings
warnings.simplefilter('ignore')

import os
import utils

Our `read_csv` function.

```python
    import pandas as pd

    def read_csv(filename):
        """Load one CSV file and index it by its parsed timestamp column."""
        # The first 8 lines are skipped; the header row follows them.
        df = pd.read_csv(filename, skiprows=8)
        # The header labels the time column '# timestamp'; drop the '# ' marker.
        df.rename(columns={'# timestamp': 'timestamp'}, inplace=True)
        df['timestamp'] = pd.to_datetime(df['timestamp'])  # might not want this here
        df.set_index('timestamp', inplace=True)
        return df
```

In [30]:
# The two input files, real power first and reactive power second.
filenames = [
    'data/measured_real_power.csv',
    'data/measured_reactive_power.csv',
]

Concat

In [31]:
# Load both files through the shared reader, in list order.
df1, df2 = (utils.read_csv(filenames[position]) for position in (0, 1))

# axis=1 joins column-wise: the frames end up side by side.
df = pd.concat([df1, df2], axis=1)

In [32]:
# Compare shapes: the row count is unchanged while the column counts add up.
df1.shape, df2.shape, df.shape

((259220, 4), (259220, 4), (259220, 8))

In [33]:
# Number of distinct column labels; fewer than df.shape[1] means labels are duplicated.
len(set(df.columns))

4

In [34]:
# Check whether the two files use exactly the same column labels.
all(df1.columns == df2.columns)

True

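Both files use the same column names, so after concatenation the columns can no longer be told apart. We need to capture one more piece of information for each column: the file-name **prefix**.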

In [35]:
def read_csv(filename):
    """Load one CSV file into a timestamp-indexed frame with file-prefixed columns.

    Parameters
    ----------
    filename : str
        Path to the CSV file. The first 8 lines are skipped; the header row
        that follows is expected to label the time column '# timestamp'.

    Returns
    -------
    pandas.DataFrame
        Indexed by the parsed 'timestamp' column. Every remaining column is
        renamed to '<prefix>:<column>', where <prefix> is the file's base name
        without its extension, so frames read from different files keep
        distinct labels when concatenated side by side.
    """
    df = (
        pd.read_csv(filename, skiprows=8)
        .rename(columns={'# timestamp': 'timestamp'})
    )
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.set_index('timestamp')
    # splitext removes only the trailing extension; str.replace('.csv', '')
    # would also delete any '.csv' that appears earlier in the name.
    prefix = os.path.splitext(os.path.basename(filename))[0]
    df.columns = ["{}:{}".format(prefix, column) for column in df.columns]
    return df

# Re-read every file with the prefixing reader, then join the frames column-wise.
prefixed_frames = [read_csv(path) for path in filenames]
df = pd.concat(prefixed_frames, axis=1)
len(set(df.columns))

40

In [36]:
# List the labels to confirm each one now starts with its source file's prefix.
df.columns

Index(['measured_real_power:triplex_meter_0',
       'measured_real_power:triplex_meter_1',
       'measured_real_power:triplex_meter_2',
       'measured_real_power:triplex_meter_3',
       'measured_real_power:triplex_meter_4',
       'measured_real_power:triplex_meter_5',
       'measured_real_power:triplex_meter_6',
       'measured_real_power:triplex_meter_7',
       'measured_real_power:triplex_meter_8',
       'measured_real_power:triplex_meter_9',
       'measured_real_power:triplex_meter_10',
       'measured_real_power:triplex_meter_11',
       'measured_real_power:triplex_meter_12',
       'measured_real_power:triplex_meter_13',
       'measured_real_power:triplex_meter_14',
       'measured_real_power:triplex_meter_15',
       'measured_real_power:triplex_meter_16',
       'measured_real_power:triplex_meter_17',
       'measured_real_power:triplex_meter_18',
       'measured_real_power:triplex_meter_19',
       'measured_reactive_power:triplex_meter_0',
       'measured_rea

## Glob 

In [37]:
import glob

Create some additional files for scaling demonstration

In [42]:
import os
import shutil

# Start from an empty output directory on every run.
if os.path.exists("data/outputFiles"):
    shutil.rmtree("data/outputFiles")
os.mkdir("data/outputFiles")

# (source file, numbered destination template) pairs, copied in this order.
copy_templates = [
    ("data/measured_reactive_power.csv",
     "data/outputFiles/measured_reactive_power_{}.csv"),
    ("data/measured_real_power.csv",
     "data/outputFiles/measured_real_power_{}.csv"),
]

# Ten numbered copies of each source file.
for i in range(10):
    for source_path, target_template in copy_templates:
        shutil.copyfile(source_path, target_template.format(i))

In [43]:
# Collect every CSV in the output directory. glob does not sort its matches,
# so the order of this list (and of anything built from it) is arbitrary.
filenames = glob.glob('data/outputFiles/*.csv')
filenames

['data/outputFiles/measured_reactive_power_9.csv',
 'data/outputFiles/measured_real_power_8.csv',
 'data/outputFiles/measured_real_power_9.csv',
 'data/outputFiles/measured_reactive_power_8.csv',
 'data/outputFiles/measured_reactive_power_6.csv',
 'data/outputFiles/measured_real_power_7.csv',
 'data/outputFiles/measured_real_power_6.csv',
 'data/outputFiles/measured_reactive_power_7.csv',
 'data/outputFiles/measured_reactive_power_5.csv',
 'data/outputFiles/measured_real_power_4.csv',
 'data/outputFiles/measured_real_power_5.csv',
 'data/outputFiles/measured_reactive_power_4.csv',
 'data/outputFiles/measured_reactive_power_0.csv',
 'data/outputFiles/measured_real_power_1.csv',
 'data/outputFiles/measured_real_power_0.csv',
 'data/outputFiles/measured_reactive_power_1.csv',
 'data/outputFiles/measured_reactive_power_3.csv',
 'data/outputFiles/measured_real_power_2.csv',
 'data/outputFiles/measured_real_power_3.csv',
 'data/outputFiles/measured_reactive_power_2.csv']

In [44]:
def read_dir(filenames):
    """Read each file with read_csv and join the resulting frames column-wise."""
    frames = [read_csv(path) for path in filenames]
    return pd.concat(frames, axis=1)

# Shape of the combined frame built from every file in the list.
read_dir(filenames).shape

(12961, 400)

Takes a while... Can we do better?

In [None]:
import cProfile

# Sort the report by cumulative time so the most expensive calls come first.
cProfile.run('read_dir(filenames)', sort='cumulative')

In [None]:
%%writefile utils.py
import pandas as pd
import os

def read_csv(filename):
    """Load one CSV file, indexed by its raw timestamp strings, with file-prefixed columns.

    Parameters
    ----------
    filename : str
        Path to the CSV file. The first 8 lines are skipped; the header row
        that follows is expected to label the time column '# timestamp'.

    Returns
    -------
    pandas.DataFrame
        Indexed by the unparsed 'timestamp' column. Every remaining column is
        renamed to '<prefix>:<column>', where <prefix> is the file's base name
        without its extension.
    """
    # Timestamps are deliberately left unparsed here; read_dir converts the
    # index once, after all files have been concatenated.
    df = (
        pd.read_csv(filename, skiprows=8)
        .rename(columns={'# timestamp': 'timestamp'})
        .set_index('timestamp')
    )
    # splitext removes only the trailing extension; str.replace('.csv', '')
    # would also delete any '.csv' that appears earlier in the name.
    prefix = os.path.splitext(os.path.basename(filename))[0]
    df.columns = ["{}:{}".format(prefix, column) for column in df.columns]
    return df

def read_dir(filenames):
    """Read each file with read_csv, join the frames column-wise, then parse the index.

    The timestamp conversion runs once on the combined index instead of once
    per file.
    """
    frames = [read_csv(path) for path in filenames]
    combined = pd.concat(frames, axis=1, sort=False)
    combined.index = pd.to_datetime(combined.index)
    return combined

In [None]:
%load_ext autoreload
%reload_ext autoreload

In [None]:
%autoreload 2
import utils

# Build the combined frame with the utils module imported in this cell.
df = utils.read_dir(filenames)
df.shape

In [None]:
# Preview the first rows of the combined frame.
df.head()

## The stacked method 

In [None]:
filename = 'data/measured_real_power.csv'

# Read, normalise the time column's label, then unpivot to long form:
# one row per (timestamp, original column) pair.
df = (
    pd.read_csv(filename, skiprows=8)
    .rename(columns={'# timestamp': 'timestamp'})
    .melt(id_vars='timestamp')
)
df.head()

In [None]:
# Tag every row with the file's base name, minus its extension. splitext
# removes only the trailing extension, whereas str.replace('.csv', '') would
# also delete any '.csv' that appears earlier in the name.
prefix = os.path.splitext(os.path.basename(filename))[0]
df['attribute'] = prefix
df.head()

We can stack this with concat

In [None]:
%%writefile utils.py
import pandas as pd
import os

def read_csv(filename):
    """Load one CSV file in long (stacked) form, tagged with its file name.

    Parameters
    ----------
    filename : str
        Path to the CSV file. The first 8 lines are skipped; the header row
        that follows is expected to label the time column '# timestamp'.

    Returns
    -------
    pandas.DataFrame
        One row per (timestamp, original column) pair with the columns
        'timestamp', 'variable', 'value' and 'attribute'. 'attribute' holds
        the file's base name without its extension; 'timestamp' is left as
        read, not parsed.
    """
    df = (
        pd.read_csv(filename, skiprows=8)
        .rename(columns={'# timestamp': 'timestamp'})
        .melt(id_vars='timestamp')
    )
    # splitext removes only the trailing extension; str.replace('.csv', '')
    # would also delete any '.csv' that appears earlier in the name.
    df['attribute'] = os.path.splitext(os.path.basename(filename))[0]
    return df

def read_dir(filenames):
    """Read each file with read_csv and stack the long-form frames row-wise.

    Timestamps are not parsed here; the 'timestamp' column is returned as read.
    """
    stacked_parts = [read_csv(path) for path in filenames]
    # axis=0 appends rows, so the frames are stacked on top of each other.
    return pd.concat(stacked_parts, axis=0, sort=False)

In [None]:
%autoreload 2
import utils

# Build the stacked (long-form) frame with the utils module imported in this cell.
df = utils.read_dir(filenames)
df.shape

In [None]:
# takes a really long time....
#df.timestamp = pd.to_datetime(df.timestamp)

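Reshaping can help: pivoting to a wide table leaves each distinct timestamp once in the index, so far fewer strings need to be parsed.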

In [None]:
# Pivot to wide form so every distinct timestamp appears once, in the index.
tmp = df.pivot_table(index='timestamp', columns=['variable', 'attribute'], values='value')
print(tmp.shape)
# Parse the pivoted frame's index, which holds the timestamps. The stacked
# frame `df` keeps its timestamps in a column, so converting `df.index` would
# not parse them.
tmp.index = pd.to_datetime(tmp.index)
# Back to long form, now carrying parsed timestamps.
final = tmp.reset_index().melt(id_vars='timestamp')
final.shape

In [None]:
%%writefile utils.py
import pandas as pd
import os

def read_csv(filename):
    """Load one CSV file in long (stacked) form, tagged with its file name.

    Parameters
    ----------
    filename : str
        Path to the CSV file. The first 8 lines are skipped; the header row
        that follows is expected to label the time column '# timestamp'.

    Returns
    -------
    pandas.DataFrame
        One row per (timestamp, original column) pair with the columns
        'timestamp', 'variable', 'value' and 'attribute'. 'attribute' holds
        the file's base name without its extension; 'timestamp' is left as
        read, not parsed.
    """
    df = (
        pd.read_csv(filename, skiprows=8)
        .rename(columns={'# timestamp': 'timestamp'})
        .melt(id_vars='timestamp')
    )
    # splitext removes only the trailing extension; str.replace('.csv', '')
    # would also delete any '.csv' that appears earlier in the name.
    df['attribute'] = os.path.splitext(os.path.basename(filename))[0]
    return df

def read_dir(filenames):
    """Read and stack every file, then return long-form data with parsed timestamps.

    The stacked frame is pivoted to wide form so the timestamp conversion runs
    on the pivoted index, and the result is melted back to long form.
    """
    # axis=0 stacks the per-file long-form frames on top of each other.
    stacked = pd.concat([read_csv(path) for path in filenames], axis=0, sort=False)
    wide = stacked.pivot_table(index='timestamp',
                               columns=['variable', 'attribute'],
                               values='value')
    wide.index = pd.to_datetime(wide.index)
    return wide.reset_index().melt(id_vars='timestamp')