In [2]:
import pandas as pd
import numpy as np
import os

The data is expected to be in a `Data/` directory containing the subdirectories `annonceur1/` and `annonceur2/`, each holding the corresponding CSV files.

## 1. Reduce memory usage
The `reduce_mem_usage` function reduces a DataFrame's memory usage by downcasting its numeric columns to smaller data types:

source : https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65

In [3]:
def _smallest_int_dtype(mn, mx):
    """Return the narrowest NumPy integer dtype whose range contains [mn, mx].

    Unsigned types are tried when ``mn`` is non-negative, signed types
    otherwise. Returns ``None`` when no candidate fits (including when
    ``mn``/``mx`` are NaN, e.g. for an empty column).
    """
    if mn >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        info = np.iinfo(candidate)
        # Inclusive bounds: a value equal to the type's limit still fits.
        if info.min <= mn and mx <= info.max:
            return candidate
    return None


def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` to smaller dtypes, in place.

    For every plain NumPy integer or float column:

    * NaN values in float columns are replaced by ``column minimum - 1`` and
      the column name is recorded in the returned list. The fill is skipped
      when that sentinel is not finite (all-NaN column or a ``-inf`` minimum).
    * If every value is finite and integral, the column is cast to the
      narrowest integer dtype that holds its (post-fill) range; if no integer
      dtype fits, the dtype is left unchanged.
    * Otherwise the column is cast to ``float32``.

    Every other column (strings/object, bool, datetime, categorical,
    extension dtypes) is left untouched.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    tuple
        ``(props, NAlist)`` where ``props`` is the same (mutated) frame and
        ``NAlist`` lists the columns whose NaN values were filled.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print('\n')
    print("Memory usage of properties dataframe is :", start_mem_usg, " MB")
    NAlist = []  # Keeps track of columns that have missing values filled in.
    for col in props.columns:
        print('col', col)
        dtype = props[col].dtype

        # Only plain NumPy integer/float columns can be downcast safely.
        if not (isinstance(dtype, np.dtype) and dtype.kind in 'iuf'):
            continue

        # Print current column type
        print("******************************")
        print("Column: ", col)
        print("dtype before: ", dtype)

        series = props[col]
        if dtype.kind == 'f':
            # Integer dtypes do not support NaN, so NaN is replaced by a
            # sentinel just below the column minimum.
            if series.isna().any():
                sentinel = series.min() - 1
                if np.isfinite(sentinel):
                    # Reassign rather than fill in place: a chained in-place
                    # fill does not reach the frame under copy-on-write.
                    series = series.fillna(sentinel)
                    NAlist.append(col)

            # Check each value individually. Summing the differences would
            # let fractional parts of opposite sign cancel out.
            values = series.to_numpy()
            is_int = bool(
                np.isfinite(values).all() and (values == np.floor(values)).all()
            )
        else:
            is_int = True

        if is_int:
            # Use the range AFTER the fill so the sentinel itself fits.
            target = _smallest_int_dtype(series.min(), series.max())
        else:
            target = np.float32

        if target is not None:
            series = series.astype(target)
        # Write back even when the dtype is unchanged so a NaN fill is kept.
        props[col] = series

        # Print new column type
        print("dtype after: ", props[col].dtype)
        print("******************************")

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ", mem_usg, " MB")
    print("This is ", 100 * mem_usg / start_mem_usg, "% of the initial size")
    return props, NAlist

## 2. Generate HDF

The HDF format allows storing multiple `pd.DataFrame` objects in a single file; each `pd.DataFrame` is identified by a unique key.

In the following example, data from annonceur{i} is gathered into `annonceur{i}.hdf`

In [155]:
# Build annonceur1.hdf (row-level data) and annonceur1_daily.hdf (daily counts),
# storing one HDF key per source CSV.
for file in (
    'annonceur1/annonceur1_campaign1_visite_2pages.csv',
    'annonceur1/annonceur1_campaign1_visite_engagee.csv',
    'annonceur1/annonceur1_campaign2_visite_2pages.csv',
    'annonceur1/annonceur1_campaign2_visite_engagee.csv',
    'annonceur1/annonceur1_campaign3_visite_2pages.csv',
    'annonceur1/annonceur1_campaign3_visite_engagee.csv',
    'annonceur1/annonceur1_campaign4_visite_2pages.csv',
    'annonceur1/annonceur1_campaign4_visite_engagee.csv',
):
    # The HDF key is the CSV file name without its directory and extension.
    key = os.path.splitext(os.path.basename(file))[0]

    df = pd.read_csv(f'Data/{file}', usecols=[1, 2, 3, 4])
    reduce_mem_usage(df)  # downcasts df in place; the returned tuple is not needed
    df.to_hdf('Data/annonceur1/annonceur1.hdf', key=key, complevel=9)

    # Daily aggregate: keep the part of impression_date before the first
    # space, parse it as a date, then count rows per combination.
    day_text = df['impression_date'].str.split(' ').str[0]
    df['date'] = pd.to_datetime(day_text, format='%Y-%m-%d')
    daily = (
        df.groupby(['date', 'group', 'view', 'is_conv'])
        .size()
        .rename('count')
    )
    daily.to_hdf('Data/annonceur1/annonceur1_daily.hdf', key=key, complevel=9)

Memory usage of properties dataframe is : 60.19038391113281  MB
col impression_date
col view
******************************
Column:  view
dtype before:  int64
dtype after:  uint8
******************************
col group
col is_conv
******************************
Column:  is_conv
dtype before:  int64
dtype after:  uint8
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  33.85712432861328  MB
This is  56.25005545503934 % of the initial size
Memory usage of properties dataframe is : 60.19038391113281  MB
col impression_date
col view
******************************
Column:  view
dtype before:  int64
dtype after:  uint8
******************************
col group
col is_conv
******************************
Column:  is_conv
dtype before:  int64
dtype after:  uint8
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  33.85712432861328  MB
This is  56.25005545503934 % of the initial size
Memory usage of properties dataframe is : 16

In [154]:
# Build annonceur2.hdf (row-level data) and annonceur2_daily.hdf (daily counts),
# storing one HDF key per source CSV.
for file in (
    'annonceur2/annonceur2_campaign1_achat.csv',
    'annonceur2/annonceur2_campaign1_visite_page_produit.csv',
    'annonceur2/annonceur2_campaign1_visite_panier.csv',
):
    # The HDF key is the CSV file name without its directory and extension.
    key = os.path.splitext(os.path.basename(file))[0]

    df = pd.read_csv(f'Data/{file}', usecols=[1, 2, 3, 4])
    reduce_mem_usage(df)  # downcasts df in place; the returned tuple is not needed
    df.to_hdf('Data/annonceur2/annonceur2.hdf', key=key, complevel=9)

    # Daily aggregate: keep the part of impression_date before the first
    # space, parse it as a date, then count rows per combination.
    day_text = df['impression_date'].str.split(' ').str[0]
    df['date'] = pd.to_datetime(day_text, format='%Y-%m-%d')
    daily = (
        df.groupby(['date', 'group', 'view', 'is_conv'])
        .size()
        .rename('count')
    )
    daily.to_hdf('Data/annonceur2/annonceur2_daily.hdf', key=key, complevel=9)

Memory usage of properties dataframe is : 377.19618225097656  MB
col impression_date
col view
******************************
Column:  view
dtype before:  int64
dtype after:  uint8
******************************
col group
col is_conv
******************************
Column:  is_conv
dtype before:  int64
dtype after:  uint8
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  212.1728858947754  MB
This is  56.25000884913545 % of the initial size
Memory usage of properties dataframe is : 377.19618225097656  MB
col impression_date
col view
******************************
Column:  view
dtype before:  int64
dtype after:  uint8
******************************
col group
col is_conv
******************************
Column:  is_conv
dtype before:  int64
dtype after:  uint8
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  212.1728858947754  MB
This is  56.25000884913545 % of the initial size
Memory usage of properties dataframe is : 

## 3. Example for loading one particular DataFrame :

Load the raw (row-level) data:

In [195]:
# Read one row-level frame back from the annonceur2 HDF file; the key is the
# source CSV's file name without directory or extension.
df = pd.read_hdf('Data/annonceur2/annonceur2.hdf', key='annonceur2_campaign1_visite_panier')

In [196]:
# Summarise the loaded frame: row count, per-column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12359962 entries, 0 to 12359961
Data columns (total 4 columns):
impression_date    object
view               uint8
group              object
is_conv            uint8
dtypes: object(2), uint8(2)
memory usage: 306.5+ MB


Load only daily (aggregated) data :

In [7]:
# Read the daily aggregate stored under the same key in the *_daily.hdf file:
# a 'count' Series indexed by (date, group, view, is_conv).
daily = pd.read_hdf('Data/annonceur2/annonceur2_daily.hdf', key='annonceur2_campaign1_visite_panier')
# A bare last expression makes the notebook display the Series.
daily

date        group  view  is_conv
2018-02-01  A      0     0          134300
                   1     0           43549
                         1               1
            B      0     0          148031
                   1     0           45630
2018-02-02  A      0     0          159673
                   1     0           42602
            B      0     0          165170
                   1     0           44181
2018-02-03  A      0     0          186806
                   1     0           44659
                         1               1
            B      0     0          189884
                   1     0           45580
2018-02-04  A      0     0          177850
                   1     0           42578
                         1               1
            B      0     0          183020
                   1     0           43664
2018-02-05  A      0     0          142542
                   1     0           38191
                         1               1
            B      0 