In [7]:
import ast
import os
import time

import pandas as pd
import numpy as np

In [8]:
def explode_dictionary(df, col_name, replace=False):
    """
    Expand a column of dict literals into one new column per key.

    Every value in ``df[col_name]`` is handled as follows:
      * missing (null) values are treated as an empty dict,
      * strings are parsed with ``ast.literal_eval``,
      * anything else (e.g. an already-parsed dict) is used unchanged, so the
        function can be re-applied to a column that was parsed earlier.

    The parsed values are flattened with ``pd.json_normalize`` and each
    resulting column is named ``col_name + '_' + <flattened key>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_name``. It is not modified.
    col_name : str
        Name of the column holding the dict literals.
    replace : bool, default False
        If True, ``col_name`` is left out of the result. If False, it is kept
        (holding the parsed values) alongside the new columns.

    Returns
    -------
    pandas.DataFrame
        A new frame carrying the same index as ``df``.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a string is not a valid
        Python literal.
    """
    def _parse(value):
        # Only strings need parsing; literal_eval rejects dict objects.
        return ast.literal_eval(value) if isinstance(value, str) else value

    raw = df[col_name]
    parsed = [
        {} if missing else _parse(value)
        for value, missing in zip(raw, raw.isnull())
    ]

    new_df = pd.json_normalize(parsed)
    new_df.columns = [col_name + '_' + x for x in new_df.columns]
    # json_normalize numbers its rows 0..n-1; reuse the caller's index so the
    # column-wise concat below pairs rows by position instead of by label.
    new_df.index = df.index

    if replace:
        cols = [x for x in df.columns if x != col_name]
        base = df[cols]
    else:
        # Work on a copy so the caller's frame keeps its raw values.
        base = df.copy()
        base[col_name] = pd.Series(parsed, index=df.index, dtype=object)

    return pd.concat([base, new_df], axis=1)

In [9]:
def separate_column(df, col_name, separator):
    """
    Keep only the text after the last ``separator`` in a string column.

    Intended for stripping prefixes that are not needed for analysis
    (usually base URLs). The column is overwritten on ``df`` itself and the
    same frame is returned.
    """
    pieces = df[col_name].str.split(separator)
    last_piece = pieces.str[-1]
    df[col_name] = last_piece
    return df

In [10]:
def clean_column(df, col_name):
    """
    Apply the column-specific cleaning step for ``col_name``.

    * ``'contained'``: expand the dict column with ``explode_dictionary``,
      then make sure ``contained_actorId`` and ``contained_verb`` both exist.
      A column that is present has its prefix stripped with
      ``separate_column`` (``'|'`` for the actor id, ``'/'`` for the verb);
      a column that is absent is added empty. Each column is handled on its
      own, so one missing key does not wipe out the other.
    * ``'verb'``: strip everything up to the last ``'/'``.
    * any other name: the frame is returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_name``.
    col_name : str
        Column to clean.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    if col_name == 'contained':
        df = explode_dictionary(df, col_name)
        nested_columns = (
            ('contained_actorId', '|'),
            ('contained_verb', '/'),
        )
        for nested_col, separator in nested_columns:
            if nested_col in df.columns:
                df = separate_column(df, nested_col, separator)
            else:
                # Key never appeared in this file: add an empty placeholder
                # so every yielded frame has the same columns.
                df[nested_col] = pd.Series(dtype=str)

    elif col_name in ['verb']:
        df = separate_column(df, col_name, '/')

    return df

In [11]:
def column_generator(file_list, col_name, col_type, base_dir=None):
    """
    Lazily load one column from each CSV file, yielding a cleaned frame per file.

    Files are processed one at a time so only a single file's data has to be
    built at once.

    Parameters
    ----------
    file_list : iterable of str
        CSV file names, relative to the data directory.
    col_name : str
        Column to read next to ``id``. If a file cannot be read with that
        column, the column is created empty for that file.
    col_type : type or dtype
        dtype used for ``col_name`` when reading.
    base_dir : str, optional
        Directory containing the files. Defaults to the module-level
        ``data_dir``.

    Yields
    ------
    pandas.DataFrame
        The result of ``clean_column`` applied to a frame with the columns
        ``date`` (the file name with ``'.csv'`` removed), ``id`` and
        ``col_name``.
    """
    # Fall back to the notebook-level setting only when no directory is given.
    root = data_dir if base_dir is None else base_dir

    for f in file_list:
        path = os.path.join(root, f)

        # Import single column, empty if missing
        try:
            df = pd.read_csv(path,
                             usecols=['id', col_name],
                             dtype={'id': str, col_name: col_type})
        except ValueError:
            # read_csv raises ValueError when a requested column is absent.
            # Catching only that lets interrupts and I/O errors surface
            # immediately instead of triggering a second read.
            df = pd.read_csv(path,
                             usecols=['id'],
                             dtype={'id': str})
            df[col_name] = pd.Series(dtype=col_type)

        df['date'] = f.replace('.csv', '')
        df = df[['date', 'id', col_name]]

        # Formatting (overrides existing column)
        df = clean_column(df, col_name)

        yield df

In [12]:
# Directory holding the CSV exports; also read by column_generator.
data_dir = './data/'
# os.listdir returns entries in arbitrary order, so sort before sampling to
# get the same five files on every run and every machine.
file_list = sorted(file for file in os.listdir(data_dir) if file.endswith('.csv'))
# Work on a small sample while developing.
file_list = file_list[:5]

In [17]:
# Load the 'contained' column from every sampled file and report the
# elapsed wall-clock seconds.
t1 = time.perf_counter()
data = list(column_generator(file_list, 'contained', str))
print(time.perf_counter() - t1)

21.35816670001077


In [18]:
# Stack the per-file frames into one table with a fresh 0..n-1 index.
df = pd.concat(data, ignore_index=True)
df.head()

Unnamed: 0,date,id,contained,contained_actorId,contained_verb
0,2021-07-01,1960ccd0-e04d-11eb-a053-a183b5fba7a3,{},,
1,2021-07-01,1b2c16f0-e04d-11eb-a053-a183b5fba7a3,{},,
2,2021-07-01,1b4fcb90-e04d-11eb-a053-a183b5fba7a3,{},,
3,2021-07-01,1b60bb80-e04d-11eb-a053-a183b5fba7a3,{},,
4,2021-07-01,1e168cb0-e04d-11eb-a053-a183b5fba7a3,{},,
