In [1]:
import configparser
from functools import reduce
import os
import sys

import pandas as pd

In [2]:
import logging
# Configure the root logger: emit everything from DEBUG upwards, with each
# record shown as "<LEVEL, padded to 8> [<source line number>] <message>".
logging.basicConfig(
    format='%(levelname)-8s [%(lineno)d] %(message)s',
    level=logging.DEBUG,
)

In [4]:
# Load project settings from ``config.ini`` in the current working directory.
# ConfigParser.read() silently skips a file it cannot find; in that case the
# 'ProjectDirectory' lookup below raises KeyError.
config = configparser.ConfigParser()
config.read('config.ini')

# Directory holding the pickled frequency tables.  os.path.join is used
# instead of string concatenation so the result is correct whether or not
# 'ProjectDirectory' ends with a path separator; the trailing '' component
# keeps a trailing separator on the result, as the concatenated form had.
df_dir = os.path.join(config['DEFAULT']['ProjectDirectory'], 'dataframe', '')

In [6]:
def read_series_or_df(filen):
    """Unpickle ``filen`` and return its content as a DataFrame.

    * A pickled ``Series`` is converted to a single-column frame whose
      column is named ``'freq'``.
    * A pickled ``DataFrame`` has its current index moved back into columns
      and is then indexed by the ``nsubj``, ``ROOT`` and ``dobj`` columns.

    Raises:
        NotImplementedError: if the pickle holds any other kind of object.
    """
    # NOTE: pd.read_pickle executes pickle data; only use on trusted files.
    loaded = pd.read_pickle(filen)

    if isinstance(loaded, pd.DataFrame):
        flattened = loaded.reset_index()
        return flattened.set_index(['nsubj', 'ROOT', 'dobj'])

    if isinstance(loaded, pd.Series):
        return loaded.to_frame(name='freq')

    raise NotImplementedError

In [27]:
def sum_hunderd_files(i=0):
    """Sum up to one hundred frequency tables sharing the last digit ``i``.

    Reads every existing ``freq{k}{j}{i}.pkl`` in ``df_dir`` (``k`` and ``j``
    each ranging over 0..9), adds the tables together, casts the totals to
    ``int``, sorts by ``'freq'`` in descending order and pickles the result
    to ``freq{i}.pkl`` in ``df_dir``.

    Missing files are skipped with a warning.  The original author noted a
    run time of about 13 minutes.

    Args:
        i: Trailing digit shared by the input file names; also used as the
            suffix of the output file name.

    Returns:
        The summed, sorted DataFrame.

    Raises:
        FileNotFoundError: if none of the input files exists.
    """
    df = None
    for j in range(10):
        logging.info(f'Adding freqx{j}{i}')
        for k in range(10):
            filen = os.path.join(df_dir, f'freq{k}{j}{i}.pkl')
            if not os.path.isfile(filen):
                logging.warning('File %s not exists.', filen)
                continue
            df0 = read_series_or_df(filen)
            if df is None:
                df = df0
            else:
                # Align on the union of both indexes.  Reindexing df0 onto
                # df.index would silently discard every key that is absent
                # from the first file read, undercounting the totals.
                df = df.add(df0, fill_value=0)
    if df is None:
        raise FileNotFoundError(
            f'No freq??{i}.pkl input files found in {df_dir}')
    logging.info('Pickling dataframe..')
    # Alignment with fill_value may have promoted the counts to float.
    df = df.astype(int)
    df = df.sort_values('freq', ascending=False)
    df.to_pickle(os.path.join(df_dir, f'freq{i}.pkl'))
    return df

In [31]:
def sum_second_half():
    """Sum the frequency tables ``freq5.pkl`` .. ``freq9.pkl``.

    Reads each existing ``freq{i}.pkl`` (``i`` in 5..9) from ``df_dir``, adds
    the tables together, casts the totals to ``int``, sorts by ``'freq'`` in
    descending order and pickles the result to ``freq5to9.pkl`` in
    ``df_dir``.  Missing files are skipped with a warning.

    Returns:
        The summed, sorted DataFrame.

    Raises:
        FileNotFoundError: if none of the input files exists.
    """
    df = None
    for i in range(5, 10):
        basen = f'freq{i}.pkl'
        logging.info(f'Adding {basen}..')
        filen = os.path.join(df_dir, basen)
        if not os.path.isfile(filen):
            logging.warning('File %s not exists.', filen)
            continue
        df0 = read_series_or_df(filen)
        if df is None:
            df = df0
        else:
            # Align on the union of both indexes.  Reindexing df0 onto
            # df.index would silently discard every key that is absent from
            # the first file read, undercounting the totals.
            df = df.add(df0, fill_value=0)
    if df is None:
        raise FileNotFoundError(
            f'None of freq5.pkl..freq9.pkl found in {df_dir}')
    logging.info('Pickling dataframe..')
    # Alignment with fill_value may have promoted the counts to float.
    df = df.astype(int)
    df = df.sort_values('freq', ascending=False)
    df.to_pickle(os.path.join(df_dir, 'freq5to9.pkl'))
    return df

In [5]:
def sum_ten_files(common_suff):
    """Sum the ten frequency tables ``freq{d}{common_suff}.pkl``, d in 0..9.

    Reads each existing input file from ``df_dir``, adds the tables
    together, casts the totals to ``int``, sorts by ``'freq'`` in descending
    order and pickles the result to ``freq{common_suff}.pkl`` in ``df_dir``.
    Missing files (including the first one) are skipped with a warning.

    Args:
        common_suff: Suffix shared by the ten input file names; also used as
            the suffix of the output file name.

    Returns:
        The summed, sorted DataFrame.

    Raises:
        FileNotFoundError: if none of the input files exists.
    """
    df = None
    for i in range(10):
        # Format the file name before joining, so that braces occurring in
        # df_dir are never interpreted as format fields.
        filen = os.path.join(df_dir, 'freq{}{}.pkl'.format(i, common_suff))
        if not os.path.isfile(filen):
            logging.warning('File %s not exists.', filen)
            continue
        df0 = read_series_or_df(filen)
        if df is None:
            df = df0
        else:
            # Align on the union of both indexes.  Reindexing df0 onto
            # df.index would silently discard every key that is absent from
            # the first file read, undercounting the totals.
            df = df.add(df0, fill_value=0)
    if df is None:
        raise FileNotFoundError(
            'No freq?{}.pkl input files found in {}'.format(common_suff, df_dir))
    logging.info('Pickling dataframe..')
    # Alignment with fill_value may have promoted the counts to float.
    df = df.astype(int)
    df = df.sort_values('freq', ascending=False)
    df.to_pickle(os.path.join(df_dir, 'freq{}.pkl'.format(common_suff)))
    return df