In [18]:
import configparser
from functools import reduce
import os
import sys

import pandas as pd

In [2]:
import logging
logging.basicConfig(level=logging.DEBUG, 
                    format='%(levelname)-8s [%(lineno)d] %(message)s')

In [32]:
config = configparser.ConfigParser()
config.read('config.ini')
df_dir = config['DEFAULT']['ProjectDirectory']+'dataframe/'

In [15]:
def read_series_or_df(filen):
    data = pd.read_pickle(filen)
    if isinstance(data, pd.Series):
        return data.to_frame(name='freq')
    elif isinstance(data, pd.DataFrame):
        return data.reset_index().set_index(['nsubj', 'ROOT', 'dobj'])
    else:
        raise NotImplementedError



def sum_frames(common_suff):
    filen = os.path.join(df_dir, 'freq0{}.pkl').format(common_suff)
    df = read_series_or_df(filen)
    for i in range(1, 10):
        filen = os.path.join(df_dir, 'freq{}{}.pkl').format(i, common_suff)
        if not os.path.isfile(filen):
            logging.warning('File {} not exists.'.format(filen))
            continue
        df0 = read_series_or_df(filen)
        df += df0.reindex(df.index, fill_value=0)
    logging.info('Pickling dataframe..')
    df = df.astype(int)
    df = df.sort_values('freq', ascending=False)
    df.to_pickle(os.path.join(df_dir, 'freq{}.pkl').format(common_suff))
    return df


def sum_frames_temp():
    """
    Creating freq0 takes 10 min.
    """
    filen = os.path.join(df_dir, 'freq000.pkl')
    df = read_series_or_df(filen)
    for i in range(1, 10):
        logging.info(f'Reading {i}0x..')

        for j in range(1, 10):
            filen = os.path.join(df_dir, f'freq{i}0{j}.pkl')
            if not os.path.isfile(filen):
                logging.warning('File {} not exists.'.format(filen))
                continue
            df0 = read_series_or_df(filen)
            df += df0.reindex(df.index, fill_value=0)
    logging.info('Pickling dataframe..')
    df = df.astype(int)
    df = df.sort_values('freq', ascending=False)
    df.to_pickle(os.path.join(df_dir, 'freqx0x.pkl'))
    return df

In [16]:
%time df = sum_frames_temp()

INFO     [36] Reading 10x..
INFO     [36] Reading 20x..
INFO     [36] Reading 30x..
INFO     [36] Reading 40x..
INFO     [36] Reading 50x..
INFO     [36] Reading 60x..
INFO     [36] Reading 70x..
INFO     [36] Reading 80x..
INFO     [36] Reading 90x..
INFO     [45] Pickling dataframe..


CPU times: user 10min 38s, sys: 38.6 s, total: 11min 17s
Wall time: 11min 28s


In [17]:
df.reset_index(inplace=True)
df[(df.dobj!='')&(df.nsubj!='')][['nsubj', 'ROOT', 'dobj', 'freq']]

Unnamed: 0,nsubj,ROOT,dobj,freq
175,it,be,what,391429
178,I,tell,you,389719
184,I,do,what,379494
201,you,do,what,357092
252,you,have,question,288926
...,...,...,...,...
7238035,Asura,wreck,they,1
7238036,II,feed,bran,1
7238037,II,range,building,1
7238038,II,hold,campaign,1
