In [1]:
import configparser
from functools import reduce
import os
import sys

import pandas as pd

In [2]:
import logging
# Root logger at DEBUG. Each record prints its level name (left-aligned, padded
# to 8 characters), the line number of the logging call in brackets, then the message.
logging.basicConfig(level=logging.DEBUG, 
                    format='%(levelname)-8s [%(lineno)d] %(message)s')

In [4]:
# Project settings: config.ini must define ProjectDirectory under [DEFAULT].
config = configparser.ConfigParser()
# ConfigParser.read() skips unreadable files silently and returns the list of
# files it actually parsed, so an empty result means config.ini was not loaded.
if not config.read('config.ini'):
    raise FileNotFoundError("Could not read 'config.ini' from the working directory.")
# os.path.join works whether or not ProjectDirectory ends with a separator;
# the trailing '' keeps the trailing separator on df_dir.
df_dir = os.path.join(config['DEFAULT']['ProjectDirectory'], 'dataframe', '')

In [6]:
def read_series_or_df(filen):
    """
    Unpickle ``filen`` and hand it back as a DataFrame.

    A pickled Series becomes a one-column frame named ``freq``. A pickled
    DataFrame has its index moved into columns and is then re-indexed by
    ``nsubj``, ``ROOT`` and ``dobj``. Any other pickled object raises
    ``NotImplementedError``.
    """
    loaded = pd.read_pickle(filen)
    if isinstance(loaded, pd.DataFrame):
        flat = loaded.reset_index()
        return flat.set_index(['nsubj', 'ROOT', 'dobj'])
    if isinstance(loaded, pd.Series):
        return loaded.to_frame(name='freq')
    raise NotImplementedError

In [27]:
def sum_tenth(i=0):
    """
    Sum every shard ``freq{k}{j}{i}.pkl`` (k, j in 0..9) into ``freq{i}.pkl``.

    Shards are looked up in ``df_dir`` and loaded with ``read_series_or_df``;
    files that do not exist are skipped with a warning. Shards are aligned on
    their index and the union of all keys is kept, so a key that first shows
    up in a later shard still contributes its count.

    Original timing note: 13 min.

    Parameters
    ----------
    i : int, default 0
        Final digit of the shard file names to combine.

    Returns
    -------
    pandas.DataFrame
        Totals cast to int and sorted by ``freq`` in descending order. The
        same frame is pickled to ``freq{i}.pkl`` inside ``df_dir``.

    Raises
    ------
    FileNotFoundError
        If none of the expected shard files exists.
    """
    df = None
    for j in range(10):
        # The inner loop varies the first digit, so the group read is freqx{j}{i}.
        logging.info(f'Adding freqx{j}{i}')
        for k in range(10):
            filen = os.path.join(df_dir, f'freq{k}{j}{i}.pkl')
            if not os.path.isfile(filen):
                logging.warning('File {} not exists.'.format(filen))
                continue
            df0 = read_series_or_df(filen)
            # add(fill_value=0) aligns on the union of both indexes. Reindexing
            # df0 onto df.index instead would silently drop every key that is
            # absent from the first shard read.
            df = df0 if df is None else df.add(df0, fill_value=0)
    if df is None:
        raise FileNotFoundError(
            'No freq??{}.pkl shard found in {}'.format(i, df_dir))
    logging.info('Pickling dataframe..')
    # Index alignment can upcast the counts to float; store them as integers.
    df = df.astype(int)
    df = df.sort_values('freq', ascending=False)
    df.to_pickle(os.path.join(df_dir, f'freq{i}.pkl'))
    return df

In [31]:
def sum_half():
    """
    Sum the ten tables ``freq0.pkl`` .. ``freq9.pkl`` into ``freq.pkl``.

    Inputs are looked up in ``df_dir`` and loaded with ``read_series_or_df``;
    files that do not exist are skipped with a warning. Tables are aligned on
    their index and the union of all keys is kept, so a key missing from the
    first table read is still counted.

    Returns
    -------
    pandas.DataFrame
        Totals cast to int and sorted by ``freq`` in descending order. The
        same frame is pickled to ``freq.pkl`` inside ``df_dir``.

    Raises
    ------
    FileNotFoundError
        If none of the ten input files exists.
    """
    df = None
    for i in range(10):
        logging.info(f'Adding freq{i}')
        filen = os.path.join(df_dir, f'freq{i}.pkl')
        if not os.path.isfile(filen):
            logging.warning('File {} not exists.'.format(filen))
            continue
        df0 = read_series_or_df(filen)
        # add(fill_value=0) aligns on the union of both indexes. Reindexing df0
        # onto df.index instead would silently drop every key that is absent
        # from the first table read.
        df = df0 if df is None else df.add(df0, fill_value=0)
    if df is None:
        raise FileNotFoundError(
            'No freq?.pkl table found in {}'.format(df_dir))
    logging.info('Pickling dataframe..')
    # Index alignment can upcast the counts to float; store them as integers.
    df = df.astype(int)
    df = df.sort_values('freq', ascending=False)
    df.to_pickle(os.path.join(df_dir, 'freq.pkl'))
    return df

In [32]:
# Run the merge; as the last expression of the cell, the returned frame is displayed.
sum_half()

INFO     [4] Adding freq0
INFO     [4] Adding freq1
INFO     [4] Adding freq2
INFO     [4] Adding freq3
INFO     [4] Adding freq4
INFO     [4] Adding freq5
INFO     [4] Adding freq6
INFO     [4] Adding freq7
INFO     [4] Adding freq8
INFO     [4] Adding freq9
INFO     [14] Pickling dataframe..


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,freq
nsubj,ROOT,dobj,Unnamed: 3_level_1
I,think,,9398750
it,be,,7408077
I,know,,4475070
that,be,,4339360
I,have,,3794298
...,...,...,...
investor,borrow,fund,1
Phimmasone,broker,,1
investor,penalise,firm,1
Phins,able,,1


In [5]:
def sum_frames(common_suff):
    """
    Sum ``freq0{suff}.pkl`` .. ``freq9{suff}.pkl`` into ``freq{suff}.pkl``.

    Inputs are looked up in ``df_dir`` and loaded with ``read_series_or_df``.
    Every input, including the first, is skipped with a warning when the file
    does not exist. Tables are aligned on their index and the union of all
    keys is kept, so a key missing from the first table read is still counted.

    Parameters
    ----------
    common_suff : str or int
        Text shared by the input names after their leading digit; it is also
        the suffix of the output file name.

    Returns
    -------
    pandas.DataFrame
        Totals cast to int and sorted by ``freq`` in descending order. The
        same frame is pickled to ``freq{suff}.pkl`` inside ``df_dir``.

    Raises
    ------
    FileNotFoundError
        If none of the ten input files exists.
    """
    df = None
    for i in range(10):
        # Format the file name before joining so braces in df_dir cannot be
        # mistaken for format fields.
        filen = os.path.join(df_dir, 'freq{}{}.pkl'.format(i, common_suff))
        if not os.path.isfile(filen):
            logging.warning('File {} not exists.'.format(filen))
            continue
        df0 = read_series_or_df(filen)
        # add(fill_value=0) aligns on the union of both indexes. Reindexing df0
        # onto df.index instead would silently drop every key that is absent
        # from the first table read.
        df = df0 if df is None else df.add(df0, fill_value=0)
    if df is None:
        raise FileNotFoundError(
            'No freq?{}.pkl table found in {}'.format(common_suff, df_dir))
    logging.info('Pickling dataframe..')
    # Index alignment can upcast the counts to float; store them as integers.
    df = df.astype(int)
    df = df.sort_values('freq', ascending=False)
    df.to_pickle(os.path.join(df_dir, 'freq{}.pkl'.format(common_suff)))
    return df

In [15]:
def sum_frames_temp():
    """
    Sum every shard ``freq{i}0{j}.pkl`` (i, j in 0..9) into ``freqx0x.pkl``.

    Original note: Creating freq0 takes 10 min.

    Shards are looked up in ``df_dir`` and loaded with ``read_series_or_df``;
    files that do not exist are skipped with a warning. Both digits run over
    0..9, so ``freq{i}00`` and ``freq00{j}`` are included alongside
    ``freq000``. Shards are aligned on their index and the union of all keys
    is kept, so a key missing from the first shard read is still counted.

    Returns
    -------
    pandas.DataFrame
        Totals cast to int and sorted by ``freq`` in descending order. The
        same frame is pickled to ``freqx0x.pkl`` inside ``df_dir``.

    Raises
    ------
    FileNotFoundError
        If none of the expected shard files exists.
    """
    df = None
    for i in range(10):
        logging.info(f'Reading {i}0x..')
        for j in range(10):
            filen = os.path.join(df_dir, f'freq{i}0{j}.pkl')
            if not os.path.isfile(filen):
                logging.warning('File {} not exists.'.format(filen))
                continue
            df0 = read_series_or_df(filen)
            # add(fill_value=0) aligns on the union of both indexes. Reindexing
            # df0 onto df.index instead would silently drop every key that is
            # absent from the first shard read.
            df = df0 if df is None else df.add(df0, fill_value=0)
    if df is None:
        raise FileNotFoundError(
            'No freq?0?.pkl shard found in {}'.format(df_dir))
    logging.info('Pickling dataframe..')
    # Index alignment can upcast the counts to float; store them as integers.
    df = df.astype(int)
    df = df.sort_values('freq', ascending=False)
    df.to_pickle(os.path.join(df_dir, 'freqx0x.pkl'))
    return df

In [17]:
# NOTE(review): `df` is not assigned at top level in any visible cell; presumably
# it holds the frame returned by one of the sum_* functions in an earlier run (confirm).
# reset_index(inplace=True) mutates `df`, so every run of this cell changes it again.
df.reset_index(inplace=True)
# Show only the rows whose dobj and nsubj both differ from the empty string.
df[(df.dobj!='')&(df.nsubj!='')][['nsubj', 'ROOT', 'dobj', 'freq']]

Unnamed: 0,nsubj,ROOT,dobj,freq
175,it,be,what,391429
178,I,tell,you,389719
184,I,do,what,379494
201,you,do,what,357092
252,you,have,question,288926
...,...,...,...,...
7238035,Asura,wreck,they,1
7238036,II,feed,bran,1
7238037,II,range,building,1
7238038,II,hold,campaign,1
