# Background

This notebook benchmarks the `pillaralgos` algorithms and their helper code, with the goal of reducing their execution time.

In [1]:
import pandas as pd
import numpy as np
import json
from pillaralgos.helpers import data_handler as dh

In [2]:
from pillaralgos import algo1,algo2,algo3_0,algo3_5,brain

In [3]:
# Read the input JSON inside a context manager so the file handle is closed
# as soon as parsing finishes; UTF-8 is given explicitly so the result does
# not depend on the platform's default text encoding.
with open('data/lg_962598044.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

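Baseline timeit results as of `April 13, 2021 18:31 EST` for file `962598044.json` (the cell outputs further down come from a separate run, so their numbers differ from this table):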

| algo1   | algo2        | algo3_0 | algo3_5 | brain |
|---------|--------------|---------|---------|-------|
|1.59 sec | 1 min 6 sec  |22.0 sec | 11.6 sec| xx    |

In [4]:
%%timeit

# Timed statement: one full algo1 run over the JSON loaded into `data`.
# NOTE(review): names bound inside a %%timeit cell are presumably not kept in
# the notebook namespace — confirm no later cell relies on `json1`.
json1 = algo1.run(data)

2.33 s ± 29.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
%%timeit

# Timed statement: one full algo2 run over the JSON loaded into `data`.
# NOTE(review): names bound inside a %%timeit cell are presumably not kept in
# the notebook namespace — confirm no later cell relies on `json2`.
json2 = algo2.run(data)

1min 28s ± 3.11 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
%%timeit

# Timed statement: one full algo3_0 run over the JSON loaded into `data`.
# NOTE(review): names bound inside a %%timeit cell are presumably not kept in
# the notebook namespace — confirm no later cell relies on `json3_0`.
json3_0 = algo3_0.run(data)

30 s ± 340 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%%timeit

# Timed statement: one full algo3_5 run over the JSON loaded into `data`.
# NOTE(review): names bound inside a %%timeit cell are presumably not kept in
# the notebook namespace — confirm no later cell relies on `json3_5`.
json3_5 = algo3_5.run(data)

17.8 s ± 456 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
%%timeit

# Timed statement: one brain.run call over `data` with clip_length=2.
# NOTE(review): names bound inside a %%timeit cell are presumably not kept in
# the notebook namespace — confirm no later cell relies on `brain_r`.
brain_r = brain.run(data, clip_length=2)

2min 20s ± 3.13 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


# organize_twitch_chat()

In [9]:
class dictExtractor:
    '''
    Flattens a series of dicts into a dataframe with one column per key.

    The column set is taken from the dict with the most keys. Rows whose
    dict lacks one of those keys get `np.nan` in that column.

    attributes
    ----------
    max_d: dict keys view
        Keys of the longest dict in the series (empty if the series is empty)
    new_dict: dict
        Maps each key in `max_d` to the list of values collected row by row
    result: pd.DataFrame
        `new_dict` as a dataframe, with `label` appended to every column name
    '''
    def __init__(self, my_series, label = ''):
        '''
        Extracts dictionaries from series into a new dict using the
        longest dictionary's keys. Converts new dict into df, stored
        as `self.result`.

        input
        -----
        my_series: pd.Series
            A column from twitch dataframe where each row is a dict
        label: str
            What will be appended to the end of each col
        '''
        # Pick the dict with the most keys. `max` returns the first one on
        # ties and looks at the values only, so the choice does not depend on
        # a particular key count or on the series having a 0..n-1 index.
        # An empty series falls back to an empty dict (and an empty result).
        longest = max(my_series, key=len, default={})
        self.max_d = longest.keys()
        # initiate new dict: one empty value list per key
        self.new_dict = {k: [] for k in self.max_d}
        # extract dict values into new dict, one row at a time
        for row in my_series:
            self.keys_iterator(row)
        # store as df
        self.result = pd.DataFrame.from_dict(self.new_dict)
        # df.add_suffix() is actually 0.25 seconds slower
        self.result.columns = [col + label for col in self.result.columns]

    def keys_iterator(self, my_dict):
        '''
        Appends one row to `new_dict`: for every key in `max_d`, the value
        from `my_dict` if the key is present, otherwise np.nan.
        '''
        for k in self.max_d:
            self.new_dict[k].append(my_dict.get(k, np.nan))

In [10]:
def organize_twitch_chat(data):
    '''
    Turns json into dataframe. Expands the `commenter` and `message` dicts into
    their own columns. Selects only relevant columns.

    input
    -----
    data: list
        list of dictionaries in json format, loaded with the `open` context manager.
        The records must supply the keys 'created_at', 'updated_at',
        'commenter' and 'message'.

    output
    ------
    df: pd.DataFrame
        Columns are sorted alphabetically and named after where they came from:
            - top-level 'created_at' / 'updated_at' become
              'created_at_mess' / 'updated_at_mess'
            - each key of the 'commenter' dict becomes '<key>_id'
              (e.g. '_id_id', 'bio_id', 'type_id', 'created_at_id')
            - each key of the 'message' dict becomes '<key>_mess'
              (e.g. 'is_action_mess')
        The columns named in `bad_cols` are removed, and the id, timestamp,
        category and boolean columns are cast to proper dtypes.
    '''
    # keep the caller's list untouched under its own name
    records = pd.DataFrame.from_records(data) # convert to df
    df = records[['created_at','updated_at','commenter','message']].add_suffix('_mess')

    # expand the nested dicts into flat, suffixed columns
    messages = dictExtractor(df['message_mess'], label='_mess').result
    users = dictExtractor(df['commenter_mess'], label='_id').result

    df = df.drop(['message_mess','commenter_mess'], axis=1) # duplicate info
    df = pd.concat([df,users,messages],axis=1)
    df = df[sorted(df.columns)]

    bad_cols = [
                'display_name_id',
                'logo_id',
                'name_id',
                'user_badges_mess',
                'user_color_mess',
                'user_notice_params_mess'
            ]
    # The expanded columns come from the keys of a single template dict, so an
    # optional key may not be present for every input file. Skip columns that
    # are missing instead of raising KeyError.
    df = df.drop(bad_cols, axis=1, errors='ignore')
    # all vars were loaded as str. Change type to datetime/int/bool
    # NOTE(review): astype(bool) maps every non-empty string to True, so this
    # assumes 'is_action' already arrives as a JSON boolean — confirm.
    df = df.astype( {'_id_id':int,
                    'bio_id':'category',
                    'created_at_id':'datetime64[ns]',
                    'created_at_mess':'datetime64[ns]',
                    'updated_at_id':'datetime64[ns]',
                    'updated_at_mess':'datetime64[ns]',
                    'is_action_mess':bool,
                    'type_id':'category'}
                  )
    return df