# Background

This notebook profiles the `pillaralgos` algorithms and helper code in order to reduce their execution time.

In [9]:
import pandas as pd
import numpy as np
import json
from pillaralgos.helpers import data_handler as dh

In [10]:
from pillaralgos import algo1,algo2,algo3_0,algo3_5

In [11]:
# Load the raw chat export; the `with` block closes the file handle as soon
# as parsing finishes instead of leaving it open for the life of the kernel.
with open('data/big_data.json') as f:
    data = json.load(f)

In [13]:
# NOTE(review): `df` is only assigned in the next cell shown (In [12]); the
# execution counts indicate that cell ran first. On a fresh top-to-bottom run
# this line would raise NameError -- move it below the assignment.
df.shape

(103928, 11)

In [12]:
# Build the chat dataframe with the packaged helper (presumably the released
# counterpart of the `organize_twitch_chat` prototype defined further down -- TODO confirm).
df = dh.organize_twitch_chat(data)

Timing results (single `%%time` runs, not `%timeit` averages) as of `4/11/21 12:17am EST`

| algo1  | algo2        | algo3_0 | algo3_5 |
|--------|--------------|---------|---------|
|3.4 sec | 3 min 14 sec |39.4 sec | 28 sec  |

In [4]:
%%time

# algo1 is given the raw loaded JSON (`data`), not the dataframe `df`.
json1 = algo1.run(data)

CPU times: user 3.36 s, sys: 51.6 ms, total: 3.41 s
Wall time: 3.41 s


In [8]:
%%time

# NOTE(review): besides the SettingWithCopyWarning, this run prints a
# DataFrame.info() dump followed by `None` -- looks like a leftover
# `print(df.info())` debug line inside algo2; verify in the library.
json2 = algo2.run(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hour_list[i]['hour'] = i


<class 'pandas.core.frame.DataFrame'>
Int64Index: 376 entries, 0 to 375
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   created_at   376 non-null    datetime64[ns]
 1   updated_at   376 non-null    datetime64[ns]
 2   _id          376 non-null    int64         
 3   type         376 non-null    category      
 4   body         376 non-null    object        
 5   bits_spent   0 non-null      float64       
 6   emoticons    20 non-null     object        
 7   fragments    376 non-null    object        
 8   is_action    376 non-null    bool          
 9   user_badges  229 non-null    object        
 10  user_color   320 non-null    object        
 11  hour         376 non-null    int64         
dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(2), object(5)
memory usage: 33.2+ KB
None
CPU times: user 3min 14s, sys: 216 ms, total: 3min 15s
Wall time: 3min 14s


In [6]:
%%time

# NOTE(review): the run emits SettingWithCopyWarning for `chunk['hour'] = i`
# and `chunk['chunk'] = x`; those assignments are not in this notebook, so
# they presumably live inside algo3_0 or a helper it calls.
json3_0 = algo3_0.run(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['hour'] = i
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['chunk'] = x


CPU times: user 39.4 s, sys: 62.9 ms, total: 39.5 s
Wall time: 39.4 s


In [7]:
%%time

# NOTE(review): same two SettingWithCopyWarning lines as the algo3_0 run
# (`chunk['hour'] = i`, `chunk['chunk'] = x`) -- presumably shared code.
json3_5 = algo3_5.run(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['hour'] = i
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['chunk'] = x


CPU times: user 27.1 s, sys: 893 ms, total: 28 s
Wall time: 28 s


# organize_twitch_chat()

In [9]:
class dictExtractor:
    def __init__(self, my_series, label = ''):
        '''
        Extracts dictionaries from series into a new dict using the
        longest dictionary's keys. Converts new dict into df, stored
        as `self.result`.

        Only the keys of the longest dictionary (the first one, on ties)
        become columns: keys that appear solely in shorter dictionaries are
        not extracted, and rows lacking a key get np.nan for it.

        input
        -----
        my_series: pd.Series
            A column from twitch dataframe where each row is a dict
        label: str
            What will be appended to the end of each col

        attributes
        ----------
        max_d:
            Keys of the longest dictionary (empty if `my_series` is empty)
        new_dict: dict
            Maps each key in `max_d` to the list of extracted values, one
            entry per row of `my_series`
        result: pd.DataFrame
            `new_dict` as a dataframe with `label` appended to every column
            name. It has a fresh 0..n-1 index and does not reuse
            `my_series.index`.
        '''
        # number of keys in every row's dict
        lengths = my_series.map(len)
        if lengths.empty:
            # nothing to extract: no keys, so `result` ends up an empty frame
            widest = {}
        else:
            # Use the position (not the index label) of the first dict with
            # the most keys, so the lookup is correct for any series index
            # and for any maximum length (previously hard-coded to 8).
            widest = my_series.iloc[int(lengths.to_numpy().argmax())]
        max_d = widest.keys()
        self.max_d = max_d
        # initiate new dict: one empty list per key
        self.new_dict = {k: [] for k in max_d}
        # extract dict values into new dict, row by row
        for row in my_series:
            self.keys_iterator(row)
        # store as df
        self.result = pd.DataFrame.from_dict(self.new_dict)
        # df.add_suffix() is actually 0.25 seconds slower
        self.result.columns = [col + label for col in self.result.columns]

    def keys_iterator(self, my_dict):
        '''
        Appends one value per key in `max_d` to `new_dict`: the value stored
        in `my_dict` under that key, or np.nan when the key is absent.
        '''
        for k in self.max_d:
            self.new_dict[k].append(my_dict.get(k, np.nan))

In [10]:
def organize_twitch_chat(data):
    '''
    Flattens raw twitch chat json into a dataframe with one row per record.

    The `commenter` and `message` dicts of every record are expanded into
    their own columns (via `dictExtractor`), the columns are sorted by name,
    unused columns are dropped and the remaining ones are cast to
    datetime/int/bool/category dtypes.

    input
    -----
    data: list
        list of dictionaries in json format, loaded with the `open` context manager.
        Each record needs the keys 'created_at', 'updated_at', 'commenter'
        and 'message'.

    output
    ------
    df: pd.DataFrame
        Column names end in `_mess` (record-level timestamps and `message`
        fields) or `_id` (`commenter` fields). The columns listed in
        `unused` below are not returned.
    '''
    records = pd.DataFrame.from_records(data)
    base = records[['created_at','updated_at','commenter','message']].add_suffix('_mess')

    # expand the nested dicts; suffixes keep message and commenter fields apart
    message_cols = dictExtractor(base['message_mess'], label='_mess').result
    commenter_cols = dictExtractor(base['commenter_mess'], label='_id').result

    # the raw dict columns duplicate what was just expanded
    timestamps = base.drop(columns=['message_mess','commenter_mess'])
    combined = pd.concat([timestamps, commenter_cols, message_cols], axis=1)

    # alphabetical column order
    ordered = combined[sorted(combined.columns)]

    unused = [
        'display_name_id',
        'logo_id',
        'name_id',
        'user_badges_mess',
        'user_color_mess',
        'user_notice_params_mess',
    ]
    trimmed = ordered.drop(columns=unused)

    # cast to the intended datetime/int/bool/category dtypes
    target_dtypes = {
        '_id_id': int,
        'bio_id': 'category',
        'created_at_id': 'datetime64[ns]',
        'created_at_mess': 'datetime64[ns]',
        'updated_at_id': 'datetime64[ns]',
        'updated_at_mess': 'datetime64[ns]',
        'is_action_mess': bool,
        'type_id': 'category',
    }
    return trimmed.astype(target_dtypes)