# Background

Where old code goes after upgraded to `helper` status. To keep algo notebooks free of clutter

In [2]:
import pandas as pd
import numpy as np
import datetime as dt
import json

# Helper Functions

NOTE: these were saved into `data_handler.py` helper library

## Organize dataset

In [1]:
def organize_twitch_chat(data):
    # all vars were loaded as str. Change type to datetime/int/bool
    data['created_at'] = pd.to_datetime(data['created_at'])
    data['updated_at'] = pd.to_datetime(data['updated_at'])
    
    df = data[['created_at','updated_at','commenter','message']]
    
    messages = df['message'].apply(pd.Series).drop(['fragments','user_color','user_notice_params'],axis=1)
    users = df['commenter'].apply(pd.Series)
    
    df = df.drop(['message','commenter'], axis=1) # duplicate info
    df = pd.concat([df,users,messages],axis=1)
    df = df.iloc[:,[0,1,2,3,4,5,6,9,10,11,12,13]] # select cols that arent duplicates
    
    return df

## Split into hour sections

In [4]:
class dfSplitter():
    def __init__(self, dataframe):
        '''
        Splits dataframe into multiple dataframes, each 1 hour long

        output:
        ------
        my_list: list
            List of dataframes
        '''
        # init function finds the first split
        dataframe = dataframe.sort_values("created_at")
        first = dataframe[dataframe['created_at'] <= dataframe.loc[0,'created_at'] + pd.Timedelta(hours = 1)]
        self.last_i = first.index.max()
        self.dataframe = dataframe
        self.result = [] # list to append starting timestamp + datasets to
        self.result.append(dataframe.iloc[0, 0]) # NOTE: assumes first col is always "created_at" col
        self.result.append(first)
        
    def find_rest(self):
        '''
        Uses last index of first split to find the others
        '''
        dataframe = self.dataframe
        last_i = self.last_i
        if last_i+1 != len(dataframe):
            new_df = dataframe.loc[last_i+1:,:] # clip df to start at last_i
            newest = new_df[new_df['created_at'] <= new_df.loc[last_i+1,'created_at'] + pd.Timedelta(hours=1)] # filter by hour
            self.result.append(newest) # store in list
            self.last_i = newest.index.max()
            
            self.find_rest() # repeat
        else:
            return dataframe # never actually used

## Split each section into X minute chunks

In [5]:
class xminChats():
    def __init__(self,dataframe, big_unique, min_= 2):
        '''
        Finds the percent unique chatters that chatted every min_ minutes
        
        input
        -----
        dataframe: pd.DataFrame
            Twitch chat dataframe organized and split by dfSplitter
        big_unique: int
            Total number of unique chatters for the entire Twitch stream
        min_: int
            Minute range to find timestamps for. Ex: Find 2 min long timestamps.
        '''
        
        # init function finds the first split
        dataframe = dataframe.sort_values("created_at")
        first = dataframe[dataframe['created_at'] <= dataframe.iloc[0,0] + pd.Timedelta(minutes = min_)]
        
        self.min_ = min_
        self.total_uniques = len(dataframe['_id'].unique())
        self.big_unique = big_unique
        
        self.last_i = first.index.max()
        self.dataframe = dataframe
        
        self.result = []
        self.result.append(first)
        
    def find_rest(self):
        '''
        Uses last index of first split to find the others
        '''
        dataframe = self.dataframe
        last_i = self.last_i
        if last_i+1 < dataframe.index.max(): # NOT len(dataframe), that bugs out and i dont wanna explain why
            new_df = dataframe.loc[last_i+1:,:] # clip df to start new min_ min calc at last_i+1
            newest = new_df[new_df['created_at'] <= new_df.loc[last_i+1,'created_at'] + pd.Timedelta(value=self.min_, unit='minutes')] # filter by minute
            self.result.append(newest) # store in list
            
            self.last_i = newest.index.max()
            self.find_rest() # repeat
        else:
            x=''
    


## Format results as json

In [6]:
def results_jsonified(results, first_sec, results_col):
    '''
    Converts timestamps to seconds, extracts results and makes the whole thing machine readable
    
    input
    -----
    results: pd.DataFrame
        DataFrame with at least the start (datetime) and end (datetime) columns, and a column to sort by.
    first_sec: datetime
        The very first timestamp in the entire twitch chat log. Used to calculate elapsed time in seconds.        
    results_col: str
        Column to sort values by (ascending=False)
        
    output
    ------
    json_results: list
        List of dictionaries with startTime and endTime keys, sorted by best results at top
    '''
    results['first_sec'] = first_sec # to calculate elapsed time from first sec, in seconds
    results = results.sort_values(results_col, ascending=False) # so json format is returned with top result being the most relevant
    json_results = []
    for i, row in results.iterrows():
        og = row['first_sec']
        start = row['start']
        end = row['end']
        
        start_sec = dt.timedelta.total_seconds(start-og) # find difference between first sec and given timestamp, convert that to seconds
        end_sec = dt.timedelta.total_seconds(end-og)
        
        dict_ = {"startTime":start_sec,
                 "endTime":end_sec}
        json_results.append(dict_)
        
    return json_results

## Save as json

In [7]:
def save_json(json_results, name):
    '''
    Saves json_results in txt file.
    '''
    str_  = '['
    for dict_ in json_results:
        str_ += str(dict_) + ', \n '
    str_ += ']'
    
    with open(f"exports/{name}.json",'w') as f:
        f.write(str_)
    print(f"Saved to data/{name}.json")

# Old emoticon getter

Used to create the answer for unit test. Found a bug where means weren't being filtered with it.

Keeping for posterity

In [35]:
import pandas as pd
import json
from pillaralgos.helpers import data_handler as dh

data = json.load(open("data/sample_lg.json"))

vid_id = data[0]['content_id']
vid_id

big_df = dh.organize_twitch_chat(data)
print(big_df.iloc[-1,0] - big_df.iloc[0,0])

def emo_extractor(my_list):
    return [emo_dict['_id'] for emo_dict in my_list]


def loc_extractor(my_list):
    return [[emo_dict['begin'], emo_dict['end']] for emo_dict in my_list]

all_emos = []
def emo_saver(my_list):
    for emo in my_list:
        all_emos.append(emo)
    return 'saved'

emo_col = big_df['emoticons'].dropna().reset_index(drop=True)
emo_id_list = emo_col.apply(lambda my_list: emo_extractor(my_list))
emo_id_list.apply(lambda my_list: emo_saver(my_list))
all_emos = pd.Series(all_emos)

emo_loc = emo_col.apply(lambda my_list: loc_extractor(my_list))
emo_body = big_df[~big_df['emoticons'].isna()]['body']

emo_data = emo_loc.copy().reset_index()
emo_data['body'] = emo_body.reset_index(drop=True)
emo_data = emo_data.drop('index',axis=1)

emo_data['id'] = emo_id_list

emo_dict = {}

for i, row in emo_data.iterrows():
    for x in range(len(row['emoticons'])):
        loc = row['emoticons'][x] # grab location
        begin = loc[0]
        end = loc[1] + 1
        emoji_name = row['body'][begin:end] # extract emoji text
        emoji_id = row['id'][x]
        
        if emoji_id not in emo_dict.keys():
            emo_dict[emoji_id] = emoji_name

num_used = all_emos.value_counts()

num_used = num_used.reset_index()
num_used.columns = ['emoji_id', 'occurrance']

num_used['emoji_name'] = num_used['emoji_id'].map(emo_dict)

num_used['label'] = ''
top_emoticons = num_used[num_used['occurrance'] > num_used['occurrance'].mean()]
top_emoticons['vid_id'] = vid_id
top_emoticons = top_emoticons[['vid_id','emoji_id','occurrance','emoji_name','label']]

# top_emoticons.to_csv("data/top_emoticons.csv", index=False)

0 days 10:57:21.037000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_emoticons['vid_id'] = vid_id


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 54
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   vid_id      55 non-null     object
 1   emoji_id    55 non-null     object
 2   occurrance  55 non-null     int64 
 3   emoji_name  55 non-null     object
 4   label       55 non-null     object
dtypes: int64(1), object(4)
memory usage: 2.6+ KB


In [37]:
top_emoticons.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 54
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   vid_id      55 non-null     object
 1   emoji_id    55 non-null     object
 2   occurrance  55 non-null     int64 
 3   emoji_name  55 non-null     object
 4   label       55 non-null     object
dtypes: int64(1), object(4)
memory usage: 2.6+ KB


In [39]:
all(df == top_emoticons)

True