# New emoticon getter

In [26]:
import pandas as pd
import json
from pillaralgos.helpers import data_handler as dh

data = json.load(open('data/sample_lg.json'))

class emoticonExtractor:
    def __init__(self, data, min_use='mean', limit=None):
        """
        Gets data ready for emo extraction. Initializes dicts, lists, etc.
        
        input
        -----
        data: list
            List of dictionaries, a json file opened with json.load(open(file))
        min_use: str, int, None
            'mean': Return only those emoticons who's count is > the mean occurrance
            int: Return only those emoticons who's count is > X. Use 0 to not filter.
        limit: int, None
            int: Return only the top X emoticons (using df.head(X))
            None: Return all emoticons
        """

        big_df = dh.organize_twitch_chat(data)
        print(big_df.iloc[-1, 0] - big_df.iloc[0, 0])
        self.vid_id = data[0]["content_id"]
        self.big_df = big_df
        self.all_emos = (
            []
        )  # all unique emo_ids, later used to pd.Series(self.all_emos).value_counts()
        self.limit = limit
        self.min_use = min_use
        
    def run(self):
        """
        Coordinates all functions to return a dataset of top emojis used
        """
        import pandas as pd

        body_has_emo = self.big_df[~self.big_df["emoticons"].isna()].reset_index(
            drop=True
        )

        body_has_emo["emo_id_list"] = body_has_emo["emoticons"].apply(
            lambda my_list: self.emo_extractor(my_list)
        )

        body_has_emo["emo_id_list"].apply(lambda my_list: self.emo_saver(my_list))
        body_has_emo["emo_loc"] = body_has_emo["emoticons"].apply(
            lambda my_list: self.loc_extractor(my_list)
        )

        emo_data = body_has_emo[["emo_id_list", "emo_loc", "body"]]
        self.emo_data = emo_data

        num_used = pd.Series(
            self.all_emos
        ).value_counts()  # count how many times each unique emo was used

        num_used = num_used.reset_index()  # turn series to df, rename cols
        num_used.columns = ["emoji_id", "occurrance"]

        emo_dict = self.emo_id_matcher(
            emo_data
        )  # create a dictionary of emo_id:emo_name
        num_used["emoji_name"] = num_used["emoji_id"].map(emo_dict)
        num_used["label"] = ""

        if type(self.min_use) == str:
            # grab everything greater than mean count
            top_emoticons = num_used[num_used["occurrance"] > num_used["occurrance"].mean()]
        elif type(self.min_use) == int:
            # grab everything greater than X
            top_emoticons = num_used[num_used["occurrance"] > self.min_use]
        else:
            # otherwise return all results
            top_emoticons = num_used
            
        if type(self.limit) == int:
            # grab only the top X most used
            top_emoticons = top_emoticons.head(limit)
        else:
            # return all results
            top_emoticons = top_emoticons
            
        top_emoticons["vid_id"] = self.vid_id
        # reorganize columns
        top_emoticons = top_emoticons[
            ["vid_id", "emoji_id", "occurrance", "emoji_name", "label"]
        ]

        return top_emoticons

    def emo_extractor(self, my_list):
        """
        Helper function to grab emoticon id
        """
        return [emo_dict["_id"] for emo_dict in my_list]

    def loc_extractor(self, my_list):
        """
        Helper function to grab index location of emoticon in the body
        """
        return [[emo_dict["begin"], emo_dict["end"]] for emo_dict in my_list]

    def emo_saver(self, my_list):
        """
        Helper function to extract all emo_ids from the list and append to self.all_emos
        """
        for emo in my_list:
            self.all_emos.append(emo)

    def emo_id_matcher(self, emo_data):
        """
        Matches the emoticon id to it's twitch-defined emoticon text
        """
        emo_dict = {}
        for i, row in emo_data.iterrows():
            for x in range(len(row["emo_loc"])):
                loc = row["emo_loc"][x]  # grab location
                begin = loc[0]
                end = loc[1] + 1
                emoji_name = row["body"][begin:end]  # extract emoji text
                emoji_id = row["emo_id_list"][x]

                if emoji_id not in emo_dict.keys():
                    emo_dict[emoji_id] = emoji_name
        return emo_dict

In [27]:
ee = emoticonExtractor(data)

0 days 10:57:21.037000


In [28]:
df = ee.run()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_emoticons["vid_id"] = self.vid_id


# Old emoticon getter

In [35]:
import pandas as pd
import json
from pillaralgos.helpers import data_handler as dh

data = json.load(open("data/sample_lg.json"))

vid_id = data[0]['content_id']
vid_id

big_df = dh.organize_twitch_chat(data)
print(big_df.iloc[-1,0] - big_df.iloc[0,0])

def emo_extractor(my_list):
    return [emo_dict['_id'] for emo_dict in my_list]


def loc_extractor(my_list):
    return [[emo_dict['begin'], emo_dict['end']] for emo_dict in my_list]

all_emos = []
def emo_saver(my_list):
    for emo in my_list:
        all_emos.append(emo)
    return 'saved'

emo_col = big_df['emoticons'].dropna().reset_index(drop=True)
emo_id_list = emo_col.apply(lambda my_list: emo_extractor(my_list))
emo_id_list.apply(lambda my_list: emo_saver(my_list))
all_emos = pd.Series(all_emos)

emo_loc = emo_col.apply(lambda my_list: loc_extractor(my_list))
emo_body = big_df[~big_df['emoticons'].isna()]['body']

emo_data = emo_loc.copy().reset_index()
emo_data['body'] = emo_body.reset_index(drop=True)
emo_data = emo_data.drop('index',axis=1)

emo_data['id'] = emo_id_list

emo_dict = {}

for i, row in emo_data.iterrows():
    for x in range(len(row['emoticons'])):
        loc = row['emoticons'][x] # grab location
        begin = loc[0]
        end = loc[1] + 1
        emoji_name = row['body'][begin:end] # extract emoji text
        emoji_id = row['id'][x]
        
        if emoji_id not in emo_dict.keys():
            emo_dict[emoji_id] = emoji_name

num_used = all_emos.value_counts()

num_used = num_used.reset_index()
num_used.columns = ['emoji_id', 'occurrance']

num_used['emoji_name'] = num_used['emoji_id'].map(emo_dict)

num_used['label'] = ''
top_emoticons = num_used[num_used['occurrance'] > num_used['occurrance'].mean()]
top_emoticons['vid_id'] = vid_id
top_emoticons = top_emoticons[['vid_id','emoji_id','occurrance','emoji_name','label']]

# top_emoticons.to_csv("data/top_emoticons.csv", index=False)

0 days 10:57:21.037000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_emoticons['vid_id'] = vid_id


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 54
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   vid_id      55 non-null     object
 1   emoji_id    55 non-null     object
 2   occurrance  55 non-null     int64 
 3   emoji_name  55 non-null     object
 4   label       55 non-null     object
dtypes: int64(1), object(4)
memory usage: 2.6+ KB


In [37]:
top_emoticons.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 54
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   vid_id      55 non-null     object
 1   emoji_id    55 non-null     object
 2   occurrance  55 non-null     int64 
 3   emoji_name  55 non-null     object
 4   label       55 non-null     object
dtypes: int64(1), object(4)
memory usage: 2.6+ KB


In [39]:
all(df == top_emoticons)

True

# Description

Notebook to explore data, recreate bugs, solve bugs

In [1]:
from pillaralgos import algo2
import pillaralgos_dev as p_dev
from pillaralgos_dev import dev_helpers as dev_help
import json

In [2]:
buggy_file = '981828174'
def bug_handler(buggy_file, algo):
    '''
    Simple function to run reported file and files in data folder through a function
    '''
    aws = p_dev.awsBucketAPI()
    aws.save_specific_file(buggy_file)
    data = json.load(open(f'data/{buggy_file}.json'))
    try:
        json_result = algo.run(data)
    except Exception as e:
        print("FAILED")
        print(e)

    import os
    files = os.listdir('data/')
    new_files = [f for f in files if '.json' in f and 'big_data' not in f]

    algoworked = []
    algofailed = []
    results = {}
    for file in new_files:
        try:
            data = json.load(open(f'data/{file}'))
            results[file] = algo.run(data)
            algoworked.append(file)
        except:
            algofailed.append(file)
    return algoworked, algofailed, results

# Unit Test creations

In [1]:
import json
import pandas as pd

In [3]:
data = json.load(open("pypi/prod/test/sample_data/sample_med.json"))

In [6]:
from pillaralgos.helpers import data_handler as dh

In [39]:
from pillaralgos import algo1, algo2, algo3_0, algo3_5

In [37]:
json_re1 = algo1.run(data,min_=0.5, limit=10, sort_by='rel',save_json=False)

In [40]:
json_re2 = algo2.run(data,min_=0.5, limit=10,save_json=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk["hour"] = i
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk["chunk"] = x


In [26]:
import pandas as pd
import json
from pillaralgos.helpers import data_handler as dh

data = json.load(open('data/sample_lg.json'))

class emoticonExtractor:
    def __init__(self, data, min_use='mean', limit=None):
        """
        Gets data ready for emo extraction. Initializes dicts, lists, etc.
        
        input
        -----
        data: list
            List of dictionaries, a json file opened with json.load(open(file))
        min_use: str, int, None
            'mean': Return only those emoticons who's count is > the mean occurrance
            int: Return only those emoticons who's count is > X. Use 0 to not filter.
        limit: int, None
            int: Return only the top X emoticons (using df.head(X))
            None: Return all emoticons
        """

        big_df = dh.organize_twitch_chat(data)
        print(big_df.iloc[-1, 0] - big_df.iloc[0, 0])
        self.vid_id = data[0]["content_id"]
        self.big_df = big_df
        self.all_emos = (
            []
        )  # all unique emo_ids, later used to pd.Series(self.all_emos).value_counts()
        self.limit = limit
        self.min_use = min_use
        
    def run(self):
        """
        Coordinates all functions to return a dataset of top emojis used
        """
        import pandas as pd

        body_has_emo = self.big_df[~self.big_df["emoticons"].isna()].reset_index(
            drop=True
        )

        body_has_emo["emo_id_list"] = body_has_emo["emoticons"].apply(
            lambda my_list: self.emo_extractor(my_list)
        )

        body_has_emo["emo_id_list"].apply(lambda my_list: self.emo_saver(my_list))
        body_has_emo["emo_loc"] = body_has_emo["emoticons"].apply(
            lambda my_list: self.loc_extractor(my_list)
        )

        emo_data = body_has_emo[["emo_id_list", "emo_loc", "body"]]
        self.emo_data = emo_data

        num_used = pd.Series(
            self.all_emos
        ).value_counts()  # count how many times each unique emo was used

        num_used = num_used.reset_index()  # turn series to df, rename cols
        num_used.columns = ["emoji_id", "occurrance"]

        emo_dict = self.emo_id_matcher(
            emo_data
        )  # create a dictionary of emo_id:emo_name
        num_used["emoji_name"] = num_used["emoji_id"].map(emo_dict)
        num_used["label"] = ""

        if type(self.min_use) == str:
            # grab everything greater than mean count
            top_emoticons = num_used[num_used["occurrance"] > num_used["occurrance"].mean()]
        elif type(self.min_use) == int:
            # grab everything greater than X
            top_emoticons = num_used[num_used["occurrance"] > self.min_use]
        else:
            # otherwise return all results
            top_emoticons = num_used
            
        if type(self.limit) == int:
            # grab only the top X most used
            top_emoticons = top_emoticons.head(limit)
        else:
            # return all results
            top_emoticons = top_emoticons
            
        top_emoticons["vid_id"] = self.vid_id
        # reorganize columns
        top_emoticons = top_emoticons[
            ["vid_id", "emoji_id", "occurrance", "emoji_name", "label"]
        ]

        return top_emoticons

    def emo_extractor(self, my_list):
        """
        Helper function to grab emoticon id
        """
        return [emo_dict["_id"] for emo_dict in my_list]

    def loc_extractor(self, my_list):
        """
        Helper function to grab index location of emoticon in the body
        """
        return [[emo_dict["begin"], emo_dict["end"]] for emo_dict in my_list]

    def emo_saver(self, my_list):
        """
        Helper function to extract all emo_ids from the list and append to self.all_emos
        """
        for emo in my_list:
            self.all_emos.append(emo)

    def emo_id_matcher(self, emo_data):
        """
        Matches the emoticon id to it's twitch-defined emoticon text
        """
        emo_dict = {}
        for i, row in emo_data.iterrows():
            for x in range(len(row["emo_loc"])):
                loc = row["emo_loc"][x]  # grab location
                begin = loc[0]
                end = loc[1] + 1
                emoji_name = row["body"][begin:end]  # extract emoji text
                emoji_id = row["emo_id_list"][x]

                if emoji_id not in emo_dict.keys():
                    emo_dict[emoji_id] = emoji_name
        return emo_dict

In [45]:
json_re3 = algo3_5.run(data,min_=0.5, limit=10, goal='num_emo',save_json=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk["hour"] = i
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk["chunk"] = x


In [46]:
json_re3

[{'startTime': 0.0, 'endTime': 119.562},
 {'startTime': 765.512, 'endTime': 882.645},
 {'startTime': 891.677, 'endTime': 1008.99},
 {'startTime': 259.889, 'endTime': 375.413},
 {'startTime': 127.098, 'endTime': 234.249},
 {'startTime': 383.298, 'endTime': 495.599},
 {'startTime': 630.41, 'endTime': 741.233},
 {'startTime': 1018.379, 'endTime': 1131.467},
 {'startTime': 507.003, 'endTime': 626.884},
 {'startTime': 1138.629, 'endTime': 1256.19}]