In [1]:
import os
import re
import warnings
# NOTE(review): this silences every warning for the whole session (library
# deprecations, pandas SettingWithCopyWarning, ...), so real problems in later
# cells will not surface — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')


import emoji
import numpy as np
import pandas as pd
# Display settings: no column-width limit (tweets are shown in full), and at
# most 50 rows / 50 columns per rendered frame.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)

from pandarallel import pandarallel
# Start pandarallel with one worker per CPU reported by os.cpu_count(); the
# .parallel_apply calls in later cells rely on this initialisation.
pandarallel.initialize(nb_workers=os.cpu_count())

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# Load the raw tweets from a path relative to the notebook's working directory.
df = pd.read_csv("../datasets/raw_data.csv")

## Preliminary Exploration

In [3]:
# (rows, columns) of the frame as loaded.
df.shape

(316571, 1)

In [4]:
# Column names, non-null counts and dtypes of the frame as loaded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316571 entries, 0 to 316570
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   tweet   316571 non-null  object
dtypes: object(1)
memory usage: 2.4+ MB


In [5]:
# Remove tweets whose text is repeated, keeping one row per distinct text.
# Rebinding `df` (instead of inplace=True) avoids mutating the frame object
# behind the reader's back and keeps the statement chainable.
df = df.drop_duplicates(subset=['tweet'])
df.shape

(299432, 1)

## Data Preparation

#### Extract emojis from each tweet

In [6]:
# Compiled pattern that matches one emoji per match; the extraction cell below
# uses it for both findall() and sub().
if hasattr(emoji, 'get_emoji_regexp'):
    # emoji < 2.0 ships a ready-made pattern.
    RE_EMOJI = emoji.get_emoji_regexp()
else:
    # emoji >= 2.0 removed get_emoji_regexp(); rebuild a pattern from the
    # public EMOJI_DATA table. Longest sequences are listed first so that a
    # multi-codepoint emoji (flags, ZWJ sequences) is matched as a whole
    # rather than as its first component.
    RE_EMOJI = re.compile(
        '|'.join(
            re.escape(symbol)
            for symbol in sorted(emoji.EMOJI_DATA, key=len, reverse=True)
        )
    )

In [7]:
def extract_and_remove_emoji(row: pd.Series) -> pd.Series:
    """Split one tweet row into its plain text and the emojis it contained.

    Args:
        row: A row holding the tweet text under the 'tweet' label.

    Returns:
        The same ``row`` object, modified in place: 'emojis' is the set of
        distinct RE_EMOJI matches found in the text, and 'tweet' is the text
        with every match replaced by a single space.
    """
    text = row['tweet']

    # Collect matches from the untouched text first, then blank them out.
    row['emojis'] = set(RE_EMOJI.findall(text))
    row['tweet'] = RE_EMOJI.sub(' ', text)

    return row

In [8]:
# Create the 'emojis' column up front so every row already carries that label
# when it is handed to extract_and_remove_emoji.
df['emojis'] = np.nan
# Row-wise (axis=1) parallel pass: each row comes back with 'tweet' stripped of
# emojis and 'emojis' holding the set of distinct emojis that were found.
%time df = df.parallel_apply(extract_and_remove_emoji, axis=1)

CPU times: user 1.42 s, sys: 947 ms, total: 2.37 s
Wall time: 3min 51s


#### Keep only tweets that contain exactly one distinct emoji (it may occur more than once)

In [9]:
# Keep rows whose tweet contained exactly one distinct emoji. The explicit
# .copy() makes `filtered` an independent frame, so adding a column below is a
# well-defined write rather than a chained assignment on a slice of `df`.
has_single_emoji = df['emojis'].parallel_apply(lambda emojis: len(emojis) == 1)
filtered = df[has_single_emoji].copy()

# Read the only element without mutating the set (set.pop() would empty it).
filtered['emoji'] = filtered['emojis'].parallel_apply(lambda emojis: next(iter(emojis)))

# The set column is no longer needed once the single label is extracted.
filtered = filtered.drop(columns=['emojis'])

#### Show the 20 least frequently used emojis

In [10]:
# Per-emoji tweet counts sorted from rarest to most common; show the first 20.
filtered['emoji'].value_counts(ascending=True).head(20)

🏈       13
🎾       18
🍴       50
🦠       84
🏀      101
⚽      139
🤑      152
🚬      203
🍺      245
💩      301
🤓      523
⭐      524
📸      535
🍻      584
🇹🇭     594
💯      632
👻      655
🥵      932
🤢     1073
💪     1614
Name: emoji, dtype: int64

#### Group 🍺 and 🍻 into a single label (they can be considered the same label)

In [11]:
# Relabel every single-mug row with the clinking-mugs emoji so both beer
# emojis share one label.
is_single_mug = filtered['emoji'].eq('🍺')
filtered.loc[is_single_mug, 'emoji'] = '🍻'

In [12]:
# Sanity check: (rows, columns) of the tweets now carrying the merged beer label.
filtered.loc[filtered['emoji'].eq('🍻')].shape

(829, 2)

#### Filter out any emoji that occurs fewer than 500 times

In [13]:
# Minimum number of tweets an emoji label needs in order to be kept.
MIN_TWEETS_PER_EMOJI = 500

# Drop every row whose emoji label has fewer than MIN_TWEETS_PER_EMOJI tweets;
# the surviving rows keep their original order.
tweets = filtered.groupby('emoji').filter(
    lambda group: len(group) >= MIN_TWEETS_PER_EMOJI
)

#### Save the prepared dataframe to CSV

In [16]:
# Persist the prepared frame; index=False leaves the row index out of the file.
tweets.to_csv("../datasets/prepared_data.csv", index=False)