In [17]:
# analyse the most frequent emojis in the training dataset

import os
import re
import string
from collections import defaultdict

# List of file names
files = ["../twitter-datasets/train_neg_full.txt", "../twitter-datasets/train_pos_full.txt"]  # Update with your filenames

# Occurrence count of every candidate token, keyed by the token exactly as it appears in the files
token_counts = defaultdict(int)

# Characters a candidate token may START with (only the first character of a token is tested)
special_chars = ":;<-\\^-=0*+"

# Tunable bounds for what counts as an emoticon candidate / a frequent token
MIN_TOKEN_LEN = 2
MAX_TOKEN_LEN = 5
MIN_COUNT = 100  # a token is reported only when it occurs MORE than this many times

# Iterate over the files
for filename in files:

    # Explicit encoding so decoding does not depend on the platform's default locale
    with open(filename, "r", encoding="utf-8") as file:
        # Stream line by line: whitespace splitting yields the same tokens as splitting
        # the whole file at once, without holding the full corpus in memory
        for line in file:
            for token in line.split():
                # Keep short tokens (2 to 5 characters) whose first character is a special character
                if MIN_TOKEN_LEN <= len(token) <= MAX_TOKEN_LEN and token[0] in special_chars:
                    token_counts[token] += 1

# Print the number of distinct candidate tokens
print(f'total token amount: {len(token_counts)}')

# Print the most frequent tokens (descending by count) and collect them, upper-cased, into a list
emoji_list = []
for token, count in sorted(token_counts.items(), key=lambda x: x[1], reverse=True):
    if count > MIN_COUNT:
        print(f"{token.upper()}  ->  {count}")
        emoji_list.append(token.upper())
print(emoji_list)


total token amount: 3077
<URL>  ->  526859
<3  ->  57215
:D  ->  24443
:P  ->  13333
:/  ->  9655
:')  ->  3982
=)  ->  2595
;D  ->  1925
<---  ->  1768
--->  ->  1674
:|  ->  1504
;P  ->  1089
-->  ->  1022
->  ->  992
::  ->  739
<--  ->  666
:-D  ->  490
01  ->  449
00  ->  447
:\  ->  410
:-P  ->  406
02  ->  391
:]  ->  386
=D  ->  375
:'D  ->  358
:-/  ->  350
0.5  ->  338
<-  ->  336
000  ->  297
:@  ->  259
07  ->  238
09  ->  231
05  ->  225
03  ->  222
=(  ->  220
04  ->  220
08  ->  218
06  ->  206
;/  ->  176
0.0  ->  173
=P  ->  158
0BK  ->  156
=/  ->  153
001  ->  152
=]  ->  134
0MAH  ->  114
:}  ->  111
['<URL>', '<3', ':D', ':P', ':/', ":')", '=)', ';D', '<---', '--->', ':|', ';P', '-->', '->', '::', '<--', ':-D', '01', '00', ':\\', ':-P', '02', ':]', '=D', ":'D", ':-/', '0.5', '<-', '000', ':@', '07', '09', '05', '03', '=(', '04', '08', '06', ';/', '0.0', '=P', '0BK', '=/', '001', '=]', '0MAH', ':}']


In [19]:
# manually remove the tokens in emoji_list that are not emoticons, e.g. "<URL>", "00", "01"
# map the emojis to very simple English adjectives
# create a key: value dictionary

# (emoticon, adjective) pairs; the order here is the insertion order of emoji_dict
_EMOTICON_ADJECTIVE_PAIRS = (
    ('<3', 'Lovely'),
    (':D', 'Happy'),
    (':P', 'Playful'),
    (':/', 'Unsure'),
    (":')", 'Heartwarming'),
    ('=)', 'Content'),
    (';D', 'Cheeky'),
    (':|', 'Neutral'),
    (';P', 'Teasing'),
    (':-D', 'Excited'),
    (':\\', 'Annoyed'),
    (':-P', 'Joking'),
    (':]', 'Happy'),
    ('=D', 'Excited'),
    (":'D", 'Delighted'),
    (':-/', 'Puzzled'),
    ('=(', 'Sad'),
    (';/', 'Disappointed'),
    ('0.0', 'Surprised'),
    ('=P', 'Amused'),
    ('=/', 'Uneasy'),
    ('=]', 'Optimistic'),
    (':}', 'Smug'),
)

# Lookup table: emoticon -> simple English adjective
emoji_dict = dict(_EMOTICON_ADJECTIVE_PAIRS)

# method for replacing the emojis with adjectives
def replace_emoji(dictionary, text):
    """Replace emoticon tokens in ``text`` with their lower-cased adjective.

    Only whole whitespace-delimited tokens are replaced. An emoticon that is
    merely a substring of a longer token (``0.0`` inside ``10.05``, ``:/``
    inside ``http://``) is left alone, so ordinary text is not corrupted.
    Matching ignores case, so ``:D`` and ``:d`` are both recognised.

    Args:
        dictionary: Mapping from emoticon string to replacement string.
        text: The text to process. All whitespace is preserved as is.

    Returns:
        A new string with every matching token replaced by the lower-cased
        replacement; all other characters are unchanged.
    """
    # Case-insensitive lookup table. setdefault keeps the first entry when two
    # keys differ only by case.
    lookup = {}
    for key, value in dictionary.items():
        lookup.setdefault(key.lower(), value.lower())

    def _swap(match):
        token = match.group(0)
        # Unknown tokens are returned untouched
        return lookup.get(token.lower(), token)

    # \S+ visits each maximal run of non-whitespace, i.e. each token
    return re.sub(r"\S+", _swap, text)

def get_file_without_emoji(filename):
    """Write a copy of ``filename`` with emoticons replaced by adjectives.

    Each line of the input is passed through ``replace_emoji`` using the
    module-level ``emoji_dict`` and written to a sibling file whose name has
    ``_without_emoji`` inserted before the final extension, e.g.
    ``train.txt`` -> ``train_without_emoji.txt``.

    Args:
        filename: Path of the text file to process.

    Returns:
        None. The output path is reported on stdout.
    """
    # splitext only touches the final extension, so a ".txt" elsewhere in the
    # path is not rewritten, and the output path always differs from the input
    # (the source file can never be overwritten).
    root, extension = os.path.splitext(filename)
    output_filename = f"{root}_without_emoji{extension}"

    # Explicit encoding keeps reading/writing independent of the platform default;
    # lines are streamed so the whole file is never held in memory.
    with open(filename, 'r', encoding='utf-8') as input_file:
        with open(output_filename, 'w', encoding='utf-8') as output_file:
            for line in input_file:
                output_file.write(replace_emoji(emoji_dict, line))

    print(f"New file without emoji saved as {output_filename}")

# Input files to convert; adjust the paths to your local dataset location
files = [
    "../twitter-datasets/train_neg_full.txt",
    "../twitter-datasets/train_pos_full.txt",
]

# Convert each input file in turn
for filename in files:
    get_file_without_emoji(filename)


New file without emoji saved as ../twitter-datasets/train_neg_full_without_emoji.txt
New file without emoji saved as ../twitter-datasets/train_pos_full_without_emoji.txt
