**DATA PREPROCESSING**

Import libraries

In [1]:
import pandas as pd
import re
import emoji
from emoji import EMOJI_DATA 
import numpy as np
import langdetect
from collections import Counter

Import csv

In [2]:
# Define function to read csv file into dataset
def read_csv_file(file): 
    dataset = pd.read_csv(file,header=0)
    return dataset
    
df = read_csv_file(r"C:\Users\nayma\AppData\Local\Programs\Python\Python312\Scripts\thesis\total.csv")

# Check the dataset
print(len(df.index), "records in the dataset")
df.head()

61065 records in the dataset


Unnamed: 0,post_id,user_id,publish_date,comment_id,comment_text
0,A1,256990849,"2/6/2024, 3:58:30 PM",10001,🍸firsts
1,A1,256990849,"2/6/2024, 3:58:38 PM",10002,lbcb❤️‍🩹
2,A1,261362701,"2/6/2024, 4:07:48 PM",10003,Cb ❤️
3,A1,48983127189,"2/6/2024, 6:07:41 PM",10004,Nice
4,A1,260048784,"2/7/2024, 3:28:33 AM",10005,LBLB


Work only with relevant columns and make comment text column a string

In [3]:
df = df[['post_id','comment_id','comment_text']]
df['comment_text'] = df['comment_text'].astype(str)

**Data cleaning**

Keep only words uppercased or lowered, i.e. lower capitalized

In [4]:
# Define function lower some words
def lower_words(column):
    lowered = []
    pattern = r'\b(?![a-z]+\b|[A-Z]+\b)[a-zA-Z]+'
    for comment in column:
        for word in re.findall(pattern, comment):
            comment = comment.replace(word, word.lower())
        lowered.append(comment)
    return lowered
    
df['comment_text'] = lower_words(df['comment_text'])

Add spaces between emojis

In [5]:
# Search your emoji
def is_emoji(s):
    return s in EMOJI_DATA

# Add space around emoji
def add_space(column):
    column_spaced = []
    for comment in column:
        comment = ''.join(' ' + char + ' ' if is_emoji(char) else char for char in comment).strip()
        column_spaced.append(comment)
    return column_spaced

df['comment_text'] = add_space(df['comment_text'])

Remove mentions, URLs, hash symbol

In [6]:
# Define function @
def remove_mention(column):
    without_add = []
    for comment in column:
        comment = re.sub(r"(?:\@|@)\S+", "",comment) 
        without_add.append(comment)
    return without_add


# Define function URL
def remove_url(column):
    without_url = []
    for comment in column:
        comment = re.sub(r"(?:\@|https?\://)\S+","",comment)  
        without_url.append(comment)
    return without_url


# Define function #
def remove_hashtag(column):
    without_hash = []
    for comment in column:
        comment = re.sub(r"#", "",comment) 
        without_hash.append(comment)
    return without_hash


# Mentions: removing @mention
df['comment_text'] = remove_mention(df['comment_text'])

# URLs: removing http://urlwhatever.com
df['comment_text'] = remove_url(df['comment_text'])

# Hashtag symbol: #
df['comment_text'] = remove_hashtag(df['comment_text'])

Remove leading and trailing blank spaces in string and removing extra spaces

In [7]:
# Define function remove blank spaces
def blank_spaces(column):
    without_space = []
    for comment in column:
        comment = comment.replace(r' {2,}',' ')
        comment = comment.strip()
        without_space.append(comment)
    return without_space
    
df['comment_text'] = blank_spaces(df['comment_text'])

Missing comments

In [8]:
list_drop = ['.','nan','NAN','']

for i in list_drop:
    df = df.drop(df[df['comment_text'] == i].index)

Most repeated comments

In [9]:
df.comment_text.value_counts().head(50)

comment_text
❤ ️                            1447
😍                              1067
😍  😍  😍                        1009
❤ ️ ❤ ️ ❤ ️                     810
🔥  🔥  🔥                         777
🔥                               775
😍  😍                            738
cb                              626
❤ ️ ❤ ️                         533
😍  😍  😍  😍                      487
🔥  🔥                            478
🔥  🔥  🔥  🔥                      404
lb                              397
❤ ️ ❤ ️ ❤ ️ ❤ ️                 321
😍  😍  😍  😍  😍                   237
🔥  🔥  🔥  🔥  🔥                   211
❤ ️ ❤ ️ ❤ ️ ❤ ️ ❤ ️             199
first                           182
beautiful                       168
hi                              168
😂                               145
👏                               143
❤ ️ 🔥                           137
🙌                               124
😍  😍  😍  😍  😍  😍                121
❤ ️ ❤ ️ ❤ ️ ❤ ️ ❤ ️ ❤ ️         120
🔥  🔥  🔥  🔥  🔥  🔥                109
🔥  ❤ ️         

Handle slang

In [10]:
# Define dictionary with slang terminology
dict_slang = {'cb': 'comment back', 'lb': 'like back', 'fb': 'follow back', 'first': 'first comment', 
              'first only': 'like my first picture', 'row': 'like back', 'rows': 'like back', 'instant': 'like back', 
              'row for row': 'like back','lblb':'like back', 'lbbb':'like back'}

dict_slang_up = {key.upper(): value.upper() for key,value in dict_slang.items()}

# Define function to merge dictionaries
def merge(x,*y):
    r = x.copy()
    for z in y:
        r.update(z)
    return r
    
dict_slang = merge(dict_slang, dict_slang_up)

for key, value in dict_slang.items():
    df.loc[df['comment_text'] == key, 'comment_text'] = value

Drop duplicates

In [11]:
df = df.drop_duplicates(subset=['comment_text'])

print(len(df.index), "records after removing duplicates")

37339 records after removing duplicates


**Selection data in English and conveying sentiment**

Detect language

In [12]:
# Define function detect language
def detect_language_with_langdetect(column):
    lang_list = []
    for comment in column:
        try:
            lang = langdetect.detect(comment) 
        except:
            lang = "most_emoji"
        lang_list. append(lang)
    return lang_list


df['lang'] = detect_language_with_langdetect(df['comment_text'])

Separate in English, mostly emoji, and the rest of languages

In [13]:
df_en = df[df['lang'] == 'en']
df_mo = df[df['lang'] == 'most_emoji']
df_else = df[(df['lang'] != 'en') & (df['lang'] != 'most_emoji')]
print("English: ", len(df_en.index), ", mostly emoji or no language: ", len(df_mo.index), ", labelled in other languages: ", len(df_else.index))

English:  15985 , mostly emoji or no language:  4416 , labelled in other languages:  16938


**Refining language classification in df_else**

Removing non latin alphabets

In [14]:
#List of languages with non latin alphabets
non_latin_script = ['ar', 'bg', 'bn', 'el', 'fa', 'gu', 'he', 'hi', 'ja', 'ko', 'mk', 'mr', 'ne', 'ru', 'te', 'th', 'uk', 'ur', 'zh-cn', 'zh-tw']

df_else = df_else[~df_else['lang'].isin(non_latin_script)]

In [15]:
# 2nd round of detecting language of comments
df_else['lang2.0'] = detect_language_with_langdetect(df_else['comment_text'])

Appoint new language

In [16]:
df_else.loc[df_else['lang2.0'] == 'en', 'lang'] = 'en'
df_else = df_else.loc[:, df_else.columns!='lang2.0']

Count most frequent words in strings to identify English comments

In [17]:
df_else_comm = df_else['comment_text'].str.lower()
Counter(" ".join(df_else_comm).split()).most_common(100)

[('️', 4792),
 ('❤', 4644),
 ('🔥', 4570),
 ('😍', 4063),
 ('you', 1036),
 ('😂', 779),
 ('👏', 682),
 ('love', 659),
 ('so', 600),
 ('🙌', 585),
 ('i', 558),
 ('a', 540),
 ('beautiful', 523),
 ('que', 494),
 ('✨', 390),
 ('de', 380),
 ('my', 367),
 ('la', 355),
 ('🤍', 335),
 ('look', 329),
 ('me', 321),
 ('gorgeous', 319),
 ('🖤', 297),
 ('kim', 278),
 ('💋', 275),
 ('like', 252),
 ('te', 252),
 ('😘', 251),
 ('queen', 236),
 ('🥰', 233),
 ('are', 219),
 ('u', 214),
 ('stunning', 214),
 ('😮', 205),
 ('mi', 199),
 ('🌹', 199),
 ('y', 188),
 ('no', 187),
 ('girl', 182),
 ('♥', 178),
 ('kendall', 178),
 ('rows', 175),
 ('wow', 171),
 ('kylie', 170),
 ('🏻', 170),
 ('it', 166),
 ('en', 162),
 ('good', 161),
 ('your', 161),
 ('nice', 158),
 ('💕', 156),
 ('linda', 153),
 ('️\u200d', 148),
 ('hermosa', 145),
 ('😢', 144),
 ('row', 144),
 ('\u200d', 137),
 ('el', 133),
 ('looking', 133),
 ('pretty', 131),
 ('💖', 131),
 ('😭', 131),
 ('se', 130),
 ('🫶', 130),
 ('amazing', 125),
 ('tu', 123),
 ('es', 123),


Labeling comments as English if contain most common words

In [18]:
# Define list of most common words that are in English
common_en = ["you", "love", "beautiful", "look", "just", "gorgeous", "like", "stunning", "nice", "queen", "girl", "you",
            "good", "looking", "looks", "amazing", "pretty", "you're", "hot", "cute", "omg", "sexy", "very"]

common_en_up = [x.upper() for x in common_en]

common_en = common_en + common_en_up

df_else['lang2.0'] = ""

for x in common_en:
    df_else.loc[df_else['comment_text'].str.contains(x),'lang2.0'] = 'en'

counts = df_else['lang2.0'].value_counts()
print(counts)

lang2.0
      11352
en     4531
Name: count, dtype: int64


Check if labeling is correct

In [19]:
df_else[df_else['lang2.0'] == 'en'].head(50)

Unnamed: 0,post_id,comment_id,comment_text,lang,lang2.0
3,A1,10004,nice,pl,en
4,A1,10005,LIKE BACK,vi,en
19,A1,10020,i love you 😍 😍,cs,en
73,A1,10074,fashion queen ❤ ️ ❤ ️ ❤ ️,fr,en
95,A1,10096,this era 🔥 natural look I’m just obsessed 😍,et,en
97,A1,10098,nice pick!,pl,en
103,A1,10104,like back,hr,en
116,A1,10117,beautiful.,ro,en
133,A1,10134,soo good,so,en
134,A1,10135,you look beautiful,fr,en


Appoint English comments

In [20]:
df_else.loc[df_else['lang2.0'] == 'en', 'lang'] = 'en'
df_else = df_else.loc[:, df_else.columns!='lang2.0']

df_else_en = df_else[df_else['lang'] == 'en']
print(len(df_else_en.index))

4727


**Removing comments without any text or emoji from df_mo**

In [21]:
# Marks True if comment does not contain emoji
drop = []
x = True
i = 0
column = df_mo.comment_text
for comment in column:
    i = i + 1
    if emoji.emoji_count(comment) > 0:
        x = False
    else:
        x = True
    drop.append(x)  

df_mo['drop_condition'] = drop

# Check that comments dropped do not convey sentiment
df_mo[df_mo['drop_condition'] == True].tail(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mo['drop_condition'] = drop


Unnamed: 0,post_id,comment_id,comment_text,lang,drop_condition
47192,D5,57193,........,most_emoji,True
47630,D5,57631,",",most_emoji,True
47636,D5,57637,",.",most_emoji,True
47638,D5,57639,".>,",most_emoji,True
47639,D5,57640,",,,,<%>,",most_emoji,True
47645,D5,57646,".,",most_emoji,True
48417,D5,58418,ሀይ,most_emoji,True
48655,D5,58656,06?,most_emoji,True
48888,D5,58889,7250431439,most_emoji,True
49838,D5,59839,7,most_emoji,True


In [22]:
# Updating df_on

df_mo = df_mo.drop(df_mo[df_mo['drop_condition'] == True].index)
df_mo = df_mo.loc[:, df_mo.columns!='drop_condition']

print(len(df_mo.index), "are comments mostly composed by emojis.")

4306 are comments mostly composed by emojis.


**Final datasets**

In [23]:
# Merge subset in english with df_en
df_en = pd.concat([df_en, df_else_en])
df_en = df_en.sort_values(by=['comment_id'])

# Merge df_en in english with emoji 
df = pd.concat([df_en, df_mo])
df = df.sort_values(by=['comment_id'])

# Remove subset in english from df_else
df_else = df_else[df_else['lang'] != 'en']

In [24]:
print(len(df.index), "are clean comments in English or convey sentiment.")

25018 are clean comments in English or convey sentiment.


Save dataset to work on further notebooks

In [25]:
df.to_csv(r"C:\Users\nayma\AppData\Local\Programs\Python\Python312\Scripts\thesis\df.csv", index=False)  