In [3]:
import re
import pandas as pd
import os
import matplotlib.pyplot as plt
from collections import Counter

# Define the emoji pattern: one or more consecutive characters taken from the
# Unicode ranges listed below.
#
# The original last range ran from U+24C2 all the way to U+1F251. That single
# span also contains CJK ideographs, kana, Hangul and many other ordinary
# scripts, so plain text in those languages was matched as "emoji". It is
# split here into the symbol ranges at either end of that span.
emoji_pattern = re.compile(
    "[\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
    "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
    "\U0001F1E0-\U0001F1FF"  # Flags (regional indicator symbols)
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2"             # Circled Latin capital letter M
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U0001F000-\U0001F251"  # Mahjong tiles through Enclosed Ideographic Supplement
    "]+", flags=re.UNICODE
)

# Folder containing CSV files
folder_path = './comments'

# Combine all CSV files into a single DataFrame.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the row order (and index) of the combined frame reproducible across runs.
all_data = []
for file in sorted(os.listdir(folder_path)):
    if file.endswith('.csv'):
        # on_bad_lines='skip' drops malformed rows instead of raising
        video_data = pd.read_csv(os.path.join(folder_path, file), on_bad_lines='skip', encoding='utf-8', engine='python')
        video_data['video'] = file  # Add a column for video identifier
        all_data.append(video_data)

# pd.concat() fails with an opaque "No objects to concatenate" on an empty
# list; raise the same exception type with a message that names the folder.
if not all_data:
    raise ValueError(f"No CSV files found in {folder_path!r}")
df = pd.concat(all_data, ignore_index=True)

df


Unnamed: 0,user_id,username,comment,likes,replies,date,video
0,UCSnDqnHH_BF5LR7BgaHLGLw,@monicaruiz5531,Hace unos dias capte de que trata la canción y...,0,0.0,2024-10-20T17:02:26Z,"El Chombo, Dancing Green Alien - Dame Tu Cosit..."
1,UCbPEjYkUYFTRkcpdQHWGDlA,@GRIBSEXXX69,October 2024?,0,0.0,2024-10-20T16:56:36Z,"El Chombo, Dancing Green Alien - Dame Tu Cosit..."
2,UCri9d_8pPQ8RwoSF8YFKRgw,@sandrieleconceicaodasilvas459,💚♦️🤑💋👏♥️👏,0,0.0,2024-10-20T16:44:34Z,"El Chombo, Dancing Green Alien - Dame Tu Cosit..."
3,UCpIjd0QKKmaRnDqbEBHVAQw,@SooHyun-w4o,Anyone in October 2024??,0,0.0,2024-10-20T16:39:12Z,"El Chombo, Dancing Green Alien - Dame Tu Cosit..."
4,UCOqFrJr43TLSiL5i6qUL9Gw,@ninaadam8844,Me in 2018 🕺💃\nMe in 2024 seeing the lyrics 🫢💀,0,0.0,2024-10-20T16:24:58Z,"El Chombo, Dancing Green Alien - Dame Tu Cosit..."
...,...,...,...,...,...,...,...
4993491,UC8B55a_AC5b7HpjgOpO6APA,@VarunSingh-u5g,Jai hanuman ji ki jai ho,4,0.0,2024-09-16T01:53:38Z,श्री हनुमान चालीसा 🌺🙏 Shree Hanuman Chalisa Or...
4993492,UCVlDJ2K0laNkITQ8uKkvqUQ,@JoruBarvad,❤❤❤❤❤❤❤❤❤❤😊❤😊❤😊😊😊❤😊❤😊❤😊😊❤❤,2,0.0,2024-09-16T01:53:27Z,श्री हनुमान चालीसा 🌺🙏 Shree Hanuman Chalisa Or...
4993493,UCNKEyFUHh8e9MKtCox_Fobg,@ArvindSinghOfficial-rj2bj,Hanuman chalisa hamesha suna jayega or sunaeng...,1,0.0,2024-09-16T01:53:23Z,श्री हनुमान चालीसा 🌺🙏 Shree Hanuman Chalisa Or...
4993494,UCG_HLztWfugeyc1Gs6TGA3A,@AjaySingh-x9l,Jai shree ram 🙏❤️🥰🤗,3,0.0,2024-09-16T01:52:56Z,श्री हनुमान चालीसा 🌺🙏 Shree Hanuman Chalisa Or...


In [22]:
# Rebinds emoji_pattern to a single-character matcher: each alternative matches
# exactly one code point, so findall() returns one list item per code point.
# The strings are raw, so the \U / \u escapes are interpreted by the re module.
emoji_pattern = re.compile(
    r'[\U0001F1E6-\U0001F1FF]|'  # Flags (regional indicator symbols)
    r'[\U0001F300-\U0001F5FF]|'  # Symbols & Pictographs
    r'[\U0001F600-\U0001F64F]|'  # Emoticons
    r'[\U0001F680-\U0001F6FF]|'  # Transport & Map Symbols
    r'[\U0001F700-\U0001F77F]|'  # Alchemical Symbols
    r'[\U0001F780-\U0001F7FF]|'  # Geometric Shapes Extended
    r'[\U0001F800-\U0001F8FF]|'  # Supplemental Arrows-C
    r'[\U0001F900-\U0001F9FF]|'  # Supplemental Symbols and Pictographs
    r'[\U0001FA70-\U0001FAFF]|'  # Symbols and Pictographs Extended-A
    # Miscellaneous Symbols (hearts, card suits, sun, ...) were missing, so for
    # a sequence like U+2665 U+FE0F only the variation selector was captured.
    r'[\U00002600-\U000026FF]|'  # Miscellaneous Symbols
    r'[\U00002700-\U000027BF]|'  # Dingbats
    r'[\U0001F000-\U0001FFFF]|'  # Catch-all for the rest of U+1F000-U+1FFFF
    # The two code points below are invisible modifiers, not emojis; they are
    # matched so that emoji sequences stay intact when the matches are joined.
    r'\uFE0F|'                   # Variation Selector
    r'\u200D'                    # Zero Width Joiner
, flags=re.UNICODE)

In [28]:
# plotly.express is imported here and reused (as `px`) by the later plotting cells
import plotly.express as px

# Extract emojis using emoji_pattern: every match is one code point, and the
# matches of a comment are concatenated into a single string.
df['comment'] = df['comment'].fillna('').astype(str)
df['emojis'] = df['comment'].apply(lambda x: ''.join(emoji_pattern.findall(x)))

# emoji_pattern also matches the variation selector (U+FE0F) and the zero width
# joiner (U+200D). They stay in df['emojis'], but they are invisible modifiers
# rather than emojis, so they must not be counted or ranked.
invisible_chars = {'\uFE0F', '\u200D'}
df['emoji_counts'] = df['emojis'].apply(
    lambda s: sum(ch not in invisible_chars for ch in s)
)

# Most Used Emoji
# Counting is per code point, so multi-code-point emojis (flags, joined
# sequences) are counted by their components.
emoji_counts = Counter(''.join(df['emojis']))
# Drop the modifiers before ranking; filtering after most_common() could leave
# one of them in the top 10 or return fewer than 10 rows.
for ch in invisible_chars:
    emoji_counts.pop(ch, None)

# Take the top 10 valid emojis
most_used_emojis = pd.DataFrame(emoji_counts.most_common(10), columns=['Emoji', 'Count'])

# Plot using Plotly
fig = px.bar(
    most_used_emojis,
    x='Emoji',
    y='Count',
    title='Top 10 Most Used Emojis',
    labels={'Emoji': 'Emoji', 'Count': 'Count'},
    text='Count',
    template='plotly_white'
)
fig.update_layout(font_family="Arial Unicode MS")  # Set a font supporting emojis
fig.show()

In [30]:
# Non-numeric or missing like counts are treated as 0
df['likes'] = pd.to_numeric(df['likes'], errors='coerce').fillna(0).astype(int)

# The variation selector (U+FE0F) and zero width joiner (U+200D) can be present
# in df['emojis'] but are invisible modifiers, not emojis; skip both so neither
# can show up in the ranking.
invisible_chars = {'\uFE0F', '\u200D'}

# Create a list of tuples (emoji, likes): one record per emoji occurrence.
# NOTE: an emoji repeated n times in one comment contributes that comment's
# likes n times to its total.
emoji_likes = [
    (emoji, likes)
    for emojis, likes in zip(df['emojis'], df['likes'])
    for emoji in emojis
    if emoji not in invisible_chars
]

# Convert to DataFrame for aggregation
emoji_likes_df = pd.DataFrame(emoji_likes, columns=['Emoji', 'Likes'])

# Sum likes for each emoji, highest total first
most_liked_emojis = emoji_likes_df.groupby('Emoji', as_index=False).sum().sort_values(by='Likes', ascending=False)

# Take the top 10 most liked emojis
most_liked_emojis = most_liked_emojis.head(10)

# Plot using Plotly
fig = px.bar(
    most_liked_emojis,
    x='Emoji',
    y='Likes',
    title='Top 10 Most Liked Emojis',
    labels={'Emoji': 'Emoji', 'Likes': 'Total Likes'},
    text='Likes',
    template='plotly_white',
    color_discrete_sequence=['skyblue']
)
fig.update_layout(font_family="Arial Unicode MS")  # Set a font supporting emojis
fig.show()

In [36]:
# Non-numeric or missing reply counts are treated as 0
df['replies'] = pd.to_numeric(df['replies'], errors='coerce').fillna(0).astype(int)

# The variation selector (U+FE0F) and zero width joiner (U+200D) can be present
# in df['emojis'] but are invisible modifiers, not emojis; skip both so neither
# can show up in the ranking.
invisible_chars = {'\uFE0F', '\u200D'}

# Create a list of tuples (emoji, replies): one record per emoji occurrence.
# NOTE: an emoji repeated n times in one comment contributes that comment's
# replies n times to its total.
emoji_replies = [
    (emoji, replies)
    for emojis, replies in zip(df['emojis'], df['replies'])
    for emoji in emojis
    if emoji not in invisible_chars
]

# Convert to DataFrame for aggregation
emoji_replies_df = pd.DataFrame(emoji_replies, columns=['Emoji', 'Replies'])

# Sum replies for each emoji, highest total first
most_replies_emojis = emoji_replies_df.groupby('Emoji', as_index=False).sum().sort_values(by='Replies', ascending=False)

# Take the top 10 emojis by total replies
most_replies_emojis = most_replies_emojis.head(10)

# Plot using Plotly
fig = px.bar(
    most_replies_emojis,
    x='Emoji',
    y='Replies',
    title='Top 10 Most Replied Emojis',
    labels={'Emoji': 'Emoji', 'Replies': 'Total Replies'},
    text='Replies',
    template='plotly_white',
    color_discrete_sequence=['yellow']
)
fig.update_layout(font_family="Arial Unicode MS")  # Set a font supporting emojis
fig.show()