In [46]:
import pandas as pd
import demoji
import re

In [None]:
df_comments = pd.read_pickle("../data/comments/all_comments.pkl")

In [None]:
# converting strings to datetime
df_comments['video_publish_date'] = pd.to_datetime(df_comments['video_publish_date'], format='%Y-%m-%dT%H:%M:%SZ')
df_comments['comment_publish_date'] = pd.to_datetime(df_comments['comment_publish_date'], format='%Y-%m-%dT%H:%M:%SZ')

# Extracting year, and month from the datetime column
# video
df_comments['video_year'] = df_comments['video_publish_date'].dt.year
df_comments['video_month'] = df_comments['video_publish_date'].dt.month

# comment
df_comments['comment_year'] = df_comments['comment_publish_date'].dt.year
df_comments['comment_month'] = df_comments['comment_publish_date'].dt.month

# Calculate the running month
df_comments['video_running_month'] = df_comments['video_month'] + 12 * (df_comments['video_year'] - df_comments['video_year'].min())
df_comments['comment_running_month'] = df_comments['comment_month'] + 12 * (df_comments['comment_year'] - df_comments['comment_year'].min())

# define the fixed minimum date
min_date = pd.to_datetime('2017-01-01')
# Calculate the running days
df_comments['comment_running_days'] = (df_comments['comment_publish_date'] - min_date).dt.days

# filter comments after 90 days of videos' release
# Calculate the difference in days
df_comments['days_publish_date_difference'] = (df_comments['comment_publish_date'] - df_comments['video_publish_date']).dt.days

# Filter to include only comments within 90 days of the video publish date
df_timely_comments = df_comments[df_comments['days_publish_date_difference'] <= 90]

df_timely_comments = df_timely_comments.drop(columns='days_publish_date_difference')

df_timely_comments.to_pickle("../data/comments/timely_comments.pkl")

In [47]:
# Function to count words in a string
def word_count(text):
    return len(text.split())

In [None]:
# Filter out comments with less than 3 words
df_timely_comments = df_timely_comments[df_timely_comments['comment_text'].apply(word_count) >= 3]

In [45]:
df_timely_comments.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19363 entries, 84 to 77797
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   video_id               19363 non-null  object        
 1   video_title            19363 non-null  object        
 2   video_publish_date     19363 non-null  datetime64[ns]
 3   video_category_id      19363 non-null  object        
 4   comment_text           19363 non-null  object        
 5   comment_id             19363 non-null  object        
 6   comment_publish_date   19363 non-null  datetime64[ns]
 7   video_year             19363 non-null  int32         
 8   video_month            19363 non-null  int32         
 9   comment_year           19363 non-null  int32         
 10  comment_month          19363 non-null  int32         
 11  video_running_month    19363 non-null  int32         
 12  comment_running_month  19363 non-null  int32         
 13  comme

## Text Cleaning

In [None]:
# text cleaning functions

# remove all emojis
def remove_emojis(text):
    return demoji.replace(text, "")

# Function to normalize text
def normalize_text(text):
    text = re.sub(r"@[A-Za-z0-9_]+", " ", text)  # Remove @mentions
    text = re.sub(r"https?://[A-Za-z0-9./]+", " ", text)  # Remove URLs
    text = re.sub(r"https?", " ", text)  # Remove http/https
    text = re.sub(r'(.)\1+', r'\1', text)  # Remove repeated characters
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuations
    return text

In [49]:
# Apply text cleaning functions
df_timely_comments['comment_text'] = df_timely_comments['comment_text'].apply(remove_emojis).apply(normalize_text)


In [51]:
df_timely_comments['comment_text'].head(50)

84      We laugh at Sofia now but is just a mater of t...
87       quotstay in your lane girlquot strikes again LOL
103     quotthis is a god begining of my plan to domin...
114     quotDo you know where you are Sophiaquotbrbrqu...
115     this was totaly colbut Sophia was a litle bit ...
122     At a href vBg_tJvCA8zwampt298458a why do peopl...
126     this wil be a highlight rel in the future wher...
127     quotDominating the human racequotbrJust kiding...
130     when she said quotThis is a god begining of my...
135                     Sophia is the start of terminator
146     a href vBg_tJvCA8zwampt12152a quotIt39s strong...
158     Funy how humans can be so afraid of what they ...
171     Sophia creped me out She sems more real than s...
209     a href vBg_tJvCA8zwampt25345a fuck this shit I...
229     a href vBg_tJvCA8zwampt403643a ME TRYING TO GE...
261      Thank you Jimy friend me on Facebok HAHAHAHAHAHA
288     Who else was thinking about Ex Machina when th...
293     I love