In [2]:
import pandas as pd
import demoji
import re

In [5]:
df_comments = pd.read_pickle("../data/comments/all_comments.pkl")

In [6]:
# converting strings to datetime
df_comments['video_publish_date'] = pd.to_datetime(df_comments['video_publish_date'], format='%Y-%m-%dT%H:%M:%SZ')
df_comments['comment_publish_date'] = pd.to_datetime(df_comments['comment_publish_date'], format='%Y-%m-%dT%H:%M:%SZ')

# Extracting year, and month from the datetime column
# video
df_comments['video_year'] = df_comments['video_publish_date'].dt.year
df_comments['video_month'] = df_comments['video_publish_date'].dt.month

# comment
df_comments['comment_year'] = df_comments['comment_publish_date'].dt.year
df_comments['comment_month'] = df_comments['comment_publish_date'].dt.month

# Calculate the running month
df_comments['video_running_month'] = df_comments['video_month'] + 12 * (df_comments['video_year'] - df_comments['video_year'].min())
df_comments['comment_running_month'] = df_comments['comment_month'] + 12 * (df_comments['comment_year'] - df_comments['comment_year'].min())

# define the fixed minimum date
min_date = pd.to_datetime('2017-01-01')
# Calculate the running days
df_comments['comment_running_days'] = (df_comments['comment_publish_date'] - min_date).dt.days

# filter comments after 90 days of videos' release
# Calculate the difference in days
df_comments['days_publish_date_difference'] = (df_comments['comment_publish_date'] - df_comments['video_publish_date']).dt.days

# Filter to include only comments within 90 days of the video publish date
df_timely_comments = df_comments[df_comments['days_publish_date_difference'] <= 90]

df_timely_comments = df_timely_comments.drop(columns='days_publish_date_difference')

df_timely_comments.to_pickle("../data/comments/timely_comments.pkl")

PermissionError: [Errno 13] Permission denied: '../data/comments/timely_comments.pkl'

In [7]:
# Function to count words in a string
def word_count(text):
    return len(text.split())

In [8]:
# Filter out comments with less than 3 words
df_timely_comments = df_timely_comments[df_timely_comments['comment_text'].apply(word_count) >= 3]

In [9]:
df_timely_comments.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19402 entries, 84 to 77797
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   video_id               19402 non-null  object        
 1   video_title            19402 non-null  object        
 2   video_publish_date     19402 non-null  datetime64[ns]
 3   video_category_id      19402 non-null  object        
 4   comment_text           19402 non-null  object        
 5   comment_id             19402 non-null  object        
 6   comment_publish_date   19402 non-null  datetime64[ns]
 7   video_year             19402 non-null  int32         
 8   video_month            19402 non-null  int32         
 9   comment_year           19402 non-null  int32         
 10  comment_month          19402 non-null  int32         
 11  video_running_month    19402 non-null  int32         
 12  comment_running_month  19402 non-null  int32         
 13  comme

## Text Cleaning

In [10]:
# text cleaning functions

# remove all emojis
def remove_emojis(text):
    return demoji.replace(text, "")

# Function to normalize text
def normalize_text(text):
    text = re.sub(r"@[A-Za-z0-9_]+", " ", text)  # Remove @mentions
    text = re.sub(r"https?://[A-Za-z0-9./]+", " ", text)  # Remove URLs
    text = re.sub(r"https?", " ", text)  # Remove http/https
    text = re.sub(r'(.)\1+', r'\1', text)  # Remove repeated characters
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuations
    return text

In [11]:
# Apply text cleaning functions
df_timely_comments['comment_text'] = df_timely_comments['comment_text'].apply(remove_emojis).apply(normalize_text)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_timely_comments['comment_text'] = df_timely_comments['comment_text'].apply(remove_emojis).apply(normalize_text)


In [13]:
df_timely_comments['comment_text']

84       We laugh at Sofia now but is just a mater of t...
87        quotstay in your lane girlquot strikes again LOL
103      quotthis is a god begining of my plan to domin...
114      quotDo you know where you are Sophiaquotbrbrqu...
115      this was totaly colbut Sophia was a litle bit ...
                               ...                        
77789    I found the info in this video interesting and...
77792    If you39re that impresed by the humanoid robot...
77793    While I do apreciate the story here there are ...
77796    I wanted to share this experience I had a few ...
77797    Thank you for this wonderful video I have incu...
Name: comment_text, Length: 19402, dtype: object

In [18]:
# import packages for removing stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ishwa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [25]:
# Functions to remove stop words

# collect all the stopwords
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    words = text.split()
    # if word is not in stop_words, append it to the list and lower the words
    filtered_words = [word.lower() for word in words if word.lower() not in stop_words]
    joined_words = ' '.join(filtered_words)
    
    return joined_words

# test on small data
#text = 'Hey, there is no such thing like bubble tea'
#remove_stop_words(text)

'hey, thing like bubble tea'

In [26]:
# Apply stopword removal
df_timely_comments['comment_text'] = df_timely_comments['comment_text'].apply(remove_stop_words)

In [30]:
df_timely_comments['comment_text']

84       laugh sofia mater time tech gets god canot dif...
87                      quotstay lane girlquot strikes lol
103      quotthis god begining plan dominate human race...
114           quotdo know sophiaquotbrbrquoti39m dreamquot
115              totaly colbut sophia litle bit crepy sasy
                               ...                        
77789    found info video interesting frightening fear ...
77792    you39re impresed humanoid robot featured must ...
77793    apreciate story asumptions misinterpretations ...
77796    wanted share experience weks ago invited two n...
77797    thank wonderful video incured much loses tradi...
Name: comment_text, Length: 19402, dtype: object