In [26]:
import pandas as pd
import demoji
from langdetect import detect, DetectorFactory
import re

#from nltk.tokenize import word_tokenize
#from nltk.corpus import stopwords
#from textblob import TextBlob

In [27]:
df_comments = pd.read_pickle("../data/comments/all_comments.pkl")

In [28]:
# converting strings to datetime
df_comments['video_publish_date'] = pd.to_datetime(df_comments['video_publish_date'], format='%Y-%m-%dT%H:%M:%SZ')
df_comments['comment_publish_date'] = pd.to_datetime(df_comments['comment_publish_date'], format='%Y-%m-%dT%H:%M:%SZ')

# Extracting year, and month from the datetime column
# video
df_comments['video_year'] = df_comments['video_publish_date'].dt.year
df_comments['video_month'] = df_comments['video_publish_date'].dt.month

# comment
df_comments['comment_year'] = df_comments['comment_publish_date'].dt.year
df_comments['comment_month'] = df_comments['comment_publish_date'].dt.month

# define the fixed minimum date
min_date = pd.to_datetime('2017-01-01')

# Calculate the running month
df_comments['video_running_month'] = df_comments['video_month'] + 12 * (df_comments['video_year'] - min_date.year)
df_comments['comment_running_month'] = df_comments['comment_month'] + 12 * (df_comments['comment_year'] - min_date.year)

# Calculate the running days
df_comments['comment_running_days'] = (df_comments['comment_publish_date'] - min_date).dt.days

# filter comments after 90 days of videos' release
# Calculate the difference in days
df_comments['days_publish_date_difference'] = (df_comments['comment_publish_date'] - df_comments['video_publish_date']).dt.days

# Filter to include only comments within 90 days of the video publish date
df_timely_comments = df_comments[df_comments['days_publish_date_difference'] <= 90]

df_timely_comments = df_timely_comments.drop(columns='days_publish_date_difference')


In [4]:
# Save only timely comments to pickle format
df_timely_comments.to_pickle("../data/comments/timely_comments.pkl")

In [29]:
# Function to count words in a string
def word_count(text):
    return len(text.split())

# Filter out comments with less than 3 words
df_timely_comments = df_timely_comments[df_timely_comments['comment_text'].apply(word_count) >= 3]

#df_timely_comments.info()

## Text Cleaning

In [30]:
# Text Cleaning Functions

# remove all emojis
def remove_emojis(text):
    return demoji.replace(text, "")

# Function to normalize text (NOTE: If creating R dataframe, comment out the # Remove punctuations part)
def normalize_text(text):
    text = re.sub(r"@[A-Za-z0-9_]+", " ", text)  # Remove @mentions
    text = re.sub(r"&quot;", "", text) # Remove instances of &quot;
    text = re.sub(r"&#39;", "'", text) # Replace all instances of &#39; with '
    text = re.sub(r"<[^>]*>", " ", text) # Remove all HTML tags
    text = re.sub(r"https?://[A-Za-z0-9./]+", " ", text)  # Remove URLs
    text = re.sub(r"https?", " ", text)  # Remove http/https
    text = re.sub(r'(.)\1+', r'\1', text)  # Remove repeated characters
    #text = re.sub(r'[^\w\s]', '', text)  # Remove punctuations
    return text

# Function to detect if language is English
# ensure consistent results
DetectorFactory.seed = 0
def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False


# # Function to correct spelling errors 
# def correct_spelling(text):
#     blob = TextBlob(text)
#     corrected_text = blob.correct()
#     return str(corrected_text)

# collect all the stopwords
# stop_words = set(stopwords.words('english'))

# def remove_stop_words(text):
#     words = text.split()
#     # if word is not in stop_words, append it to the list and lower the words
#     filtered_words = [word.lower() for word in words if word.lower() not in stop_words]
#     joined_words = ' '.join(filtered_words)
    
#     return joined_words


In [31]:
# Apply text cleaning functions (NOTE: If creating R Dataframe comment out the applicaton of remove_stop_words())
df_timely_comments['comment_text'] = df_timely_comments['comment_text'].apply(remove_emojis)
df_timely_comments['comment_text'] = df_timely_comments['comment_text'].apply(normalize_text)

#df_timely_comments['comment_text'] = df_timely_comments['comment_text'].apply(correct_spelling)
#df_timely_comments['comment_text'] = df_timely_comments['comment_text'].apply(remove_stop_words)

In [32]:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_colwidth', None)  # Set the max column width to unlimited

# Apply the is_english() function to the 'comment_text' column
df_timely_comments = df_timely_comments[df_timely_comments['comment_text'].apply(is_english)]
print(df_timely_comments['comment_text'])


84       We laugh at Sofia now, but is just a mater of ...
103      this is a god begining of my plan to dominate ...
114      Do you know where you are Sophia? I'm in a dream.
115      this was totaly col.but Sophia was a litle bit...
122               At 4:58 why do people think it's a joke.
                               ...                        
77789    I found the info in this video interesting and...
77792    If you're that impresed by the humanoid robot ...
77793    While I do apreciate the story here, there are...
77796    I wanted to share this experience I had a few ...
77797    Thank you for this wonderful video! I have inc...
Name: comment_text, Length: 18505, dtype: object


In [33]:
df_filtered_text_cleaned = df_timely_comments
# Save df with cleaned text
df_filtered_text_cleaned.to_pickle("../data/comments/df-filtered_text-cleaned_comments.pkl")

In [34]:
df_filtered_text_cleaned = pd.read_pickle("../data/comments/df-filtered_text-cleaned_comments.pkl")

In [35]:
df_filtered_text_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18505 entries, 84 to 77797
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   video_id               18505 non-null  object        
 1   video_title            18505 non-null  object        
 2   video_publish_date     18505 non-null  datetime64[ns]
 3   video_category_id      18505 non-null  object        
 4   comment_text           18505 non-null  object        
 5   comment_id             18505 non-null  object        
 6   comment_publish_date   18505 non-null  datetime64[ns]
 7   video_year             18505 non-null  int32         
 8   video_month            18505 non-null  int32         
 9   comment_year           18505 non-null  int32         
 10  comment_month          18505 non-null  int32         
 11  video_running_month    18505 non-null  int32         
 12  comment_running_month  18505 non-null  int32         
 13  comme

## Filter Text with Keywords

In [36]:
# Define AI-related keywords
ai_keywords = [
    'artificial intelligence', 'ai', 'a.i.', 'a.i', 'a. i', 'a i.', 'a i', 'machine learning', 'ml', 
    'deep learning', 'neural networks', 'large language model', 'language model', 'supervised learning', 
    'unsupervised learning', 'self-driving', 'self driving', 'image recognition', 'speech recognition', 
    'automation', 'turing test', 'agi', 'artificial general intelligence', 'ani', 'artificial narrow intelligence', 
    'asi', 'artificial superintelligence', 'algorithm', 'intelligent agent', 'data mining', 'data science', 
    'computer science', 'intelligent system', 'predictive modeling', 'quantum computing', 'virtual assistant', 
    'bot', 'robot', 'gpt', 'bard', 'gemini', 'chatgpt', 'transformer', 'openai', 'dalle', 'stable diffusion', 
    'meta', 'microsoft', 'siri', 'technology', 'terminator', 'skynet', 'prompt', 'copilot'
]

# Compile regular expressions for all keywords with word boundaries
keyword_patterns = [re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE) for keyword in ai_keywords]

# Function to check if comment contains any AI-related keywords
def contains_keywords(text):
    return any(pattern.search(text) for pattern in keyword_patterns)
    #return any(keyword.lower() in text.lower() for keyword in ai_keywords)

In [37]:
# Filter comments containing AI-related keywords
df_filtered_text_keywordfiltered_and_cleaned = df_filtered_text_cleaned[df_filtered_text_cleaned['comment_text'].apply(contains_keywords)] 

df_filtered_text_keywordfiltered_and_cleaned['comment_text']


135                      Sophia is the start of terminator
444      Y'al didn't believe Trump when he said he woul...
472      Asians want human like robots. Europeans want ...
509         Wait, which one is the robot? Falon or Sophia?
1185     PART 1? THERE WIL BE MORE? YAS 3BLUE1BROWN IS ...
                               ...                        
77780    I enjoyed this video. The video made me believ...
77788    Honestly, it would be very interesting to se a...
77789    I found the info in this video interesting and...
77792    If you're that impresed by the humanoid robot ...
77793    While I do apreciate the story here, there are...
Name: comment_text, Length: 6286, dtype: object

In [38]:
# Save filtered (1) Dataframe to CSV 
df_filtered_text_keywordfiltered_and_cleaned.to_csv("../data/comments/R/filtered_and_cleaned_comments.csv", index = False)