In [19]:
import pandas as pd
import pyarrow.feather as feather
import demoji
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
from langdetect import detect, DetectorFactory
import re


In [2]:
df_comments = pd.read_pickle("../data/comments/all_comments.pkl")

In [3]:
# converting strings to datetime
df_comments['video_publish_date'] = pd.to_datetime(df_comments['video_publish_date'], format='%Y-%m-%dT%H:%M:%SZ')
df_comments['comment_publish_date'] = pd.to_datetime(df_comments['comment_publish_date'], format='%Y-%m-%dT%H:%M:%SZ')

# Extracting year, and month from the datetime column
# video
df_comments['video_year'] = df_comments['video_publish_date'].dt.year
df_comments['video_month'] = df_comments['video_publish_date'].dt.month

# comment
df_comments['comment_year'] = df_comments['comment_publish_date'].dt.year
df_comments['comment_month'] = df_comments['comment_publish_date'].dt.month

# define the fixed minimum date
min_date = pd.to_datetime('2017-01-01')

# Calculate the running month
df_comments['video_running_month'] = df_comments['video_month'] + 12 * (df_comments['video_year'] - df_comments['video_year'].min())
df_comments['comment_running_month'] = df_comments['comment_month'] + 12 * (df_comments['comment_year'] - df_comments['comment_year'].min())

# Calculate the running days
df_comments['comment_running_days'] = (df_comments['comment_publish_date'] - min_date).dt.days

# filter comments after 90 days of videos' release
# Calculate the difference in days
df_comments['days_publish_date_difference'] = (df_comments['comment_publish_date'] - df_comments['video_publish_date']).dt.days

# Filter to include only comments within 90 days of the video publish date
df_timely_comments = df_comments[df_comments['days_publish_date_difference'] <= 90]

df_timely_comments = df_timely_comments.drop(columns='days_publish_date_difference')


In [4]:
df_timely_comments.to_pickle("../data/comments/timely_comments.pkl")

In [5]:
# Function to count words in a string
def word_count(text):
    return len(text.split())

In [6]:
# Filter out comments with less than 3 words
df_timely_comments = df_timely_comments[df_timely_comments['comment_text'].apply(word_count) >= 3]

In [7]:
df_timely_comments.info()
df_timely_comments

<class 'pandas.core.frame.DataFrame'>
Index: 19402 entries, 84 to 77797
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   video_id               19402 non-null  object        
 1   video_title            19402 non-null  object        
 2   video_publish_date     19402 non-null  datetime64[ns]
 3   video_category_id      19402 non-null  object        
 4   comment_text           19402 non-null  object        
 5   comment_id             19402 non-null  object        
 6   comment_publish_date   19402 non-null  datetime64[ns]
 7   video_year             19402 non-null  int32         
 8   video_month            19402 non-null  int32         
 9   comment_year           19402 non-null  int32         
 10  comment_month          19402 non-null  int32         
 11  video_running_month    19402 non-null  int32         
 12  comment_running_month  19402 non-null  int32         
 13  comme

Unnamed: 0,video_id,video_title,video_publish_date,video_category_id,comment_text,comment_id,comment_publish_date,video_year,video_month,comment_year,comment_month,video_running_month,comment_running_month,comment_running_days
84,Bg_tJvCA8zw,Tonight Showbotics: Jimmy Meets Sophia the Hum...,2017-04-26 03:57:12,23,"We laugh at Sofia now, but is just a matter of...",Uggx5fEEMrzsYHgCoAEC,2017-04-26 10:35:26,2017,4,2017,4,4,4,115
87,Bg_tJvCA8zw,Tonight Showbotics: Jimmy Meets Sophia the Hum...,2017-04-26 03:57:12,23,&quot;stay in your lane girl&quot; strikes aga...,UggnYKq-trQl63gCoAEC,2017-04-26 11:22:16,2017,4,2017,4,4,4,115
103,Bg_tJvCA8zw,Tonight Showbotics: Jimmy Meets Sophia the Hum...,2017-04-26 03:57:12,23,&quot;this is a good begining of my plan to do...,Ugif2hNhsM8WEngCoAEC,2017-05-04 01:46:28,2017,4,2017,5,4,5,123
114,Bg_tJvCA8zw,Tonight Showbotics: Jimmy Meets Sophia the Hum...,2017-04-26 03:57:12,23,&quot;Do you know where you are Sophia?&quot;<...,Ugjd-6BJCDwa4ngCoAEC,2017-07-04 00:42:58,2017,4,2017,7,4,7,184
115,Bg_tJvCA8zw,Tonight Showbotics: Jimmy Meets Sophia the Hum...,2017-04-26 03:57:12,23,this was totally cool...but Sophia was a littl...,UgilTmlEk2XHo3gCoAEC,2017-04-26 04:01:05,2017,4,2017,4,4,4,115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77789,ABHz5oZx-WA,"I Asked AI about the Second Coming of Jesus, a...",2023-06-05 22:33:59,27,I found the info in this video interesting and...,UgzGWD7FVNAgABr3FJx4AaABAg,2023-06-12 13:31:20,2023,6,2023,6,78,78,2353
77792,ABHz5oZx-WA,"I Asked AI about the Second Coming of Jesus, a...",2023-06-05 22:33:59,27,If you&#39;re that impressed by the humanoid r...,Ugy8XkAGvtdD8r9sbj14AaABAg,2023-07-19 16:33:28,2023,6,2023,7,78,79,2390
77793,ABHz5oZx-WA,"I Asked AI about the Second Coming of Jesus, a...",2023-06-05 22:33:59,27,"While I do appreciate the story here, there ar...",UgyGST9Bzhl2K-5fB_R4AaABAg,2023-06-22 21:06:31,2023,6,2023,6,78,78,2363
77796,ABHz5oZx-WA,"I Asked AI about the Second Coming of Jesus, a...",2023-06-05 22:33:59,27,I wanted to share this experience I had a few ...,UgwI1EYOxa1-0E7TGWR4AaABAg,2023-07-08 01:01:37,2023,6,2023,7,78,79,2379


## Text Cleaning

In [15]:
# text cleaning functions

# remove all emojis
def remove_emojis(text):
    return demoji.replace(text, "")

# Function to normalize text (NOTE: If creating R dataframe, comment out the # Remove punctuations part)
def normalize_text(text):
    text = re.sub(r"@[A-Za-z0-9_]+", " ", text)  # Remove @mentions
    text = re.sub(r"&quot;", "", text) # Remove instances of &quot;
    text = re.sub(r"&#39;", "'", text) # Replace all instances of &#39; with '
    text = re.sub(r"<[^>]*>", " ", text) # Remove all HTML tags
    text = re.sub(r"https?://[A-Za-z0-9./]+", " ", text)  # Remove URLs
    text = re.sub(r"https?", " ", text)  # Remove http/https
    text = re.sub(r'(.)\1+', r'\1', text)  # Remove repeated characters
    #text = re.sub(r'[^\w\s]', '', text)  # Remove punctuations
    return text

# Function to correct spelling errors 
def correct_spelling(text):
    blob = TextBlob(text)
    corrected_text = blob.correct()
    return str(corrected_text)

# collect all the stopwords
stop_words = set(stopwords.words('english'))

# Function to detect if language is English
# ensure consistent results
DetectorFactory.seed = 0
def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False

def remove_stop_words(text):
    words = text.split()
    # if word is not in stop_words, append it to the list and lower the words
    filtered_words = [word.lower() for word in words if word.lower() not in stop_words]
    joined_words = ' '.join(filtered_words)
    
    return joined_words

# test on small data
#text = 'Hey, there is no such thing like bubble tea'
#remove_stop_words(text)

In [10]:
# Apply text cleaning functions (NOTE: If creating R Dataframe comment out the applicaton of remove_stop_words())
df_timely_comments['comment_text'] = df_timely_comments['comment_text'].apply(remove_emojis)
df_timely_comments['comment_text'] = df_timely_comments['comment_text'].apply(normalize_text)
df_timely_comments['comment_text'] = df_timely_comments['comment_text'].apply(correct_spelling)

#df_timely_comments['comment_text'] = df_timely_comments['comment_text'].apply(remove_stop_words)

In [17]:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_colwidth', None)  # Set the max column width to unlimited
# Apply the is_english() function to the 'comment_text' column
df_timely_comments = df_timely_comments[df_timely_comments['comment_text'].apply(is_english)]
print(df_timely_comments['comment_text'])


84       He laugh at Sofa now, but is just a mater of t...
103      this is a god beginning of my plan to dominate...
114      To you know where you are Sophia? I'm in a dream.
115      this was total col.but Sophia was a little bit...
122               It 4:58 why do people think it's a joke.
                               ...                        
77789    I found the into in this video interesting and...
77792    Of you're that impressed by the humanoid root ...
77793    While I do appreciate the story here, there ar...
77796    I wanted to share this experience I had a few ...
77797    Thank you for this wonderful video! I have inj...
Name: comment_text, Length: 18654, dtype: object


In [1]:
df_filter1_cleaned = df_timely_comments
df_filter1_cleaned.to_pickle("../data/comments/filter1_cleaned_comments.pkl")

NameError: name 'df_timely_comments' is not defined

In [3]:
df_filter1_cleaned = pd.read_pickle("../data/comments/filter1_cleaned_comments.pkl")

In [22]:
# Assuming your Pandas DataFrame is named 'df'
# pandas2ri.activate()

# r_dataframe = pandas2ri.py2ri(df_filter1_cleaned)

# Save the R DataFrame to an RData file
#ro.r['save'](r_dataframe, file="../data/comments/R/data.RData")

df_filter1_cleaned.to_csv("../data/comments/R/data.csv", index = False)

In [23]:
df_filter1_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18654 entries, 84 to 77797
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   video_id               18654 non-null  object        
 1   video_title            18654 non-null  object        
 2   video_publish_date     18654 non-null  datetime64[ns]
 3   video_category_id      18654 non-null  object        
 4   comment_text           18654 non-null  object        
 5   comment_id             18654 non-null  object        
 6   comment_publish_date   18654 non-null  datetime64[ns]
 7   video_year             18654 non-null  int32         
 8   video_month            18654 non-null  int32         
 9   comment_year           18654 non-null  int32         
 10  comment_month          18654 non-null  int32         
 11  video_running_month    18654 non-null  int32         
 12  comment_running_month  18654 non-null  int32         
 13  comme

## Filter Text with Keywords

In [28]:
# Define AI-related keywords
ai_keywords = ['artificial intelligence', 'machine learning', 'neural networks', 'deep learning', 'automation', 'ai', 'a.i.', 'robot', 'sophia', 'gpt', 'bard', 'gemini', 'ml', 'big data', 'large language model', 'natural language processing', 'augmented intelligence', 'prompt', 'chatgpt', 'dalee', 'stabel diffusion', 'bot', 'terminator', 'skynet']
# Function to check if comment contains any AI-related keywords
def contains_keywords(text):
    return any(keyword in text.lower() for keyword in ai_keywords)

In [32]:
# Filter comments containing AI-related keywords

df_test = df_filter1_cleaned[df_filter1_cleaned['comment_text'].apply(contains_keywords)]  

#print(df_filter1_cleaned['comment_text_keyword_filtered'])

print(df_test['comment_text'])



114      To you know where you are Sophia? I'm in a dream.
115      this was total col.but Sophia was a little bit...
130      when she said His is a god beginning of my pla...
135                     Sophia is the start of termination
158      Puny how humans can be so afraid of what they ...
                               ...                        
77777    His is going to be a big problem when of advan...
77780    I enjoyed this video. The video made me believ...
77792    Of you're that impressed by the humanoid root ...
77793    While I do appreciate the story here, there ar...
77796    I wanted to share this experience I had a few ...
Name: comment_text, Length: 4973, dtype: object
