In [2]:
import time
import csv
import pandas as pd
from selenium.webdriver import Chrome
import chromedriver_autoinstaller
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
import nltk
# Download the VADER lexicon; SentimentIntensityAnalyzer is used further down.
nltk.downloader.download('vader_lexicon')

# Save to CSV: (re)create tweets.csv containing only the header row.
# NOTE(review): mode "w" truncates any existing file at this path on every
# run, and nothing else in the visible code reads or appends to it -- confirm
# this step is still wanted next to the YouTube comment scraper.
with open("/mnt/c/Users/leoni/Desktop/Proyecto_webScrapping_twitter/tweets.csv", "w", newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Tweet_count", "Username", "Text", "Created At", "Retweets", "Likes"])


def _scrape_youtube_comments(youtube_video_url, scroll_count, scroll_pause, headless):
    """Load the video page in Chrome and collect what the comment section shows.

    Returns three lists of strings (comment texts, author names and
    published-time labels), each in the order the elements were found.
    """
    chromedriver_autoinstaller.install()

    options = Options()
    if headless:
        # Lets Chrome run without opening a window.
        options.add_argument("--headless=new")

    with Chrome(options=options) as driver:
        wait = WebDriverWait(driver, 10)
        driver.get(youtube_video_url)

        # Each END key press scrolls to the bottom of the page; the pause
        # between presses leaves time for more comments to load.
        for _ in range(scroll_count):
            wait.until(EC.visibility_of_element_located((By.TAG_NAME, "body"))).send_keys(Keys.END)
            time.sleep(scroll_pause)

        def collect_texts(css_selector):
            elements = wait.until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, css_selector))
            )
            return [element.text for element in elements]

        # NOTE(review): the three lists are later paired up by position, which
        # assumes every comment exposes all three elements -- confirm.
        comments = collect_texts("#comment #content-text")
        authors = collect_texts("#header-author #author-text")
        times = collect_texts("#header-author #published-time-text")

    return comments, authors, times


def _clean_text(text, stop_words, lemmatizer):
    """Normalise *text*: lower-case, strip mentions/hashtags, punctuation and stop words, lemmatize."""
    text = str(text).lower()

    # Mentions and hashtags have to go first: once punctuation is stripped the
    # '@' / '#' markers are gone and the names would survive as ordinary words.
    text = re.sub(r'[@#][\w.-]+', ' ', text)

    # Replace new line characters, then drop punctuation.
    text = re.sub(r'\n', ' ', text)
    text = re.sub('[%s]' % re.escape(punctuation), "", text)

    # Collapse whitespace runs and blank out any remaining non-word characters.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\W', ' ', text)

    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)


def _label_sentiment(scores):
    """Map VADER polarity scores to 'Negative', 'Positive' or 'Neutral' (on a tie)."""
    negative, positive = scores["neg"], scores["pos"]
    if negative > positive:
        return 'Negative'
    if positive > negative:
        return 'Positive'
    return 'Neutral'


def process_youtube_comments(youtube_video_url, category, *, scroll_count=4,
                             scroll_pause=3, headless=False):
    """Scrape a YouTube video's comments and return the negative, on-topic ones.

    Args:
        youtube_video_url: URL of the video page to open.
        category: Iterable of keyword strings. A comment is kept when its
            cleaned text contains at least one of them (substring match).
            Keywords go through the same cleaning as the comments, so plural
            forms such as 'elections' still match the lemmatized text. If no
            usable keyword remains, no keyword filtering is applied.
        scroll_count: How many times to scroll to the bottom of the page;
            a higher value loads more comments.
        scroll_pause: Seconds to wait after each scroll.
        headless: Run Chrome without a window.

    Returns:
        A DataFrame with the columns 'comment' (cleaned text), 'author',
        'time' and 'Sentiment', holding only rows labelled 'Negative' that
        match a keyword.

    Raises:
        selenium.common.exceptions.TimeoutException: if the page body or the
            comment elements do not appear within the 10 second wait.
    """
    comments, authors, times = _scrape_youtube_comments(
        youtube_video_url, scroll_count, scroll_pause, headless
    )

    # Columns of different length are padded with NaN by concat; dropna then
    # keeps only the rows for which all three values are present.
    combined_df = pd.concat(
        [
            pd.DataFrame(comments, columns=['comment']),
            pd.DataFrame(authors, columns=['author']),
            pd.DataFrame(times, columns=['time']),
        ],
        axis=1,
    ).dropna()

    # Language settings
    idioma = "english"
    stop_words = set(stopwords.words(idioma))  # set: O(1) membership per token
    lemmatizer = WordNetLemmatizer()

    combined_df['comment'] = [
        _clean_text(text, stop_words, lemmatizer) for text in combined_df['comment']
    ]

    # Sentiment analysis: score every comment once and keep only the label.
    # NOTE(review): scoring runs on the cleaned text, i.e. after stop words
    # (NLTK's English list includes negations such as "not") were removed --
    # confirm this is intended, since it can change a comment's polarity.
    analyzer = SentimentIntensityAnalyzer()
    combined_df["Sentiment"] = [
        _label_sentiment(analyzer.polarity_scores(text)) for text in combined_df['comment']
    ]

    is_negative = combined_df['Sentiment'] == 'Negative'

    # Clean the keywords like the comments and escape them so they are matched
    # literally instead of being interpreted as regular-expression syntax.
    keywords = [_clean_text(keyword, stop_words, lemmatizer) for keyword in category]
    keywords = [keyword for keyword in keywords if keyword]
    if not keywords:
        return combined_df[is_negative]

    pattern = "|".join(re.escape(keyword) for keyword in keywords)
    on_topic = combined_df['comment'].str.contains(pattern, regex=True)

    # Both conditions must hold; the keyword filter used to be overwritten.
    return combined_df[is_negative & on_topic]

# Example usage of the function
youtube_video_url = "https://www.youtube.com/watch?v=hmL8al8twIE"
# Keywords passed as the `category` argument.
category = ['trump', 'kamala', 'president', 'united states', 'elections', 'usa', 'winner', 'loser']
filtered_df = process_youtube_comments(youtube_video_url, category)
print(filtered_df)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/scavenger/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


WebDriverException: Message: disconnected: received Inspector.detached event
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: chrome=123.0.6312.122)
Stacktrace:
#0 0x561e876c4863 <unknown>
#1 0x561e873ba8c6 <unknown>
#2 0x561e873a36f6 <unknown>
#3 0x561e873a2e8a <unknown>
#4 0x561e873a2638 <unknown>
#5 0x561e873a2485 <unknown>
#6 0x561e873a0516 <unknown>
#7 0x561e873a0a2a <unknown>
#8 0x561e8739f651 <unknown>
#9 0x561e873a6fdc <unknown>
#10 0x561e8739f3d9 <unknown>
#11 0x561e873a214d <unknown>
#12 0x561e873a0516 <unknown>
#13 0x561e873a0a2a <unknown>
#14 0x561e8739f651 <unknown>
#15 0x561e87396ec2 <unknown>
#16 0x561e8739f3d9 <unknown>
#17 0x561e8739ec9c <unknown>
#18 0x561e8739e74b <unknown>
#19 0x561e873bcb82 <unknown>
#20 0x561e87390fcd <unknown>
#21 0x561e87390abf <unknown>
#22 0x561e87433ce7 <unknown>
#23 0x561e87427343 <unknown>
#24 0x561e873f8593 <unknown>
#25 0x561e873f8f5e <unknown>
#26 0x561e8768884b <unknown>
#27 0x561e8768c7a5 <unknown>
#28 0x561e87676571 <unknown>
#29 0x561e8768d332 <unknown>
#30 0x561e8765b87f <unknown>
#31 0x561e876b3728 <unknown>
#32 0x561e876b38fb <unknown>
#33 0x561e876c39b4 <unknown>
#34 0x7fb043f0b609 start_thread
