# Video Analysis Model
Put project desc here later

By: **Noog Troupers**

Members: enter later


## Load datasets

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from transformers import BertTokenizer, BertForSequenceClassification, pipeline, AutoTokenizer, AutoConfig
from torch import nn
import nltk
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from data import Dataset as dataset

# Load video dataset
videos = dataset.getVideos()

# Load all comments (4.7 million comments use for actual training)
# comments = dataset.getAllComments()

# Load one comments dataset (~= 1 million comments each use for testing)
comments = dataset.getComments(dataset_id = 1)

In [2]:
# Used for writing agnostic code later on
device = "cuda" if torch.cuda.is_available() else "cpu"

In [26]:
nltk.download('punkt_tab')
nltk.download('stopwords')

class TextPreprocessor:
    @staticmethod
    def remove_stopwords(tokens):
        stop_words = set(stopwords.words('english'))
        return [word for word in tokens if word.lower() not in stop_words]

    @staticmethod
    def stem_tokens(tokens):
        stemmer = PorterStemmer()
        return [stemmer.stem(word) for word in tokens]

    @staticmethod
    def remove_punctuation(tokens):
        return [word for word in tokens if word not in string.punctuation]

    @staticmethod
    def remove_special_characters(text):
        return re.sub(r'[^a-zA-Z0-9\s]', '', text)

    @staticmethod
    def is_spam(text):
        text = str(text).lower()
        emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

        # If text is shorter than 5 characters without emojis consider it spam
        text = emoji_pattern.sub(r'', text)
        if len(text) < 5:
            return True

        spam_keywords = ['buy now', 'click here', 'subscribe', 'free', 'visit', 'winner', 'win', 'cash', 'prize']

        return int(any(keyword in text for keyword in spam_keywords))

    def clean_text(self, text):
        text = text.strip(' ')  # strip whitespaces
        text = text.lower()  # lowercase

        tokens = word_tokenize(text)
        filtered_tokens = self.remove_stopwords(tokens)
        stemmed_tokens = self.stem_tokens(filtered_tokens)
        punctuation_free_tokens = self.remove_punctuation(stemmed_tokens)
        cleaned_text = ' '.join(punctuation_free_tokens)
        cleaned_text = self.remove_special_characters(cleaned_text)


        return cleaned_text

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data Preprocessing

In [3]:
videos.head()

Unnamed: 0,kind,videoId,publishedAt,channelId,title,description,tags,defaultLanguage,defaultAudioLanguage,contentDuration,viewCount,likeCount,favouriteCount,commentCount,topicCategories
0,youtube#video,85806,2024-01-15 00:59:29+00:00,33807,Unlocking the Benefits of Face Masks for Skin ...,,,en-US,en-US,PT9S,72.0,0.0,0.0,0.0,"['https://en.wikipedia.org/wiki/Health', 'http..."
1,youtube#video,30556,2023-10-27 19:32:16+00:00,46650,Get ready for the Magic💚💜🤍💝✨ #hydration #glowi...,,,,,PT45S,257.0,7.0,0.0,0.0,['https://en.wikipedia.org/wiki/Lifestyle_(soc...
2,youtube#video,51771,2024-09-28 01:23:22+00:00,14346,#trending #makeup #beautymakeup #yslbeauty #lu...,,,,en-US,PT19S,164.0,4.0,0.0,2.0,['https://en.wikipedia.org/wiki/Lifestyle_(soc...
3,youtube#video,45298,2023-07-13 15:19:28+00:00,50139,#shortvedio #balayage,,,,,PT14S,1207.0,20.0,0.0,0.0,['https://en.wikipedia.org/wiki/Lifestyle_(soc...
4,youtube#video,43611,2023-04-29 18:47:37+00:00,8143,Full Face of Merit Beauty 🤎 featuring new Flus...,,,,en,PT56S,8647.0,268.0,0.0,7.0,['https://en.wikipedia.org/wiki/Lifestyle_(soc...


In [39]:
# Remove duplicates
comments = comments.drop_duplicates(subset = ["commentId"])
videos = videos.drop_duplicates(subset = ["videoId"])

# Note down spam comments
comments["isSpam"] = comments["textOriginal"].apply(TextPreprocessor().is_spam)

# Remove spam comments
# comments = comments[comments["isSpam"] == 0]
# comments = comments.drop(columns = ["isSpam"])

# Drop rows with missing comment text
comments.dropna(inplace = True, subset = ["textOriginal"])

# Clean comment text (remove stopwords, punctuation, special characters, and lowercase)
comments["textCleaned"] = comments["textOriginal"].apply(TextPreprocessor().clean_text)

In [33]:
comments

Unnamed: 0,kind,commentId,channelId,videoId,authorId,textOriginal,parentCommentId,likeCount,publishedAt,updatedAt,cleanedText
1,youtube#comment,289571,14727,79618,3043229,Apply mashed potato juice and mixed it with curd,3198066.0,0,2023-10-02 13:08:22+00:00,2023-10-02 13:08:22+00:00,appli mash potato juic mix curd
18,youtube#comment,211328,17781,87279,2454363,"Oh, I'm so glad that you found this channel th...",3686870.0,1,2023-05-09 04:40:28+00:00,2023-05-09 04:40:28+00:00,oh m glad found channel thank much let know vi...
41,youtube#comment,102498,18073,42340,2575009,Love u❤️❤️,546184.0,1,2022-03-09 18:17:00+00:00,2022-03-09 18:17:00+00:00,love u
43,youtube#comment,87579,15536,82053,894013,You’re welcome 😊,3440022.0,0,2021-09-11 08:54:14+00:00,2021-09-11 08:54:14+00:00,welcom
47,youtube#comment,270594,19550,14405,1198778,Thank you! 😊,3477720.0,1,2023-08-17 06:47:46+00:00,2023-08-17 06:47:46+00:00,thank
...,...,...,...,...,...,...,...,...,...,...,...
999924,youtube#comment,356990,2269,10948,1636508,@user-rv6vp4vs5o Thank you I really appreciate...,4499703.0,1,2024-04-26 12:24:07+00:00,2024-04-26 12:24:07+00:00,userrv6vp4vs5o thank realli appreci
999943,youtube#comment,315253,11477,69509,693215,"​@@secretsbylavendereven I, a portuguese speak...",3452566.0,0,2023-12-18 22:30:32+00:00,2024-06-04 03:38:26+00:00,secretsbylavendereven portugues speaker ca nt...
999956,youtube#comment,274174,14429,69445,954619,You're a good guy with common sense! 👍🏻,888237.0,1,2023-08-31 12:26:33+00:00,2023-08-31 12:26:33+00:00,re good guy common sens
999978,youtube#comment,15670,23872,40538,1969353,Thank you ☺️,1035150.0,1,2020-08-12 08:54:20+00:00,2020-08-12 08:54:20+00:00,thank


In [37]:
analyzer = pipeline("sentiment-analysis", model="tabularisai/multilingual-sentiment-analysis")

Device set to use cuda:0


In [41]:
analyzer("Thank you!")

comments["sentiment"] = comments["textCleaned"].apply(lambda x: analyzer(x)[0]['label'])

Token indices sequence length is longer than the specified maximum sequence length for this model (647 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The size of tensor a (647) must match the size of tensor b (512) at non-singleton dimension 1