# Video Analysis Model
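**TODO:** replace this with the full project description. In short, this notebook loads the video and comment datasets, flags likely spam comments, cleans the comment text, and labels each comment's sentiment with a pretrained transformer model.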

By: **Noog Troupers**

Members: **TODO** (list the team members here)


## Load datasets

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from transformers import BertTokenizer, BertForSequenceClassification, pipeline, AutoTokenizer, AutoConfig
from torch import nn
import nltk
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [24]:
# Hosted CSV shards holding the comment data. Shard number N (1-based, as
# used by Dataset.getComments) lives at index N - 1.
comment_links = [
    "https://storage.googleapis.com/dataset_hosting/comments1.csv",
    "https://storage.googleapis.com/dataset_hosting/comments2.csv",
    "https://storage.googleapis.com/dataset_hosting/comments3.csv",
    "https://storage.googleapis.com/dataset_hosting/comments4.csv",
    "https://storage.googleapis.com/dataset_hosting/comments5.csv",
]

# Hosted CSV holding the video data.
video_link = "https://storage.googleapis.com/dataset_hosting/videos.csv"

class Dataset:
    """Loaders for the hosted comment and video CSV files.

    Every method is static and reads from the URLs in ``comment_links`` /
    ``video_link`` each time it is called; nothing is cached.
    """

    @staticmethod
    def getAllComments():
        """Read every comment shard and return them as one DataFrame.

        Returns:
            pandas.DataFrame: the shards concatenated in ``comment_links``
            order, re-indexed from 0 (``ignore_index=True``).
        """
        return pd.concat(
            [pd.read_csv(csv_file) for csv_file in comment_links],
            ignore_index=True,
        )

    @staticmethod
    def getComments(dataset_id = 1):
        """Read a single comment shard.

        Args:
            dataset_id: 1-based shard number, from 1 up to and including
                ``len(comment_links)``.

        Returns:
            pandas.DataFrame: the contents of the selected shard.

        Raises:
            ValueError: if ``dataset_id`` is outside the valid range.
        """
        # range() excludes its stop value, so the upper bound needs + 1;
        # without it the last shard could never be requested.
        if dataset_id not in range(1, len(comment_links) + 1):
            raise ValueError(f"dataset_id must be between 1 and {len(comment_links)}")

        return pd.read_csv(comment_links[dataset_id - 1])

    @staticmethod
    def getVideos():
        """Read the video CSV and return it as a DataFrame."""
        return pd.read_csv(video_link)

dataset = Dataset()

In [None]:
# Videos table
videos = dataset.getVideos()

# Full comment set (4.7 million comments): use this for the actual training run
# comments = dataset.getAllComments()

# For testing, read a single comment shard (roughly 1 million comments each)
# and keep a reproducible 10% random sample of it.
comments = dataset.getComments(dataset_id=1).sample(frac=0.1, random_state=42)

In [2]:
# Fetch the NLTK data packages this notebook relies on.
for resource in ('punkt_tab', 'stopwords'):
    nltk.download(resource)

class TextPreprocessor:
    """Text-cleaning and spam-flagging helpers for comment text.

    The individual steps are static methods over token lists or strings;
    ``clean_text`` chains them into the full cleaning pipeline.
    """

    # Built on first use and then shared, so the stop-word corpus is not
    # re-read and the stemmer not re-created for every single comment.
    _stop_words = None
    _stemmer = None

    # Single characters that count as punctuation in remove_punctuation.
    _PUNCTUATION = frozenset(string.punctuation)

    # Characters ignored when is_spam measures the length of a comment.
    _EMOJI_PATTERN = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs (U+1F970 etc.)
        "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
        "\u2600-\u27BF"          # miscellaneous symbols & dingbats (U+2764 heart etc.)
        "\uFE0F\u200D"           # variation selector-16 and zero-width joiner
        "]+",
        flags=re.UNICODE,
    )

    # Substrings whose presence marks a comment as spam in is_spam.
    _SPAM_KEYWORDS = ('buy now', 'click here', 'subscribe', 'free', 'visit', 'winner', 'win', 'cash', 'prize')

    @staticmethod
    def remove_stopwords(tokens):
        """Return ``tokens`` without English stop words (case-insensitive)."""
        if TextPreprocessor._stop_words is None:
            TextPreprocessor._stop_words = frozenset(stopwords.words('english'))
        stop_words = TextPreprocessor._stop_words
        return [word for word in tokens if word.lower() not in stop_words]

    @staticmethod
    def stem_tokens(tokens):
        """Return the Porter stem of every token, in order."""
        if TextPreprocessor._stemmer is None:
            TextPreprocessor._stemmer = PorterStemmer()
        stemmer = TextPreprocessor._stemmer
        return [stemmer.stem(word) for word in tokens]

    @staticmethod
    def remove_punctuation(tokens):
        """Drop empty tokens and tokens made up only of punctuation characters.

        Tokens that merely contain punctuation (e.g. ``it's``) are kept.
        """
        # Check character by character: a substring test against
        # string.punctuation would let tokens such as "..." slip through.
        punctuation = TextPreprocessor._PUNCTUATION
        return [
            word for word in tokens
            if word and not all(char in punctuation for char in word)
        ]

    @staticmethod
    def remove_special_characters(text):
        """Delete every character that is not an ASCII letter, digit or whitespace."""
        return re.sub(r'[^a-zA-Z0-9\s]', '', text)

    @staticmethod
    def is_spam(text):
        """Heuristically flag a comment as spam.

        Args:
            text: the comment; any value is accepted and converted with
                ``str()`` (so a missing value becomes e.g. ``"nan"``).

        Returns:
            int: 1 if, after lower-casing and removing emoji, the text is
            shorter than 5 characters or contains one of the spam keywords;
            otherwise 0.
        """
        text = str(text).lower()

        # If text is shorter than 5 characters without emojis consider it spam
        text = TextPreprocessor._EMOJI_PATTERN.sub('', text)
        if len(text) < 5:
            return 1

        # NOTE(review): this is plain substring matching, so 'win' also
        # matches e.g. "window". Confirm whether whole-word matching is wanted.
        return int(any(keyword in text for keyword in TextPreprocessor._SPAM_KEYWORDS))

    def clean_text(self, text):
        """Normalise a comment for downstream text processing.

        Steps: lower-case, tokenize, remove stop words, stem, drop
        punctuation tokens, strip remaining special characters, and collapse
        whitespace.

        Args:
            text: the comment. ``None`` yields ``''``; other non-string
                values are converted with ``str()``, as ``is_spam`` does.

        Returns:
            str: the cleaned tokens joined by single spaces.
        """
        if text is None:
            return ''

        text = str(text).strip().lower()

        tokens = word_tokenize(text)
        tokens = self.remove_stopwords(tokens)
        tokens = self.stem_tokens(tokens)
        tokens = self.remove_punctuation(tokens)
        cleaned_text = self.remove_special_characters(' '.join(tokens))

        # A token made only of special characters (an emoji, say) vanishes in
        # the previous step and leaves a gap; collapse those to single spaces.
        return ' '.join(cleaned_text.split())

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data Preprocessing

In [4]:
# Count of non-null values per column; a lower count than the other columns
# marks a column with missing values.
comments.count()

kind               100000
commentId          100000
channelId          100000
videoId            100000
authorId           100000
textOriginal        99997
parentCommentId     11033
likeCount          100000
publishedAt        100000
updatedAt          100000
dtype: int64

In [5]:
# De-duplicate both tables on their ID columns
comments = comments.drop_duplicates(subset=["commentId"])
videos = videos.drop_duplicates(subset=["videoId"])

# A single preprocessor instance serves both passes below
preprocessor = TextPreprocessor()

# Note down spam comments (rows are only labelled here, not removed)
comments = comments.assign(
    isSpam=comments["textOriginal"].apply(preprocessor.is_spam)
)

# Remove spam comments
# comments = comments[comments["isSpam"] == 0]
# comments = comments.drop(columns = ["isSpam"])

# Discard rows that have no comment text
comments = comments.dropna(subset=["textOriginal"])

# Add the cleaned text (lowercased; stopwords, punctuation and special characters removed)
comments = comments.assign(
    textCleaned=comments["textOriginal"].apply(preprocessor.clean_text)
)

In [6]:
# Display the comments table after preprocessing.
comments

Unnamed: 0,kind,commentId,channelId,videoId,authorId,textOriginal,parentCommentId,likeCount,publishedAt,updatedAt,isSpam,textCleaned
987231,youtube#comment,169483,45150,44543,773251,Thank you very much 🥰 Please share 🙏💞,4367972.0,0,2022-12-24 21:25:47+00:00,2022-12-24 21:25:47+00:00,0,thank much pleas share
79954,youtube#comment,2141511,14429,69445,1911102,She looks pretty on both sides. Only big diffe...,,0,2024-08-01 11:49:33+00:00,2024-08-01 11:49:33+00:00,0,look pretti side big differ eye see
567130,youtube#comment,2544738,31438,32409,1832205,I hate straight hair & love it. Glad you like it❤,,0,2023-09-14 17:01:03+00:00,2023-09-14 17:01:03+00:00,0,hate straight hair love glad like it
500891,youtube#comment,2873452,48537,6358,647305,The texture makes you look more beautiful and ...,,152,2024-10-01 17:27:48+00:00,2024-10-01 17:27:48+00:00,0,textur make look beauti aliv
55399,youtube#comment,4383408,14492,18248,525132,Handsome,,0,2023-06-07 03:29:55+00:00,2023-06-07 03:29:55+00:00,0,handsom
...,...,...,...,...,...,...,...,...,...,...,...,...
395942,youtube#comment,4183848,49533,52820,2736082,Eu confiei e no meu dia menos estranha eu pare...,,0,2024-08-26 11:11:24+00:00,2024-08-26 11:11:24+00:00,0,eu confiei e meu dia meno estranha eu pareo mu...
417771,youtube#comment,1314456,48953,29310,296154,I'love indonesia❤❤❤❤,,0,2023-11-03 03:50:54+00:00,2023-11-03 03:50:54+00:00,0,ilov indonesia
937140,youtube#comment,4260332,23924,11515,3264222,Is it the eyebrow? Hair colour or her eyes.. D...,,0,2023-10-05 21:14:24+00:00,2023-10-05 21:14:24+00:00,0,eyebrow hair colour eye nt want shame suddenl...
794022,youtube#comment,3468699,41338,78245,1295913,I LOVE my lanage it is so amazing,,0,2024-12-27 14:35:35+00:00,2024-12-27 14:35:35+00:00,0,love lanag amaz


In [19]:
# Pick the device for the pipeline: GPU 0 when CUDA is available, otherwise -1
if torch.cuda.is_available():
    device_index = 0
else:
    device_index = -1

# model = "distilbert-base-uncased-finetuned-sst-2-english"
model = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Sentiment classifier; truncation is switched on when the pipeline is built
analyzer = pipeline("sentiment-analysis", model=model, truncation=True, device=device_index)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [21]:
# Batched sentiment inference that classifies each distinct text only once.

# Step 1: replace missing values with "" and force everything to str
texts_series = comments["textCleaned"].fillna("").astype(str)

# Step 2: collect the distinct texts so duplicates are not re-processed
unique_texts = list(texts_series.unique())

# Step 3: classify the distinct texts one batch at a time, recording text -> label
batch_size = 64  # tune this based on available memory/GPU
label_map = {}
for start in range(0, len(unique_texts), batch_size):
    batch = unique_texts[start:start + batch_size]
    out = analyzer(batch, truncation=True, max_length=512, batch_size=len(batch))
    label_map.update((text, res["label"]) for text, res in zip(batch, out))

# Step 4: look every row's text up in the mapping
comments["sentiment"] = texts_series.map(label_map)

In [22]:
# Show each cleaned comment next to its predicted sentiment label.
comments[["textCleaned", "sentiment"]]

Unnamed: 0,textCleaned,sentiment
987231,thank much pleas share,positive
79954,look pretti side big differ eye see,neutral
567130,hate straight hair love glad like it,positive
500891,textur make look beauti aliv,positive
55399,handsom,positive
...,...,...
395942,eu confiei e meu dia meno estranha eu pareo mu...,neutral
417771,ilov indonesia,neutral
937140,eyebrow hair colour eye nt want shame suddenl...,neutral
794022,love lanag amaz,positive
