In [2]:
import pandas as pd
import seaborn as sns
import matplotlib as plt
from textblob import TextBlob
from api_call import get_comments_for_videos
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
from transformers import BertTokenizer



## Reading in API Data

In [3]:
# Step 1: fetch the comments for every video of interest through the API helper.
videos = [
    'https://www.youtube.com/watch?v=7MAJfcG8B7E',
    'https://www.youtube.com/watch?v=p1Ni5ZuOVZ4',
    'https://www.youtube.com/watch?v=TW__W5AGKEQ',
    'https://www.youtube.com/watch?v=J5FLyHMV9og',
    'https://www.youtube.com/watch?v=gQscaDIRaMQ',
    'https://www.youtube.com/watch?v=p7PPpw55SZI',
    'https://www.youtube.com/watch?v=FiY2RY55YTg',
    'https://www.youtube.com/watch?v=92XoU9cSYdM',
]

# The preview below shows the returned columns: video_id, author, comment, published_at.
df = get_comments_for_videos(videos)

df.head()

Unnamed: 0,video_id,author,comment,published_at
0,7MAJfcG8B7E,@SmartMoneywithKai,⚡ ALL MY FAVOURITE TOOLS\nhttps://smartmoneywi...,2024-07-18T16:22:20Z
1,7MAJfcG8B7E,@Bogdan7a,I can clearly see that Revolut paid you. lol,2024-11-10T21:22:26Z
2,7MAJfcG8B7E,@isjeboyveghel6665,what is the value of 10K points in euros ?,2024-11-05T11:23:12Z
3,7MAJfcG8B7E,@yuvraj1566,"Using the ultra plan, if you pay via your Revo...",2024-11-03T20:39:20Z
4,7MAJfcG8B7E,@bakierol5315,spare change is scam! i have spent 130£ witho...,2024-10-30T02:41:59Z


In [4]:
# Convert published_at from ISO-8601 text (e.g. "2024-07-18T16:22:20Z") to datetimes.
# The trailing "Z" marks the values as UTC, so parse them explicitly as UTC and then
# drop the timezone: the column stays naive datetime64[ns], matching the previews
# further down, without relying on how `astype` treats timezone-aware strings.
df['published_at'] = pd.to_datetime(df['published_at'], utc=True).dt.tz_localize(None)

In [5]:
# Calendar month and year of each comment, taken from the parsed timestamp.
published = df['published_at'].dt
df['month'] = published.month
df['year'] = published.year

In [6]:

def clean_data(df):
    """Return a cleaned copy of the comments frame.

    Steps, in order:
      1. drop the ``author`` column;
      2. drop rows whose ``comment`` is missing;
      3. lowercase ``comment``;
      4. drop comments containing the substring ``"https"`` (links);
      5. drop generic praise such as "great video".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least ``author`` and ``comment`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows under their original index
        labels. The input frame is not modified.
    """
    # Build the lowercased column inside the chain instead of assigning into
    # an intermediate frame returned by `dropna`.
    cleaned = (
        df.drop(columns=['author'])
          .dropna(subset=['comment'])
          .assign(comment=lambda frame: frame['comment'].str.lower())
    )

    # "https" is a plain substring, so match it literally rather than as a regex.
    has_link = cleaned['comment'].str.contains("https", regex=False)

    # Non-capturing group: with a capturing group `str.contains` warns that the
    # pattern "has match groups", which is the warning seen in this cell's output.
    generic_comments = r'(?:good video|great video|nice video|awesome video|amazing video|cool video|interesting video)'
    is_generic = cleaned['comment'].str.contains(generic_comments)

    return cleaned[~(has_link | is_generic)]


# Run the cleaning step and preview the result.
df = df.pipe(clean_data)

df.head(n=5)

  df = df[~df.comment.str.contains(generic_comments)]


Unnamed: 0,video_id,comment,published_at,month,year
1,7MAJfcG8B7E,i can clearly see that revolut paid you. lol,2024-11-10 21:22:26,11,2024
2,7MAJfcG8B7E,what is the value of 10k points in euros ?,2024-11-05 11:23:12,11,2024
3,7MAJfcG8B7E,"using the ultra plan, if you pay via your revo...",2024-11-03 20:39:20,11,2024
4,7MAJfcG8B7E,spare change is scam! i have spent 130£ witho...,2024-10-30 02:41:59,10,2024
5,7MAJfcG8B7E,i just noticed i spent 280 euros in the last 0...,2024-10-30 00:34:43,10,2024


## Basic EDA on the comments

In [7]:
# Size of the cleaned data set: number of comments and number of distinct videos.
n_comments = df['comment'].shape[0]
n_videos = df['video_id'].drop_duplicates().shape[0]
print('Total Comments:', n_comments)
print('Unique Videos:', n_videos)

Total Comments: 412
Unique Videos: 8


In [8]:
# How many cleaned comments each video contributes, largest first.
comments_per_video = df['video_id'].value_counts()
print('Comments per video id: ', comments_per_video)

Comments per video id:  92XoU9cSYdM    130
TW__W5AGKEQ     65
p7PPpw55SZI     60
J5FLyHMV9og     56
gQscaDIRaMQ     46
7MAJfcG8B7E     23
p1Ni5ZuOVZ4     18
FiY2RY55YTg     14
Name: video_id, dtype: int64


## Sentiment Analysis

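First, using TextBlob: each comment is scored with `TextBlob(...).sentiment.polarity`, and the sign of the score gives the label.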

In [9]:
# Score every comment with TextBlob and keep the polarity rounded to 3 decimals.
polarity = [
    round(TextBlob(comment_text).sentiment.polarity, 3)
    for comment_text in df['comment']
]
df['polarity'] = polarity
print('Polarity Column added to the dataframe')

Polarity Column added to the dataframe


In [10]:
# Map the polarity sign to a label: above zero is positive, below zero is
# negative, anything else is neutral.
sentiment = [
    'positive' if score > 0 else ('negative' if score < 0 else 'neutral')
    for score in df['polarity']
]

df['NLP sentiment'] = sentiment

In [11]:
# Preview the first five rows, now including the polarity and NLP sentiment columns.
df.head(n=5)

Unnamed: 0,video_id,comment,published_at,month,year,polarity,NLP sentiment
1,7MAJfcG8B7E,i can clearly see that revolut paid you. lol,2024-11-10 21:22:26,11,2024,0.45,positive
2,7MAJfcG8B7E,what is the value of 10k points in euros ?,2024-11-05 11:23:12,11,2024,0.0,neutral
3,7MAJfcG8B7E,"using the ultra plan, if you pay via your revo...",2024-11-03 20:39:20,11,2024,0.0,neutral
4,7MAJfcG8B7E,spare change is scam! i have spent 130£ witho...,2024-10-30 02:41:59,10,2024,0.1,positive
5,7MAJfcG8B7E,i just noticed i spent 280 euros in the last 0...,2024-10-30 00:34:43,10,2024,-0.135,negative


In [12]:
# Tally the comments by the sign of their TextBlob polarity.
polarity_scores = df['polarity']
positive_count = int((polarity_scores > 0).sum())
negative_count = int((polarity_scores < 0).sum())
neutral_count = int((polarity_scores == 0).sum())
print('Reviews with Positive Sentiment based on Polarity :', positive_count)
print('Reviews with Negative Sentiment based on Polarity :', negative_count)
print('Reviews with Neutral Sentiment based on Polarity :', neutral_count)

Reviews with Positive Sentiment based on Polarity : 183
Reviews with Negative Sentiment based on Polarity : 66
Reviews with Neutral Sentiment based on Polarity : 163


: 

Now using LLMs (Transformers)

### 1. Twitter RoBERTa
This is a RoBERTa-base model trained on ~124M tweets and fine-tuned for sentiment analysis with the TweetEval benchmark.

In [13]:
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from scipy.special import softmax
import numpy as np
import pandas as pd

# Checkpoint used for the transformer-based sentiment scoring.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Load the three pieces that belong to the checkpoint: its configuration
# (which carries the id -> label mapping), its tokenizer, and the classifier.
config = AutoConfig.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

def get_sentiment(text):
    """Classify one comment with the sentiment model loaded from ``MODEL``.

    Relies on the module-level ``tokenizer``, ``model`` and ``config``.

    Parameters
    ----------
    text : str
        Comment to score. Input longer than 512 tokens is truncated.

    Returns
    -------
    tuple
        ``(label, score)``: the name of the highest-probability class as given
        by ``config.id2label``, and that class's softmax probability rounded
        to 4 decimals.
    """
    # NOTE(review): comments are passed to the tokenizer unchanged. Confirm
    # against the model card whether @mentions / URLs should be normalised first.
    import torch  # local import keeps this cell self-contained

    # Only truncation is requested. Padding a single sequence out to 512 tokens
    # just adds masked positions that the model still has to process.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)

    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        output = model(**encoded_input)

    # output[0] holds the logits; [0] selects the single sequence in the batch.
    scores = softmax(output[0][0].detach().numpy())
    best = int(np.argmax(scores))

    top_label = config.id2label[best]
    top_score = np.round(float(scores[best]), 4)

    return top_label, top_score


# Apply the sentiment analysis to each comment.
# The (label, score) pairs are collected first and the two columns are built from
# them, so an empty frame simply yields two empty columns; unpacking
# `zip(*...)` directly raises ValueError when there are no comments.
llm_results = [get_sentiment(comment_text) for comment_text in df['comment']]
df['LLM sentiment'] = [label for label, _ in llm_results]
df['LLM score'] = [score for _, score in llm_results]

print('Sentiment Analysis using LLM completed')


2024-11-15 18:24:51.764739: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequ

In [None]:
# Tally the comments by the label the transformer model assigned.
# The labels live in 'LLM sentiment' (the column written by the scoring cell);
# there is no 'sentiment' column, so reading that name raises KeyError on a fresh run.
llm_sentiment = df['LLM sentiment']
print('Reviews with Positive Sentiment based on roBERTa :', int((llm_sentiment == 'positive').sum()))
print('Reviews with Negative Sentiment based on roBERTa :', int((llm_sentiment == 'negative').sum()))
print('Reviews with Neutral Sentiment based on roBERTa :', int((llm_sentiment == 'neutral').sum()))


Reviews with Positive Sentiment based on roBERTa : 65
Reviews with Negative Sentiment based on roBERTa : 149
Reviews with Neutral Sentiment based on roBERTa : 198


In [None]:
# Preview the first five rows of the frame after the transformer-based scoring.
df.head(n=5)