# Sentiment Analysis

## Import the Packages

In [1]:
#import the required packages

import networkx as nx
import pandas as pd
import numpy as np 
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.sentiment.vader import SentimentIntensityAnalyzer


[nltk_data] Downloading package punkt to
[nltk_data]     /home/srivatsanms_2022/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/srivatsanms_2022/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/srivatsanms_2022/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data Preparation

In [3]:
#load the data and drop the unnamed column

df_dialogue = pd.read_excel(r'cleaned_data.xlsx', sheet_name='dialogue')
df_dialogue.drop(columns="Unnamed: 0",inplace=True)
df_dialogue.head()

Unnamed: 0,Movie ID,Node_X,Node_Y,Dialogue,Node_pair
0,1,Albus Dumbledore,Minerva McGonagall,I should have known that you would be here...P...,"Albus Dumbledore,Minerva McGonagall"
1,1,Minerva McGonagall,Albus Dumbledore,"Good evening, Professor Dumbledore. Are the ru...","Minerva McGonagall,Albus Dumbledore"
2,1,Albus Dumbledore,Minerva McGonagall,"I'm afraid so, Professor. The good, and the bad.","Albus Dumbledore,Minerva McGonagall"
3,1,Minerva McGonagall,Albus Dumbledore,And the boy?,"Minerva McGonagall,Albus Dumbledore"
4,1,Albus Dumbledore,Minerva McGonagall,Hagrid is bringing him.,"Albus Dumbledore,Minerva McGonagall"


In [4]:
#Initialize Voldemort Aliases Reference Dictionary

dictionary = {"He-Who-Must-Not-Be-Named":"Voldemort","You-Know-Who":"Voldemort","Voldemort":"Voldemort","Tom":"Voldemort","Riddle":"Voldemort","Dark Lord":"Voldemort","My Lord":"Voldemort"}

In [5]:
#initializing the corpus

corpus_raw=[]
for line in df_dialogue["Dialogue"]:
        corpus_raw.append(line)
        
print(corpus_raw[0])

I should have known that you would be here...Professor McGonagall.


In [6]:
#converting the corpus into lowercase

corpus = [doc.lower() for doc in corpus_raw]
print(corpus[0])

i should have known that you would be here...professor mcgonagall.


In [7]:
#tokenization

corpus = [word_tokenize(doc) for doc in corpus]
print(corpus[0])

['i', 'should', 'have', 'known', 'that', 'you', 'would', 'be', 'here', '...', 'professor', 'mcgonagall', '.']


In [8]:
# to check if a token is valid, remove punctuation

def is_valid_token(token):
    return token[0].isalpha()


corpus = [[token for token in doc if is_valid_token(token)] for doc in corpus]
print(corpus[0])

['i', 'should', 'have', 'known', 'that', 'you', 'would', 'be', 'here', 'professor', 'mcgonagall']


In [9]:
# extract stop words

stop_set = set(stopwords.words('english'))

# remove stop words

corpus = [[token for token in doc if token not in stop_set] for doc in corpus]
print(corpus[0])

['known', 'would', 'professor', 'mcgonagall']


In [10]:
#lemmatization

nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()
corpus = [[lemmatizer.lemmatize(token) for token in doc] for doc in corpus]

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/srivatsanms_2022/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [11]:
#print the corpus to corss-verify

print(corpus[16])

['good', 'luck', 'harry', 'potter']


In [12]:
#Function defined to identify Node Z (Voldemort being referred to as the third person)

def identify_node_z(corpus):
    node_z = ""
    for name in dictionary:
        if name.lower() in corpus:
            node_z = dictionary[name]
            break
        else:
            continue
            
    return node_z

In [13]:
#Introduce and Update a new column "Node_Z" in the data frame

df_dialogue["Node_Z"] = [identify_node_z(i) for i in corpus]

In [14]:
#Cross-verify the Node Z results

df_dialogue.iloc[150]

Movie ID                                                     1
Node_X                                      Garrick Ollivander
Node_Y                                            Harry Potter
Dialogue     Oh, we do not speak his name. The wand chooses...
Node_pair                      Garrick Ollivander,Harry Potter
Node_Z                                               Voldemort
Name: 150, dtype: object

## Sentiment Analysis - VADER

In [15]:
# loading and using the compound score of sentiment model

def compute_sentiment(corpus):
    model = SentimentIntensityAnalyzer()
    ps = model.polarity_scores(corpus)

    return ps['compound']

In [16]:
#Download the VADER lexicon and Introduce a column "Sentiment_Score" to update the Compound Sentiment Score

nltk.download('vader_lexicon')
df_dialogue["Sentiment Score"] = [compute_sentiment(i) for i in df_dialogue["Dialogue"]]

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/srivatsanms_2022/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [17]:
df_dialogue.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6571 entries, 0 to 6570
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Movie ID         6571 non-null   int64  
 1   Node_X           6571 non-null   object 
 2   Node_Y           6571 non-null   object 
 3   Dialogue         6571 non-null   object 
 4   Node_pair        6571 non-null   object 
 5   Node_Z           6571 non-null   object 
 6   Sentiment Score  6571 non-null   float64
dtypes: float64(1), int64(1), object(5)
memory usage: 359.5+ KB


In [19]:
#Identify the supporters of Lord Voldemort, compound sentiment score values more than 0

supporters= []
for index,row in df_dialogue.iterrows():
    if row["Sentiment Score"]>0 and (row["Node_Z"] == "Voldemort" or row["Node_Y"] == "Voldemort" or row["Node_Y"] == "Tom Riddle"):
        supporters.append(row["Node_X"])

In [20]:
print(set(supporters))

{'Albus Dumbledore', 'Arthur Weasley', 'Lucius Malfoy', 'Cornelius Fudge', 'Narcissa Malfoy', 'Xenophilius Lovegood', 'Fred Weasley', 'Severus Snape', 'Tom Riddle', 'Bellatrix Lestrange', 'Harry Potter', 'Barty Crouch Jr.', 'Voldemort', 'Stanley Shunpike', 'Remus Lupin', 'Cedric Diggory', 'Dobby', 'Moaning Myrtle', 'Garrick Ollivander', 'Pius Thicknesse', 'Neville Longbottom', 'Elphias Doge', 'Gellert Grindelwald', 'Horace Slughorn', 'Rubeus Hagrid', 'Peter Pettigrew', 'Sirius Black', 'Ginny Weasley', 'Hermione Granger'}


In [21]:
#Identify the enemies of Lord Voldemort, compound sentiment score values less than 0

enemies= []
for index,row in df_dialogue.iterrows():
    if row["Sentiment Score"]<0 and (row["Node_Z"] == "Voldemort" or row["Node_Y"] == "Voldemort" or row["Node_Y"] == "Tom Riddle"):
        enemies.append(row["Node_X"])
        
print(set(enemies))

{'Albus Dumbledore', 'Arthur Weasley', 'Lucius Malfoy', 'Corban Yaxley', 'Cornelius Fudge', 'Fred Weasley', 'Severus Snape', 'Tom Riddle', 'Bellatrix Lestrange', 'Harry Potter', 'Mykew Gregorovitch', 'Mrs. Cole', 'Voldemort', 'Remus Lupin', 'George Weasley', 'Minerva McGonagall', 'Gellert Grindelwald', 'Horace Slughorn', 'Rubeus Hagrid', 'Peter Pettigrew', 'Ginny Weasley', 'Sirius Black', 'Ron Weasley', 'Quirinus Quirrell', 'Hermione Granger'}


## Sentiment Analysis - BERT

In [22]:
# !pip install transformers
import transformers
import tensorflow as tf
import pandas as pd
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


  from .autonotebook import tqdm as notebook_tqdm
2023-03-24 17:20:33.092836: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-24 17:20:37.424390: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-24 17:20:37.433327: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
All model checkpoint layers

In [23]:
#loading the imdb dataset from the url for training the BERT model

URL = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file(fname="aclImdb_v1.tar.gz", 
                                  origin=URL,
                                  untar=True,
                                  cache_dir='.',
                                  cache_subdir='')

In [24]:
# The shutil module offers a number of high-level 
# operations on files and collections of files.
import os
import shutil
# Create main directory path ("/aclImdb")
main_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
# Create sub directory path ("/aclImdb/train")
train_dir = os.path.join(main_dir, 'train')
# Remove unsup folder since this is a supervised learning task
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)
# View the final train folder
print(os.listdir(train_dir))

['labeledBow.feat', 'pos', 'unsupBow.feat', 'urls_pos.txt', 'urls_unsup.txt', 'neg', 'urls_neg.txt']


In [25]:
# We create a training dataset and a validation 
# dataset from our "aclImdb/train" directory with a 80/20 split.
train = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', batch_size=30000, validation_split=0.2, 
    subset='training', seed=123)
test = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', batch_size=30000, validation_split=0.2, 
    subset='validation', seed=123)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [26]:
for i in train.take(1):
  train_feat = i[0].numpy()
  train_lab = i[1].numpy()

train = pd.DataFrame([train_feat, train_lab]).T
train.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
train['DATA_COLUMN'] = train['DATA_COLUMN'].str.decode("utf-8")
train.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,Canadian director Vincenzo Natali took the art...,1
1,I gave this film 10 not because it is a superb...,1
2,I admit to being somewhat jaded about the movi...,1
3,"For a long time, 'The Menagerie' was my favori...",1
4,A truly frightening film. Feels as if it were ...,0


In [27]:
for j in test.take(1):
  test_feat = j[0].numpy()
  test_lab = j[1].numpy()

test = pd.DataFrame([test_feat, test_lab]).T
test.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
test['DATA_COLUMN'] = test['DATA_COLUMN'].str.decode("utf-8")
test.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,I can't believe that so much talent can be was...,0
1,This movie blows - let's get that straight rig...,0
2,"The saddest thing about this ""tribute"" is that...",0
3,I'm only rating this film as a 3 out of pity b...,0
4,Something surprised me about this movie - it w...,1


In [28]:
InputExample(guid=None,
             text_a = "Hello, world",
             text_b = None,
             label = 1)

InputExample(guid=None, text_a='Hello, world', text_b=None, label=1)

In [29]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN): 
  train_InputExamples = train.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)

  validation_InputExamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)
  
  return train_InputExamples, validation_InputExamples

  train_InputExamples, validation_InputExamples = convert_data_to_examples(train, 
                                                                           test, 
                                                                           'DATA_COLUMN', 
                                                                           'LABEL_COLUMN')
  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] # -> will hold InputFeatures to be converted later

    for e in examples:
        # Documentation is really strong for this method, so please take a look at it
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length, # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length
            truncation=True
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
            input_dict["token_type_ids"], input_dict['attention_mask'])

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'

In [30]:
#Split the data into train and validation data

train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN)

train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)

validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)



In [31]:
#Train the model

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

model.fit(train_data, epochs=2, validation_data=validation_data)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7feca4611460>

In [32]:
#Intialize a few sample sentences to test the model

pred_sentences = ['This was an awesome movie. I watch it twice my time watching this beautiful movie if I have known it was this good',
                  'One of the worst movies of all time. I cannot believe I wasted two hours of my life for this movie','This was a pretty bad movie',"I want everything to be perfect for my Dudley's special day.","But that's Dudley's old uniform. It'll fit me like bits of old Elephant skin."]

In [33]:
len(''.join(pred_sentences))

376

In [34]:
#Test the model with the defined sample sentences

tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
labels = ['Negative','Positive']
label = tf.argmax(tf_predictions, axis=1)
label = label.numpy()
for i in range(len(pred_sentences)):
  print(pred_sentences[i], ": \n", labels[label[i]])

This was an awesome movie. I watch it twice my time watching this beautiful movie if I have known it was this good : 
 Positive
One of the worst movies of all time. I cannot believe I wasted two hours of my life for this movie : 
 Negative
This was a pretty bad movie : 
 Negative
I want everything to be perfect for my Dudley's special day. : 
 Positive
But that's Dudley's old uniform. It'll fit me like bits of old Elephant skin. : 
 Negative


In [35]:
dialogs = list(df_dialogue["Dialogue"])
len(dialogs)

6571

In [36]:
# Run the sentiment analysis model on the actual movie script dataset in batches and retrieve the negative and postive scores
labels = ['Negative','Positive']
batch_size = 1000
index = 0
sentiment_label = []
while index < len(dialogs):
  print(index)
  tf_batch = tokenizer(dialogs[index:(index+batch_size)], max_length=128, padding=True, truncation=True, return_tensors='tf')
  tf_outputs = model(tf_batch)
  tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
  label = tf.argmax(tf_predictions, axis=1)
  neg_lt = []
  pos_lt = []
  for pred in tf_predictions:
    neg_lt.append(pred[0])
    pos_lt.append(pred[1])
    
  neg_score = np.array(neg_lt)
  pos_score = np.array(pos_lt)
  
  label = label.numpy()
  if index == 0:
    sentiment_label = label
    sentiment_score_pos = pos_score
    sentiment_score_neg = neg_score
  else:
    sentiment_label = np.append(sentiment_label,label)
    sentiment_score_pos = np.append(sentiment_score_pos,pos_score)
    sentiment_score_neg = np.append(sentiment_score_neg,neg_score)
  index = index + batch_size
# for i in range(len(dialogs)):
#   print(dialogs[i], ": \n", labels[label[i]])

0
1000
2000
3000
4000
5000
6000


In [37]:
sentiment_score_pos

array([0.99513453, 0.9338689 , 0.8971875 , ..., 0.7088668 , 0.7925481 ,
       0.98509896], dtype=float32)

In [38]:
sentiment_score_neg

array([0.00486546, 0.06613106, 0.10281249, ..., 0.29113328, 0.2074519 ,
       0.01490109], dtype=float32)

In [39]:
##Introduce 3 new columns to save the sentiment score values

df_dialogue["Bert Sentiment Score"] = [labels[sentiment_label[i]] for i in range(len(dialogs))]
df_dialogue['Bert Sentiment Pos'] = [sentiment_score_pos[i] for i in range(len(dialogs))]
df_dialogue['Bert Sentiment Neg'] = [sentiment_score_neg[i] for i in range(len(dialogs))]

In [40]:
#Identify the supporters of Lord Voldemort by directly fetching the scores with the label Positive
supporters= []
for index,row in df_dialogue.iterrows():
    if row["Bert Sentiment Score"]=="Positive" and (row["Node_Y"] == "Voldemort" or row["Node_Z"]=="Voldemort"):
        supporters.append(row["Node_X"])

print(set(supporters))

{'Albus Dumbledore', 'Arthur Weasley', 'Lucius Malfoy', 'Corban Yaxley', 'Cornelius Fudge', 'Narcissa Malfoy', 'Xenophilius Lovegood', 'Fred Weasley', 'Diary', 'Severus Snape', 'Pansy Parkinson', 'Tom Riddle', 'Bellatrix Lestrange', 'Harry Potter', 'Mykew Gregorovitch', 'Barty Crouch Jr.', 'Voldemort', 'Mrs. Cole', 'Remus Lupin', 'George Weasley', 'Cedric Diggory', 'Dobby', 'Minerva McGonagall', 'Moaning Myrtle', 'Garrick Ollivander', 'Percy Weasley', 'Pius Thicknesse', 'Neville Longbottom', 'Elphias Doge', 'Gellert Grindelwald', 'Horace Slughorn', 'Filius Flitwick', 'Rubeus Hagrid', 'Peter Pettigrew', 'Ginny Weasley', 'Sirius Black', 'Ron Weasley', 'Quirinus Quirrell', 'Hermione Granger'}


In [41]:
#Identify the enemies of Lord Voldemort by directly fetching the scores with the label Negative
enemies= []
for index,row in df_dialogue.iterrows():
    if row["Bert Sentiment Score"]=="Negative" and (row["Node_Z"] == "Voldemort" or row["Node_Y"] == "Voldemort" or row["Node_Y"] == "Tom Riddle"):
        enemies.append(row["Node_X"])
        
print(set(enemies))

{'Albus Dumbledore', 'Bellatrix Lestrange', 'Rubeus Hagrid', 'Harry Potter', 'Stanley Shunpike', 'Sirius Black', 'Ron Weasley', 'Hermione Granger', 'Severus Snape', 'Horace Slughorn'}


### Analyzing to improve the performance of the model

In [42]:
#Checking the values for the character Remus Lupin in Node_X referring to Lord Voldemort as third person in the dialogs

df_dialogue[(df_dialogue['Node_X']=='Remus Lupin') & (df_dialogue['Node_Z']=='Voldemort')]

Unnamed: 0,Movie ID,Node_X,Node_Y,Dialogue,Node_pair,Node_Z,Sentiment Score,Bert Sentiment Score,Bert Sentiment Pos,Bert Sentiment Neg
1883,3,Remus Lupin,Harry Potter,You want to know why I stopped you facing that...,"Remus Lupin,Harry Potter",Voldemort,0.2732,Positive,0.944806,0.055194
2286,3,Remus Lupin,Peter Pettigrew,"You sold James and Lily to Voldemort, didn't you?","Remus Lupin,Peter Pettigrew",Voldemort,0.0,Positive,0.984887,0.015113
2291,3,Remus Lupin,Peter Pettigrew,"You should've realized, Peter, that if Voldemo...","Remus Lupin,Peter Pettigrew",Voldemort,0.6511,Positive,0.996573,0.003427
3287,5,Remus Lupin,Harry Potter,Fudge likes being Minister of Magic. If it tur...,"Remus Lupin,Harry Potter",Voldemort,0.4215,Positive,0.98528,0.01472
4713,6,Remus Lupin,Harry Potter,Voldemort has chosen Draco Malfoy for a mission?,"Remus Lupin,Harry Potter",Voldemort,0.0,Positive,0.972879,0.027121
5255,7,Remus Lupin,Harry Potter,We've been betrayed. Voldemort knew you were ...,"Remus Lupin,Harry Potter",Voldemort,-0.4019,Positive,0.994419,0.005581


In [43]:
#Introducing a new column to save the difference between the positive and negative Bert Score

df_dialogue['Bert_Sentiment_Score_Diff'] = df_dialogue['Bert Sentiment Pos'] - df_dialogue['Bert Sentiment Neg']

In [44]:
df_dialogue[(df_dialogue['Node_X']=='Remus Lupin') & (df_dialogue['Node_Z']=='Voldemort')]

Unnamed: 0,Movie ID,Node_X,Node_Y,Dialogue,Node_pair,Node_Z,Sentiment Score,Bert Sentiment Score,Bert Sentiment Pos,Bert Sentiment Neg,Bert_Sentiment_Score_Diff
1883,3,Remus Lupin,Harry Potter,You want to know why I stopped you facing that...,"Remus Lupin,Harry Potter",Voldemort,0.2732,Positive,0.944806,0.055194,0.889611
2286,3,Remus Lupin,Peter Pettigrew,"You sold James and Lily to Voldemort, didn't you?","Remus Lupin,Peter Pettigrew",Voldemort,0.0,Positive,0.984887,0.015113,0.969774
2291,3,Remus Lupin,Peter Pettigrew,"You should've realized, Peter, that if Voldemo...","Remus Lupin,Peter Pettigrew",Voldemort,0.6511,Positive,0.996573,0.003427,0.993146
3287,5,Remus Lupin,Harry Potter,Fudge likes being Minister of Magic. If it tur...,"Remus Lupin,Harry Potter",Voldemort,0.4215,Positive,0.98528,0.01472,0.97056
4713,6,Remus Lupin,Harry Potter,Voldemort has chosen Draco Malfoy for a mission?,"Remus Lupin,Harry Potter",Voldemort,0.0,Positive,0.972879,0.027121,0.945758
5255,7,Remus Lupin,Harry Potter,We've been betrayed. Voldemort knew you were ...,"Remus Lupin,Harry Potter",Voldemort,-0.4019,Positive,0.994419,0.005581,0.988838


In [45]:
df_dialogue[(df_dialogue['Node_X']=='Aberforth Dumbledore') & (df_dialogue['Node_Y']=='Voldemort')]

Unnamed: 0,Movie ID,Node_X,Node_Y,Dialogue,Node_pair,Node_Z,Sentiment Score,Bert Sentiment Score,Bert Sentiment Pos,Bert Sentiment Neg,Bert_Sentiment_Score_Diff


In [46]:
df_dialog_voldemort = df_dialogue[((df_dialogue["Node_Y"] == "Voldemort") | (df_dialogue["Node_Z"]=="Voldemort"))&(df_dialogue['Node_X']!='Voldemort')]
df_dialog_voldemort[df_dialog_voldemort['Node_X']=='Voldemort']

Unnamed: 0,Movie ID,Node_X,Node_Y,Dialogue,Node_pair,Node_Z,Sentiment Score,Bert Sentiment Score,Bert Sentiment Pos,Bert Sentiment Neg,Bert_Sentiment_Score_Diff


In [47]:
#Grouping the values by the Node_X characters' names and computing the mean of the difference in the BERT sentiment score

df_sentiment_avg = df_dialog_voldemort.groupby('Node_X')['Bert_Sentiment_Score_Diff'].mean()
df_sentiment_avg = df_sentiment_avg.reset_index(name = 'Bert_Score_Avg')
df_sentiment_avg

Unnamed: 0,Node_X,Bert_Score_Avg
0,Albus Dumbledore,0.874191
1,Arthur Weasley,0.855574
2,Barty Crouch Jr.,0.992007
3,Bellatrix Lestrange,0.608674
4,Cedric Diggory,0.39706
5,Corban Yaxley,0.936486
6,Cornelius Fudge,0.954566
7,Diary,0.992174
8,Dobby,0.485234
9,Elphias Doge,0.997288


In [48]:
#Fetching those ids where the above values are lsited 
df_sentiment_diff = df_dialog_voldemort.loc[df_dialog_voldemort.groupby('Node_X')['Bert_Sentiment_Score_Diff'].idxmin()]
df_sentiment_diff

Unnamed: 0,Movie ID,Node_X,Node_Y,Dialogue,Node_pair,Node_Z,Sentiment Score,Bert Sentiment Score,Bert Sentiment Pos,Bert Sentiment Neg,Bert_Sentiment_Score_Diff
5109,6,Albus Dumbledore,Draco Malfoy,"Like cursing Katie Bell and hoping she would, ...","Albus Dumbledore,Draco Malfoy",Voldemort,-0.8845,Negative,0.144803,0.855197,-0.710394
4735,6,Arthur Weasley,Harry Potter,They were all the rage when Voldemort first ro...,"Arthur Weasley,Harry Potter",Voldemort,-0.3716,Positive,0.856564,0.143436,0.713129
2497,4,Barty Crouch Jr.,Voldemort,I will not disappoint you my lord.,"Barty Crouch Jr.,Voldemort",,0.3089,Positive,0.994972,0.005028,0.989944
5195,7,Bellatrix Lestrange,Voldemort,"My lord, if I might, I'd like to volunteer mys...","Bellatrix Lestrange,Voldemort",,-0.1779,Negative,0.455644,0.544356,-0.088713
3077,4,Cedric Diggory,Voldemort,Who are you? What do you want?,"Cedric Diggory,Voldemort",,0.168,Positive,0.69853,0.30147,0.39706
6448,8,Corban Yaxley,Voldemort,"No sign of him, my Lord.","Corban Yaxley,Voldemort",,-0.296,Positive,0.954464,0.045536,0.908928
1687,3,Cornelius Fudge,Harry Potter,"Nothing, of course. You're safe. And that's wh...","Cornelius Fudge,Harry Potter",Voldemort,0.4588,Positive,0.971212,0.028788,0.942424
1296,2,Diary,Harry Potter,"Hello Harry Potter, my name is Tom Riddle.","Diary,Harry Potter",Voldemort,0.0,Positive,0.996087,0.003913,0.992174
1143,2,Dobby,Harry Potter,"Not kill you, sir, never kill you! Dobby remem...","Dobby,Harry Potter",Voldemort,0.9456,Positive,0.742617,0.257383,0.485234
5336,7,Elphias Doge,Harry Potter,"Well, he treasured you, Mr. Potter, I can atte...","Elphias Doge,Harry Potter",Voldemort,0.4019,Positive,0.998644,0.001356,0.997288


In [49]:
# test_df[(test_df["Node_Y"] == "Voldemort") | (test_df["Node_Z"]=="Voldemort")]

In [50]:
#Identify the supporters of Lord Voldemort by setting a threshold value as 0.85 after analyzing the results

supporters= []
for index,row in df_sentiment_avg.iterrows():
    if row["Bert_Score_Avg"]>0.85:
        supporters.append(row["Node_X"])

print(supporters)

['Albus Dumbledore', 'Arthur Weasley', 'Barty Crouch Jr.', 'Corban Yaxley', 'Cornelius Fudge', 'Diary', 'Elphias Doge', 'Fred Weasley', 'Garrick Ollivander', 'Gellert Grindelwald', 'Ginny Weasley', 'Lucius Malfoy', 'Minerva McGonagall', 'Moaning Myrtle', 'Mrs. Cole', 'Mykew Gregorovitch', 'Narcissa Malfoy', 'Neville Longbottom', 'Pansy Parkinson', 'Percy Weasley', 'Peter Pettigrew', 'Pius Thicknesse', 'Quirinus Quirrell', 'Remus Lupin', 'Tom Riddle', 'Xenophilius Lovegood']


## Final Results of both VADER and BERT (Supporters & Enemies of Voldemort)

### BERT results

In [51]:
#Fetching the top 10 enemies of Lord Voldemort

df_sentiment_avg.sort_values('Bert_Score_Avg')[:10]

Unnamed: 0,Node_X,Bert_Score_Avg
36,Stanley Shunpike,-0.471326
32,Ron Weasley,-0.189143
10,Filius Flitwick,0.254898
4,Cedric Diggory,0.39706
8,Dobby,0.485234
3,Bellatrix Lestrange,0.608674
35,Sirius Black,0.693309
33,Rubeus Hagrid,0.700187
34,Severus Snape,0.734973
14,George Weasley,0.752459


In [52]:
#Fetching the top 10 supporters of Voldemort

df_sentiment_avg.sort_values('Bert_Score_Avg',ascending=False)[:10]

Unnamed: 0,Node_X,Bert_Score_Avg
23,Mykew Gregorovitch,0.997941
24,Narcissa Malfoy,0.997432
9,Elphias Doge,0.997288
20,Minerva McGonagall,0.995756
38,Xenophilius Lovegood,0.995285
21,Moaning Myrtle,0.994086
7,Diary,0.992174
2,Barty Crouch Jr.,0.992007
13,Gellert Grindelwald,0.991509
37,Tom Riddle,0.989882


In [53]:
#Identify the supporters of Lord Voldemort
enemies= []
for index,row in df_sentiment_avg.iterrows():
    if row["Bert_Score_Avg"]<0.85:
        enemies.append(row["Node_X"])

print(enemies)

['Bellatrix Lestrange', 'Cedric Diggory', 'Dobby', 'Filius Flitwick', 'George Weasley', 'Harry Potter', 'Hermione Granger', 'Horace Slughorn', 'Ron Weasley', 'Rubeus Hagrid', 'Severus Snape', 'Sirius Black', 'Stanley Shunpike']


### VADER results

In [54]:
#Similar to the computed mean score of BERT, the mean values are computed for the VADER results as well

df_Lsentiment_avg = df_dialog_voldemort.groupby('Node_X')['Sentiment Score'].mean()
df_Lsentiment_avg = df_Lsentiment_avg.reset_index(name = 'Sentiment_Score_Avg')

#Top 10 enemies of Voldemort from VADER
df_Lsentiment_avg.sort_values('Sentiment_Score_Avg')[:10]

Unnamed: 0,Node_X,Sentiment_Score_Avg
32,Ron Weasley,-0.436929
14,George Weasley,-0.296
33,Rubeus Hagrid,-0.293475
22,Mrs. Cole,-0.2802
15,Ginny Weasley,-0.2193
30,Quirinus Quirrell,-0.20115
5,Corban Yaxley,-0.148
23,Mykew Gregorovitch,-0.126
20,Minerva McGonagall,-0.1007
0,Albus Dumbledore,-0.092658


In [55]:
#Top 10 supporters of Voldemort from VADER

df_Lsentiment_avg.sort_values('Sentiment_Score_Avg',ascending=False)[:10]

Unnamed: 0,Node_X,Sentiment_Score_Avg
12,Garrick Ollivander,0.9644
8,Dobby,0.9456
25,Neville Longbottom,0.856
36,Stanley Shunpike,0.4939
21,Moaning Myrtle,0.4927
24,Narcissa Malfoy,0.4588
9,Elphias Doge,0.4019
38,Xenophilius Lovegood,0.2382
18,Horace Slughorn,0.201917
3,Bellatrix Lestrange,0.1874


In [56]:
#Identify the supporters of Lord Voldemort
supporters= []
for index,row in df_Lsentiment_avg.iterrows():
    if row["Sentiment_Score_Avg"]>0:
        supporters.append(row["Node_X"])

print(supporters)

['Arthur Weasley', 'Barty Crouch Jr.', 'Bellatrix Lestrange', 'Cedric Diggory', 'Cornelius Fudge', 'Dobby', 'Elphias Doge', 'Fred Weasley', 'Garrick Ollivander', 'Gellert Grindelwald', 'Horace Slughorn', 'Lucius Malfoy', 'Moaning Myrtle', 'Narcissa Malfoy', 'Neville Longbottom', 'Peter Pettigrew', 'Pius Thicknesse', 'Remus Lupin', 'Severus Snape', 'Stanley Shunpike', 'Xenophilius Lovegood']


In [57]:
#Identify the supporters of Lord Voldemort
enemies= []
for index,row in df_Lsentiment_avg.iterrows():
    if row["Sentiment_Score_Avg"]<0:
        enemies.append(row["Node_X"])

print(enemies)

['Albus Dumbledore', 'Corban Yaxley', 'George Weasley', 'Ginny Weasley', 'Harry Potter', 'Hermione Granger', 'Minerva McGonagall', 'Mrs. Cole', 'Mykew Gregorovitch', 'Quirinus Quirrell', 'Ron Weasley', 'Rubeus Hagrid', 'Sirius Black', 'Tom Riddle']
