In [1]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import csv
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.metrics import BigramAssocMeasures
import string
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
import re
from scipy import stats
from nltk.sentiment import SentimentIntensityAnalyzer
import ast
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import math
from scipy.stats import shapiro, mannwhitneyu
import matplotlib.dates as mdates

%matplotlib inline

nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [8]:
dimensions = ['care', 'fairness', 'loyalty', 'authority', 'sanctity']
polarities = ['virtue', 'vice']

# dictionaries
dictionary_paths = {
    'mft': '/workspaces/debates_analysis/dictionaries/processed/mft_dictionary.json',
    'mfd2': '/workspaces/debates_analysis/dictionaries/processed/mfd2_dictionary.json',
    'mfd1': '/workspaces/debates_analysis/dictionaries/processed/mfd1_dictionary.json',
    'emfd': '/workspaces/debates_analysis/dictionaries/processed/emfd_dictionary.json',
    'ms': '/workspaces/debates_analysis/dictionaries/processed/ms_dictionary.json'
}


loaded_dictionaries = {}
for key, path in dictionary_paths.items():
    with open(path, 'r') as file:
        loaded_dictionaries[key] = json.load(file)

In [16]:
df = pd.read_csv('/workspaces/debates_analysis/debate_transcripts.csv', index_col=0)

In [17]:
def preprocess_text(text):

    pattern = r"\[crosstalk \d{2}:\d{2}:\d{2}\]"
    cleaned_text = re.sub(pattern, "", text)

    # Tokenize the text into words
    tokens = word_tokenize(cleaned_text)

    # Normalize case (except for proper nouns)
    tagged_tokens = pos_tag(tokens)
    tokens = [word.lower() if tag != 'NNP' and tag != 'NNPS' else word for word, tag in tagged_tokens]

    # Remove stopwords
    stop_words_dic = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words_dic]

    # Remove punctuation and numbers, keeping only alphabetic tokens
    tokens = [word for word in tokens if word.isalpha()]

    # Lemmatize tokens using the appropriate WordNet tag
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w) for w in tokens]

    return tokens

In [18]:
df['preprocessed_text'] = df['text'].apply(lambda x: preprocess_text(x))

In [19]:
def score(tokens, dictionary):
    # Initialize scores
    scores = {dimension: {polarity: 0 for polarity in dictionary[dimension]} for dimension in dictionary}
    tracked = {dimension: {polarity: [] for polarity in dictionary[dimension]} for dimension in dictionary}

    # Iterate through each token and update scores directly based on occurrence
    for token in tokens:
        for dimension, polarities in dictionary.items():
            for polarity, entries in polarities.items():
                # take only strings (words) in tuples
                words = [entry if isinstance(entry, str) else entry[0] for entry in entries]

                if token in words:
                    # Increment score by 1 for each occurrence
                    scores[dimension][polarity] += 1
                    # Track the token if not already tracked
                    if token not in tracked[dimension][polarity]:
                        tracked[dimension][polarity].append(token)

    # Flatten scores to a vector
    vector = []
    for dimension in scores.keys():
        for polarity in scores[dimension].keys():
            vector.append(scores[dimension][polarity])

    return vector

In [20]:
for key, dictionary in loaded_dictionaries.items():
    df[key] = df['preprocessed_text'].apply(lambda x: score(x, dictionary))

In [21]:
df.head()

Unnamed: 0,speaker,time,text,debate,preprocessed_text,mft,mfd2,mfd1,emfd,ms
0,Moderator,00:01:20,Good evening from the Health Education Campus ...,1,"[good, evening, Health, Education, Campus, Cas...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 2, 2, 0, 0]","[4, 0, 4, 0, 4, 0, 4, 0, 4, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,Moderator,00:02:10,This debate is being conducted under health an...,1,"[debate, conducted, health, safety, protocol, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 0, 0, 1, 0, 0, 0, 3, 0, 0]","[5, 1, 5, 1, 5, 1, 5, 1, 5, 1]","[1, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
2,Joe Biden,00:02:49,"How you doing, man?",1,[man],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,Donald Trump,00:02:51,How are you doing?,1,[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,Joe Biden,00:02:51,I’m well.,1,[well],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [22]:
df.shape

(1301, 10)