In [1]:
import pandas as pd
import string
import matplotlib.pyplot as plt
import nltk
import spacy
import torch
import re
import numpy as np

from nltk.tokenize import word_tokenize 
from nltk.tokenize import sent_tokenize
from nltk import pos_tag
from nltk import FreqDist
from wordcloud import WordCloud
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
from transformers import pipeline

# nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
df = pd.read_csv("Scripts TBBT.csv", sep = ",", skipinitialspace=True, engine="python")
leonard_df = df[df['person_scene'] == 'Leonard'].copy()

# Extracting season and episode numbers using regular expressions
leonard_df[['season', 'episode']] = leonard_df['episode_name'].str.extract(r'Series (\d+) Episode (\d+)')

## Question 1 - done
On average, how many sentences and words does your character have to speak per
episode? Does this deviate across seasons?

Note that sent_tokenize() also counts the sentences inside parenthesis, which is technically not a sentence spoken by the character.

In [None]:
# Number of sentences in each dialogue (row)
leonard_df['num_sentences'] = leonard_df['dialogue'].apply(lambda x: len(sent_tokenize(str(x))))

Change this section: tokenize before punctuation remove

In [None]:
# Remove punctuation and lowercase function
def remove_punctuation(text):
    # Replace specific punctuation marks with spaces
    text = text.replace("\u2019", ' ') # Unicode for apostrophe
    text = text.replace("\u2013", ' ')  # Unicode for en dash
    text = text.replace("\u2014", ' ')  # Unicode for em dash
    text = text.replace("-", ' ')
    text = text.replace("\u2026", ' ')  # Unicode for ellipsis
    # Remove remaining punctuation marks
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    return text.lower()

# Remove punctuation and lowercase dialogues
leonard_df['dialogue_no_punct'] = leonard_df['dialogue'].apply(remove_punctuation)

# Number of words per dialogue (row)
leonard_df['num_words'] = leonard_df['dialogue_no_punct'].apply(lambda x: len(word_tokenize(str(x))))

In [None]:
# Sentences per episode
sentences_per_episode = leonard_df.groupby('episode_name')['num_sentences'].sum().reset_index()

# Average sentences per episode
avg_sentences_per_episode = sentences_per_episode['num_sentences'].mean()

In [None]:
# Words per episode
words_per_episode = leonard_df.groupby('episode_name')['num_words'].sum().reset_index()

# Average words per episode
avg_words_per_episode = words_per_episode['num_words'].mean()

In [None]:
leonard_df['season_nr'] = leonard_df['episode_name'].str.extract(r'Series (\d+)')
leonard_df['season_nr'] = leonard_df['season_nr'].astype(float)

sentences_per_season_episode = leonard_df.groupby(['season_nr', 'episode_name'])['num_sentences'].sum().reset_index()
words_per_season_episode = leonard_df.groupby(['season_nr', 'episode_name'])['num_words'].sum().reset_index()

# Average sentences per season-episode
avg_sentences_per_season = sentences_per_season_episode.groupby('season_nr')['num_sentences'].mean().reset_index()

# Average words per season-episode
avg_words_per_season = words_per_season_episode.groupby('season_nr')['num_words'].mean().reset_index()

In [None]:
print("Average Leonard's sentences per episode:", round(avg_sentences_per_episode))
print("Average Leonard's sentences per season-episode:", round(avg_sentences_per_season))
print("Average Leonard's words per episode:", round(avg_words_per_episode))
print("Average Leonard's words per season-episode:", round(avg_words_per_season))

## Question 2
Globally, over all episodes within the first 10 seasons, how many times does your character mention nouns, and person names? Make a Wordcloud of this tag/entity to have a clear visualization which nouns/person names are mostly used by your character.

Very nice results

In [93]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

def extract_persons(tokens):
    persons = []
    ner_persons = nlp(tokens)
    for result in ner_persons:
        if result['entity'] == 'B-PER':
            persons.append(result['word'])
    return persons

leonard_df['persons'] = leonard_df['dialogue'].apply(extract_persons)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [104]:
leonard_df['persons'].sample(5)

54080                                []
49809                                []
41981    [J, ##ab, ##ba, J, ##ab, ##ba]
13113             [Benedict, Ju, ##das]
33721                                []
Name: persons, dtype: object

Weird result for person

In [88]:
tokenizer = AutoTokenizer.from_pretrained("QCRI/bert-base-multilingual-cased-pos-english")
model = AutoModelForTokenClassification.from_pretrained("QCRI/bert-base-multilingual-cased-pos-english")
nlp = TokenClassificationPipeline(model=model, tokenizer=tokenizer)

def pos_tag(tokens):
    entities = {'nouns': [], 'persons': []}
    results = nlp(tokens)
    for result in results:
        if result['entity'] == 'NN':
            entities['nouns'].append(result['word'])
        elif result['entity'] == 'NNP':
            entities['persons'].append(result['word'])
    return entities

leonard_df['entities'] = leonard_df['dialogue'].apply(pos_tag)

Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [102]:
leonard_df['entities'].sample(5)

42667    []
32749    []
36145    []
3659     []
20975    []
Name: persons, dtype: object

In [None]:
# Create a frequency distribution of nouns and persons
noun_freq = FreqDist(leonard_df['nouns'])
person_freq = FreqDist(leonard_df['persons'])

# Total counts
total_noun_mentions = sum(noun_freq.values())
total_person_mentions = sum(person_freq.values())

print(f'Total Noun Mentions: {total_noun_mentions}')
print(f'Total Person Mentions: {total_person_mentions}')
print(f'Total Unique Noun Mentions: {len(set(nouns))}')
print(f'Total Unique Person Mentions: {len(set(persons))}')

## Question 3
What are the most important words mentioned by your character? Do this analysis per episode, per season and overall over the first 10 seasons. To achieve this task, please first make a bag-of-words and/or use the TF-IDF statistical principle. Remark: You can try to make a Wordcloud for visualization, based on the given bag-of-words.

In [5]:
# Remove '…' and '-'
leonard_df['dialogue'] = leonard_df['dialogue'].str.replace('…', '')
leonard_df['dialogue'] = leonard_df['dialogue'].str.replace('-', '')
# Tokenize and lowercase dialogues
leonard_df['tokenized'] = leonard_df['dialogue'].apply(lambda x: word_tokenize(str(x).lower()))

In [6]:
# Function to remove punctuation from tokenized data
def remove_punctuation(tokens):
    return [token for token in tokens if token not in string.punctuation and token != '’']

# Removing punctuation from tokenized data
leonard_df['no_punctuation'] = leonard_df['tokenized'].apply(remove_punctuation)

In [8]:
# Create sorted bag of words
def create_bow(corpus):    
    bow = set()
    for doc in corpus:
        bow = bow.union(set(doc))
    return sorted(bow)

# Find 10 most important words using TF-IDF
def imp_tf_idf(corpus, bow):
    # Calculate term frequency (TF)
    n_docs = len(corpus)         # Number of documents in the corpus
    n_words_set = len(bow)       # Number of unique words in the corpus

    # Initialize TF matrix
    tf_matrix = np.zeros((n_docs, n_words_set))

    for i in range(n_docs):
        n_words = len(corpus[i])    # Number of words in the document
        for w in corpus[i]:
            word_index = bow.index(w)
            tf_matrix[i][word_index] += 1 / n_words
        
    # Calculate Inverse Document Frequency (IDF)
    idf = np.zeros(n_words_set)
    for idx, w in enumerate(bow):
        k = sum(1 for doc in corpus if w in doc)
        idf[idx] = np.log10(n_docs / k) if k != 0 else 0

    # Calculate TF-IDF matrix
    tf_idf_matrix = tf_matrix * idf

    # Find most important words in the corpus
    flat_tf_idf = tf_idf_matrix.flatten()
    sorted_indices = np.argsort(flat_tf_idf)
    highest_indices = sorted_indices[-10:][::-1]
    highest_values = [bow[i // n_docs] for i in highest_indices]
    return highest_values


Create bag-of-word and apply TF-IDF to determine most important word
1) per episode:
    documents = dialogues
    corpus = episode
2) per season:
    documents = dialogues
    corpus = season
3) overall over 10 seasons
    documents = dialogues
    corpus = whole 10 seasons

In [9]:
# Per episode: documents = dialogues; corpus = episode
result1 = []
for season in leonard_df['season'].unique():
    for episode in leonard_df[leonard_df['season'] == season]['episode'].unique():
        corpus = []
        for row in leonard_df[(leonard_df['season'] == season) & (leonard_df['episode'] == episode)].index:
            corpus.append(leonard_df['no_punctuation'][row])

        # Add words to result
        important_words = imp_tf_idf(corpus, create_bow(corpus))
        result1.append({'season': season, 'episode': episode, 'words': important_words})

result1 = pd.DataFrame(result1)
print(result1)

    season episode                                              words
0       01      01  [so, rest, don, good, behaves, based, colon, d...
1       01      02  [only, but, things, loud, leaves, talk, lets, ...
2       01      03  [you, prove, good, sub, right, quite, didn, wo...
3       01      04  [shake, give, apartment, day, slices, look, lo...
4       01      05  [weren, grow, or, research, hey, no, this, be,...
..     ...     ...                                                ...
226     10      20  [is, to, be, understood, be, pretend, plan, wh...
227     10      21  [of, go, have, t, that, until, any, might, reb...
228     10      22  [t, out, hi, and, doesn, still, thing, child, ...
229     10      23  [opportunity, doing, could, on, on, all, with,...
230     10      24  [follow, pool, ask, at, yeah, yeah, right, how...

[231 rows x 3 columns]


In [81]:
# per season: documents = dialogues; corpus = season
result2 = []
for season in leonard_df['season'].unique():
    corpus = []
    for row in leonard_df[(leonard_df['season'] == season)].index:
        corpus.append(leonard_df['no_punctuation'][row])
        
    # Add words to result
    result2.append({'season': season, 'words': imp_tf_idf(corpus, create_bow(corpus))})

result2 = pd.DataFrame(result2)
print(result2)

  season                                              words
0     01  [drink, off, stands, push, spare, cover, drive...
1     02  [racist, eye, come, dirty, miracle, sit, throu...
2     03  [discarded, standard, at, airplane, announce, ...
3     04  [quiz, fit, today, stuck, buddies, trying, yam...
4     05  [sweaty, leaving, rash, saturday, that, game, ...
5     06  [surprises, chided, no, dog, okay, saying, bec...
6     07  [how, seriously, inappropriate, audition, hone...
7     08  [situation, artistic, ladies, moth, grounds, b...
8     09  [ha, counsellor, try, calm, you, foreever, pot...
9     10  [tried, brought, snapchat, things, sorry, met,...


In [83]:
# whole: documents = dialogues; corpus = whole
corpus = []
for row in leonard_df.index:
    corpus.append(leonard_df['no_punctuation'][row])

# Add words to result
result3 = imp_tf_idf(corpus, create_bow(corpus))

result3 = pd.DataFrame(result3, columns=['words'])
print(result3)

          words
0      curtains
1          exam
2       haircut
3   introducing
4    insightful
5  dissertation
6         birth
7         world
8      nibbling
9     confident


## Question 4
Examine the co-occurence of words for your character by using the Positive Pointwise Mutual Information measurement. Which words are commonly used together in his/her dialogues? Remark: You can try to make a Word-Word co-occurence matrix.

In [10]:
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import defaultdict
from itertools import combinations
from math import log2

dialogues = leonard_df['dialogue'].tolist()

# Step 1: Tokenize by sentence
tokenized_sentences = [sent_tokenize(dialogue) for dialogue in dialogues]

# Step 2: Tokenize by word, add <S> and <E>
tokenized_words = []
for sentences in tokenized_sentences:
    dialogue_tokens = []
    for sentence in sentences:
        words = ['<S>'] + word_tokenize(sentence) + ['<E>']
        dialogue_tokens.extend(words)
    tokenized_words.append(dialogue_tokens)

# Step 3: Create Word-Word co-occurrence matrix
word_count = defaultdict(int)
co_occurrence_matrix = defaultdict(lambda: defaultdict(int))

for dialogue_tokens in tokenized_words:
    for i in range(len(dialogue_tokens)):
        word_count[dialogue_tokens[i]] += 1
        if i < len(dialogue_tokens) - 1:
            co_occurrence_matrix[dialogue_tokens[i]][dialogue_tokens[i + 1]] += 1

# Step 4: Calculate Pointwise Mutual Information (PMI) for word pairs
total_words = sum(word_count.values())
pmi_scores = defaultdict(float)

for word1, word2_counts in co_occurrence_matrix.items():
    for word2, count in word2_counts.items():
        co_occurrence = count
        word1_count = word_count[word1]
        word2_count = word_count[word2]
        
        pmi = log2((co_occurrence / total_words) / ((word1_count / total_words) * (word2_count / total_words)))
        if pmi > 0:
            word_pair = f"{word1}-{word2}"
            pmi_scores[word_pair] = pmi

# Get top 10 word pairs based on PMI
top_10_pmi = sorted(pmi_scores.items(), key=lambda x: x[1], reverse=True)[:10]

print("Top 10 word pairs with Positive Pointwise Mutual Information (PMI):")
for pair, pmi_score in top_10_pmi:
    print(f"{pair}: PMI = {pmi_score:.4f}")


Top 10 word pairs with Positive Pointwise Mutual Information (PMI):
Papa-Doc: PMI = 17.2665
fractional-T1: PMI = 17.2665
T1-bandwidth: PMI = 17.2665
heterosexual-bedrooms: PMI = 17.2665
However-briefly: PMI = 17.2665
biological-impossibility: PMI = 17.2665
fibre-content: PMI = 17.2665
atomic-bomb: PMI = 17.2665
overflow-reservoir: PMI = 17.2665
crossbar-H: PMI = 17.2665
