# Project 1

In [1]:
import pandas as pd
import string
import nltk
import re
import io 
import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize 
from nltk.tokenize import sent_tokenize
from nltk import pos_tag
from nltk import FreqDist
from wordcloud import WordCloud
from nltk.corpus import stopwords

In [2]:
# path = r"C:\Users\alice\Desktop\UNIBO\DATA SCIENCE\SECOND YEAR\TEXT MINING\Project1\Scripts TBBT.csv"
df = pd.read_csv("Scripts TBBT.csv", sep = ",", skipinitialspace=True, engine="python")
len(df)

54406

In [3]:
leonard_df = df[df['person_scene'] == 'Leonard'].copy()

## Question 1
On average, how many sentences and words does your character have to speak per
episode? Does this deviate across seasons?

In [5]:
# Number of sentences in each dialogue (row)
leonard_df['num_sentences'] = leonard_df['dialogue'].apply(lambda x: len(sent_tokenize(str(x))))

In [6]:
# Remove punctuation and lowercase function
def remove_punctuation(text):
    # Replace specific punctuation marks with spaces
    text = text.replace("\u2019", ' ') # Unicode for apostrophe
    text = text.replace("\u2013", ' ')  # Unicode for en dash
    text = text.replace("\u2014", ' ')  # Unicode for em dash
    text = text.replace("-", ' ')
    text = text.replace("\u2026", ' ')  # Unicode for ellipsis
    # Remove remaining punctuation marks
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    return text.lower()

# Remove punctuation and lowercase dialogues
leonard_df['dialogue_no_punct'] = leonard_df['dialogue'].apply(remove_punctuation)

# Number of words per dialogue (row)
leonard_df['num_words'] = leonard_df['dialogue_no_punct'].apply(lambda x: len(word_tokenize(str(x))))
leonard_df.head()

Unnamed: 0,episode_name,dialogue,person_scene,num_sentences,dialogue_no_punct,num_words
2,Series 01 Episode 01 – Pilot Episode,"Agreed, what’s your point?",Leonard,1,agreed what s your point,5
4,Series 01 Episode 01 – Pilot Episode,Excuse me?,Leonard,1,excuse me,2
6,Series 01 Episode 01 – Pilot Episode,"One across is Aegean, eight down is Nabakov, ...",Leonard,3,one across is aegean eight down is nabakov tw...,39
8,Series 01 Episode 01 – Pilot Episode,"Yes. Um, is this the High IQ sperm bank?",Leonard,2,yes um is this the high iq sperm bank,9
12,Series 01 Episode 01 – Pilot Episode,Thank-you. We’ll be right back.,Leonard,2,thank you we ll be right back,7


In [7]:
# Sentences per episode
sentences_per_episode = leonard_df.groupby('episode_name')['num_sentences'].sum().reset_index()

# Average sentences per episode
avg_sentences_per_episode = sentences_per_episode['num_sentences'].mean()

In [8]:
# Words per episode
words_per_episode = leonard_df.groupby('episode_name')['num_words'].sum().reset_index()

# Average words per episode
avg_words_per_episode = words_per_episode['num_words'].mean()

In [9]:
leonard_df['season_nr'] = leonard_df['episode_name'].str.extract(r'Series (\d+)')
leonard_df['season_nr'] = leonard_df['season_nr'].astype(float)

sentences_per_season_episode = leonard_df.groupby(['season_nr', 'episode_name'])['num_sentences'].sum().reset_index()
words_per_season_episode = leonard_df.groupby(['season_nr', 'episode_name'])['num_words'].sum().reset_index()

# Average sentences per season-episode
avg_sentences_per_season = sentences_per_season_episode.groupby('season_nr')['num_sentences'].mean().reset_index()

# Average words per season-episode
avg_words_per_season = words_per_season_episode.groupby('season_nr')['num_words'].mean().reset_index()

In [10]:
print("Average Leonard's sentences per episode:", round(avg_sentences_per_episode))
print("Average Leonard's sentences per season-episode:", round(avg_sentences_per_season))
print("Average Leonard's words per episode:", round(avg_words_per_episode))
print("Average Leonard's words per season-episode:", round(avg_words_per_season))

Average Leonard's sentences per episode: 62
Average Leonard's sentences per season-episode:    season_nr  num_sentences
0        1.0           92.0
1        2.0           79.0
2        3.0           76.0
3        4.0           67.0
4        5.0           61.0
5        6.0           53.0
6        7.0           54.0
7        8.0           54.0
8        9.0           50.0
9       10.0           44.0
Average Leonard's words per episode: 436
Average Leonard's words per season-episode:    season_nr  num_words
0        1.0      713.0
1        2.0      543.0
2        3.0      509.0
3        4.0      422.0
4        5.0      420.0
5        6.0      367.0
6        7.0      389.0
7        8.0      396.0
8        9.0      352.0
9       10.0      333.0


## Question 2
Globally, over all episodes within the first 10 seasons, how many times does your
character mention nouns, and person names? Make a Wordcloud of this tag/entity to
have a clear visualization which nouns/person names are mostly used by your character.

In [11]:
# Tokenize 
leonard_df['tokens'] = leonard_df['dialogue_no_punct'].apply(word_tokenize)

In [12]:
# Remove stopwords
stop_words = set(stopwords.words('english')) 
 
def remove_stopwords(tokens):
    return [word for word in tokens if word.lower() not in stop_words]

leonard_df['tokens_no_stop'] = leonard_df['tokens'].apply(remove_stopwords) 
leonard_df['dialogue_no_stop'] = leonard_df['tokens_no_stop'].apply(lambda tokens: ' '.join(tokens))

In [13]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [14]:
def extract_entities(tokens):
    entities = {'nouns': [], 'persons': []}
    doc = nlp(" ".join(tokens))
    for token in doc:
        if token.pos_ == 'NOUN' and token.ent_type_ != 'PERSON':
            entities['nouns'].append(token.lemma_)
        elif token.ent_type_ == 'PERSON':
            entities['persons'].append(token.text)
    return entities

leonard_df['entities'] = leonard_df['tokens_no_stop'].apply(extract_entities)

In [15]:
# Flatten the lists of nouns and persons
all_nouns = [noun for sublist in leonard_df['entities'].apply(lambda x: x['nouns']) for noun in sublist]
all_persons = [person for sublist in leonard_df['entities'].apply(lambda x: x['persons']) for person in sublist]

This one is not perfect but works definitely better.  
I manually remove some wrongly classified words.

In [16]:
not_nouns = ["sheldon", "penny", "psst", "co", "lot", "talk", "call", "know"]
all_nouns = [noun for noun in all_nouns if noun not in not_nouns]
not_persons = ["ho", "yo", "mm", "come", "want", "little", "need", "relax", "reason", "long", "make", "hey", "tell", "talking", "knoks", "hang", "sorry", 
               "listen", "hmm", "cousin", "sister", "cool", "bisexual", "hell", "kinda", "rider", "head", "bag", "mmm", "hee", "talk", "cinnamon", "night",
              "right", "bad", "fine", "lesbian", "know", "moo", "boy", "huh", "time", "nye", "na", "buh", "terrific", "heard"]
all_persons = [noun for noun in all_persons if noun not in not_persons]

In [17]:
# Create a frequency distribution of nouns and persons
noun_freq = FreqDist(all_nouns)
person_freq = FreqDist(all_persons)

# Total counts
total_noun_mentions = sum(noun_freq.values())
total_person_mentions = sum(person_freq.values())

print(f'Total Noun Mentions: {total_noun_mentions}')
print(f'Total Person Mentions: {total_person_mentions}')
print(f'Total Unique Noun Mentions: {len(set(all_nouns))}')
print(f'Total Unique Person Mentions: {len(set(all_persons))}')

Total Noun Mentions: 12706
Total Person Mentions: 872
Total Unique Noun Mentions: 3134
Total Unique Person Mentions: 408


We have to consider that there are still errors in how the words were tagged, but from the obtained results Leonard mentioned 3134 unique nouns for a total of 12706 times and 408 unique person names for a total of 872 times.

In [18]:
# Generate Wordclouds
noun_wordcloud = WordCloud(width=800, height=400, background_color='white', font_path="./Sabandija-font-ffp.ttf", random_state=12).generate_from_frequencies(noun_freq)
person_wordcloud = WordCloud(width=800, height=400, background_color='white', font_path="./Sabandija-font-ffp.ttf", random_state=12).generate_from_frequencies(person_freq)

# Plot the Wordclouds
plt.figure(figsize=(15, 10))
plt.subplot(1, 2, 1)
plt.imshow(noun_wordcloud, interpolation='bilinear')
plt.title('Noun Wordcloud')
plt.axis('off')

plt.subplot(1, 2, 2)
plt.imshow(person_wordcloud, interpolation='bilinear')
plt.title('Person Name Wordcloud')
plt.axis('off')

plt.show()

OSError: cannot open resource

First thing to notice is that there are some errors in both groups, mostly among the person names (eg. "need" is classified as a noun and "vacuum" is classified as a person name).  
The mostly used nouns are 'time', 'guy', 'thing', 'night' and 'way'. The mostly used person names are 'sheldon', 'howard', 'bernadette', 'cooper' and 'leonard'.

## Question 3
What are the most important words mentioned by your character? Do this analysis per episode, per season and overall over the first 10 seasons. To achieve this task, please first make a bag-of-words and/or use the TF-IDF statistical principle. Remark: You can try to make a Wordcloud for visualization, based on the given bag-of-words.

In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Combine tokens_no_stop into a single string for each row
leonard_df['text_combined'] = leonard_df['tokens_no_stop'].apply(lambda x: ' '.join(x))

# Bag-of-Words representation
count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(leonard_df['dialogue_no_stop'])

# TF-IDF representation
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(leonard_df['dialogue_no_stop'])

# Convert the matrices into DataFrames
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=count_vectorizer.get_feature_names_out())
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Combine the DataFrames with the original DataFrame
leonard_df = pd.concat([leonard_df, bow_df, tfidf_df], axis=1)

In [21]:
# Group by episode and season
episode_bow = leonard_df.groupby('episode_name')[count_vectorizer.get_feature_names_out()].sum()
episode_tfidf = leonard_df.groupby('episode_name')[tfidf_vectorizer.get_feature_names_out()].sum()

season_bow = leonard_df.groupby('season_nr')[count_vectorizer.get_feature_names_out()].sum()
season_tfidf = leonard_df.groupby('season_nr')[tfidf_vectorizer.get_feature_names_out()].sum()

# Overall analysis
overall_bow = bow_df.sum()
overall_tfidf = tfidf_df.sum()

In [22]:
def extract_top_words_per_row(matrix, top_n=5):
    # Get the top N words for each row
    top_words_per_row = matrix.apply(lambda row: row.nlargest(top_n).index.tolist(), axis=1)
    return top_words_per_row

# Extract top words for each row
top_words_per_row_season_bow = extract_top_words_per_row(season_bow)
top_words_per_row_season_tfidf = extract_top_words_per_row(season_tfidf)
top_words_per_row_episode_bow = extract_top_words_per_row(episode_bow)
top_words_per_row_episode_tfidf = extract_top_words_per_row(episode_tfidf)

# Display the top words for each row
print("Top 5 words per season (BoW):")
print(top_words_per_row_season_bow)

print("\nTop 5 words per season (TF-IDF):")
print(top_words_per_row_season_tfidf)

print("\nTop 5 words per episode (BoW):")
print(top_words_per_row_episode_bow)

print("\nTop 5 words per episode (TF-IDF):")
print(top_words_per_row_episode_tfidf)

Top 5 words per season (BoW):
season_nr
1.0        [sheldon, oh, know, yeah, penny]
2.0           [oh, know, sheldon, hey, get]
3.0     [000318914, 000318914, 10, 10, 100]
4.0     [000318914, 000318914, 10, 10, 100]
5.0     [000318914, 000318914, 10, 10, 100]
6.0     [000318914, 000318914, 10, 10, 100]
7.0     [000318914, 000318914, 10, 10, 100]
8.0     [000318914, 000318914, 10, 10, 100]
9.0     [000318914, 000318914, 10, 10, 100]
10.0    [000318914, 000318914, 10, 10, 100]
dtype: object

Top 5 words per season (TF-IDF):
season_nr
1.0        [sheldon, oh, know, yeah, penny]
2.0           [oh, know, sheldon, hey, get]
3.0     [000318914, 000318914, 10, 10, 100]
4.0     [000318914, 000318914, 10, 10, 100]
5.0     [000318914, 000318914, 10, 10, 100]
6.0     [000318914, 000318914, 10, 10, 100]
7.0     [000318914, 000318914, 10, 10, 100]
8.0     [000318914, 000318914, 10, 10, 100]
9.0     [000318914, 000318914, 10, 10, 100]
10.0    [000318914, 000318914, 10, 10, 100]
dtype: object

Top 5 w