Character: Leonard

In [1]:
#import dataset
import pandas as pd
path = 'Scripts TBBT.csv'
df = pd.read_csv(path,sep=',',skipinitialspace=True,engine='python')

In [2]:
# Remove every punctuation from the dataframe
import string

# Function to remove punctuations
def remove_punctuations(text):

    # Check if the value is a string, then remove punctuations
    if isinstance(text, str):
        return text.translate(str.maketrans('', '', string.punctuation))
    else:
        return text

# Removing punctuations from string columns
for column in df.columns:
    df[column] = df[column].apply(remove_punctuations)

In [3]:
# Filter for person_scene that contains Leonard
df = df[df['person_scene'].str.contains('Leonard')]

In [4]:
# Lowercase the whole dataframe
df = df.apply(lambda x: x.astype(str).str.lower())

In [5]:
# Remove stopwords with NLTK for column dialogue
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')
df['dialogue'] = df['dialogue'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\camd1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Use the Porter stemmer to perform stemming on the used words in dialogue
from nltk.stem import PorterStemmer
porter = PorterStemmer()
df['stem'] = df['dialogue'].apply(lambda x: ' '.join([porter.stem(word) for word in x.split()]))

In [7]:
# Perform POS tagging & NER tagging on the dialogue column with Spacy
import spacy
nlp = spacy.load('en_core_web_sm')
# Function to perform POS tagging and NER tagging
def pos_ner_tagging(text):
    doc = nlp(text)
    pos_tags = [(token.text, token.pos_) for token in doc]
    ner_tags = [(entity.text, entity.label_) for entity in doc.ents]
    return pos_tags, ner_tags

# Apply POS and NER tagging to the 'dialogue' column
df[['POS_tags', 'NER_tags']] = df['dialogue'].apply(lambda x: pd.Series(pos_ner_tagging(x)))

In [8]:
# Print random rows of the dataframe
df.sample(10)

Unnamed: 0,episode_name,dialogue,person_scene
8190,Series 02 Episode 16 – The Cushion Saturation,Where are you going,Leonard
8025,Series 02 Episode 16 – The Cushion Saturation,Okay that should do it,Leonard
15849,Series 04 Episode 03 – The Zazzy Substitution,Surprise,Leonard
30453,Series 06 Episode 20 – The Tenure Turbulence,Oh that’s nice but itit’s just gonna be a roo...,Leonard
48151,Series 09 Episode 23 – The Line Substitution S...,So when the aliens brought you back they just...,Leonard
6497,Series 02 Episode 09 – The White Asparagus Tri...,Excuse me What the hell is wrong with you,Leonard
19060,Series 04 Episode 17 – The Toast Derivation,Can’t we make a onetime exception for tonight,Leonard
39529,Series 08 Episode 11 – The Clean Room Infiltra...,No admit that you contaminated the clean room,Leonard
4234,Series 01 Episode 17 – The Tangerine Factor,Yeah That’s the spirit,Leonard
25558,Series 05 Episode 22 – The Stag Convergence,Are you drinking whisky,Leonard
