In [84]:
#import dataset
import pandas as pd
path = 'Scripts TBBT.csv'
df = pd.read_csv(path,sep=',',skipinitialspace=True,engine='python')

In [85]:
# Remove every punctuation from the dataframe
import string

# Function to remove punctuations
def remove_punctuations(text):

    # Check if the value is a string, then remove punctuations
    if isinstance(text, str):
        return text.translate(str.maketrans('', '', string.punctuation))
    else:
        return text

# Removing punctuations from string columns
for column in df.columns:
    df[column] = df[column].apply(remove_punctuations)

In [86]:
# Filter for person_scene that contains Leonard
df = df[df['person_scene'].str.contains('Leonard')]

In [87]:
# Lowercase the whole dataframe
df = df.apply(lambda x: x.astype(str).str.lower())

In [88]:
# Remove stopwords with NLTK for column dialogue
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')
df['dialogue'] = df['dialogue'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\camd1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [89]:
# Use the Porter stemmer to perform stemming on the used words in dialogue
from nltk.stem import PorterStemmer
porter = PorterStemmer()
df['dialogue'] = df['dialogue'].apply(lambda x: ' '.join([porter.stem(word) for word in x.split()]))

In [90]:
# Perform POS tagging & NER tagging on the dialogue column with Spacy
import spacy
nlp = spacy.load('en_core_web_sm')
# Function to perform POS tagging and NER tagging
def pos_ner_tagging(text):
    doc = nlp(text)
    pos_tags = [(token.text, token.pos_) for token in doc]
    ner_tags = [(entity.text, entity.label_) for entity in doc.ents]
    return pos_tags, ner_tags

# Apply POS and NER tagging to the 'dialogue' column
df[['POS_tags', 'NER_tags']] = df['dialogue'].apply(lambda x: pd.Series(pos_ner_tagging(x)))

In [91]:
# Print random 20 rows of the dataframe
df.sample(10)

Unnamed: 0,episode_name,dialogue,person_scene,POS_tags,NER_tags
50720,series 10 episode 09 – the geology elevation,well chang plaqu mixedrac coupl grandma,leonard,"[(well, INTJ), (chang, PROPN), (plaqu, PROPN),...","[(chang, PERSON), (mixedrac coupl, PERSON)]"
23993,series 05 episode 14 – the beta test initiation,thank,leonard,"[(thank, VERB)]",[]
3631,series 01 episode 15 – the porkchop indeterminacy,two know,leonard,"[(two, NUM), (know, VERB)]","[(two, CARDINAL)]"
33018,series 07 episode 08 – the itchy brain simulation,look there’ there’ there’ ticket,leonard,"[(look, VERB), (there, ADV), (’, PUNCT), (ther...",[]
12347,series 03 episode 11 – the maternal congruence,it’,leonard,"[(it, PRON), (’, PUNCT)]",[]
44050,series 09 episode 06 – the helium insufficiency,let’ start experi,leonard,"[(let, VERB), (’, PUNCT), (start, VERB), (expe...",[]
43955,series 09 episode 06 – the helium insufficiency,sure,leonard,"[(sure, ADV)]",[]
3384,series 01 episode 14 – the nerdvana annihilation,come sniper,leonard,"[(come, VERB), (sniper, NOUN)]",[]
51985,series 10 episode 15 – the locomotion reverber...,good cut edg new technolog still make inapprop...,leonard,"[(good, ADJ), (cut, VERB), (edg, NOUN), (new, ...",[]
31883,series 07 episode 03 – the scavenger vortex,great,leonard,"[(great, ADJ)]",[]
