In [55]:
import pandas as pd

The data-loading approach below is adapted from this Kaggle notebook:  
https://www.kaggle.com/shashankasubrahmanya/preprocessing-cornell-movie-dialogue-corpus

In [56]:
# Column names applied, in order, to the fields read from movie_lines.txt.
movie_lines_features = [
    "LineID",
    "Character",
    "Movie",
    "Name",
    "Line",
]


In [57]:
# Load the corpus into a DataFrame whose columns are named by
# `movie_lines_features`.
#
# - `sep` is a regular expression matching the literal field delimiter
#   "+++$+++". It is written as a raw string so that `\+` and `\$` reach the
#   regex engine unchanged instead of being parsed as (invalid) Python string
#   escapes, which emit a DeprecationWarning/SyntaxWarning.
# - A multi-character regex separator requires the Python parsing engine.
# - The pattern does not consume whitespace next to the delimiter, so fields
#   keep any surrounding spaces.
# - `index_col=False` stops pandas from using the first column as the index.
movie_lines = pd.read_csv(
    "data/movie_lines.txt",
    sep=r"\+\+\+\$\+\+\+",
    engine="python",
    encoding="ISO-8859-1",
    index_col=False,
    names=movie_lines_features,
)


In [58]:
# Preview the first rows of the freshly loaded frame.
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.


In [59]:
# Normalise two columns before any text processing:
#   * "LineID" - trim leading/trailing whitespace so the IDs can be used as keys.
#   * "Line"   - pass every value through str(), so any non-string value is
#                replaced by its string form.
line_ids = movie_lines["LineID"]
dialogue = movie_lines["Line"]
movie_lines["LineID"] = line_ids.map(str.strip)
movie_lines["Line"] = dialogue.map(str)

In [60]:
# Re-check the first rows after the "LineID" / "Line" cleanup.
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.


In [61]:
# Keep only the rows whose dialogue contains ' he ' or ' she '.
# The match is case-sensitive and requires a space on both sides of the
# pronoun, so only lower-case, space-delimited occurrences count.
pronoun_pattern = '|'.join([' he ', ' she '])
has_pronoun = movie_lines["Line"].str.contains(pronoun_pattern)
movie_lines = movie_lines[has_pronoun]

In [62]:
# Preview the rows that survived the pronoun filter.
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line
33,L406,u2,m0,CAMERON,So that's the kind of guy she likes? Pretty o...
36,L403,u2,m0,CAMERON,I'm workin' on it. But she doesn't seem to be...
57,L205,u0,m0,BIANCA,Unsolved mystery. She used to be really popu...
59,L203,u2,m0,CAMERON,Seems like she could get a date easy enough...
60,L202,u0,m0,BIANCA,"The thing is, Cameron -- I'm at the mercy of ..."


In [63]:
# Spot-check a single filtered row by position (position 15, i.e. the 16th row).
# Passing a list to .iloc returns a one-row DataFrame rather than a Series.
movie_lines.iloc[[15]]

Unnamed: 0,LineID,Character,Movie,Name,Line
228,L217,u7,m0,MICHAEL,No kidding. He's a criminal. I heard he lit...


In [64]:
# Set up NLTK for sentence segmentation and word tokenization.
# No rows are dropped in this cell; the pronoun filter is applied in an earlier cell.
import nltk
from nltk import sent_tokenize, word_tokenize
# Fetch the 'punkt' data package (presumably the models the tokenizers rely on - confirm
# against the installed NLTK version).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/michellelum/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [66]:
# Split each dialogue line into a list of sentences.
dialogue_lines = movie_lines["Line"]
movie_lines["Segmented_Line"] = dialogue_lines.map(sent_tokenize)

In [67]:
def tokenize_sentence_list(sentence_list):
    """Tokenize every sentence in ``sentence_list`` into words.

    Args:
        sentence_list: Iterable of sentence strings.

    Returns:
        A list with one entry per input sentence, each entry being the
        result of ``word_tokenize`` for that sentence, in input order.
    """
    return list(map(word_tokenize, sentence_list))

In [68]:
# Word-tokenize every sentence of every segmented line (list of token lists per row).
segmented_lines = movie_lines["Segmented_Line"]
movie_lines["Tokenized_Line"] = segmented_lines.map(tokenize_sentence_list)

In [69]:
# Preview the frame with the new "Segmented_Line" and "Tokenized_Line" columns.
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line,Tokenized_Line
33,L406,u2,m0,CAMERON,So that's the kind of guy she likes? Pretty o...,"[ So that's the kind of guy she likes?, Pretty...","[[So, that, 's, the, kind, of, guy, she, likes..."
36,L403,u2,m0,CAMERON,I'm workin' on it. But she doesn't seem to be...,"[ I'm workin' on it., But she doesn't seem to ...","[[I, 'm, workin, ', on, it, .], [But, she, doe..."
57,L205,u0,m0,BIANCA,Unsolved mystery. She used to be really popu...,"[ Unsolved mystery., She used to be really pop...","[[Unsolved, mystery, .], [She, used, to, be, r..."
59,L203,u2,m0,CAMERON,Seems like she could get a date easy enough...,[ Seems like she could get a date easy enough...],"[[Seems, like, she, could, get, a, date, easy,..."
60,L202,u0,m0,BIANCA,"The thing is, Cameron -- I'm at the mercy of ...","[ The thing is, Cameron -- I'm at the mercy of...","[[The, thing, is, ,, Cameron, --, I, 'm, at, t..."


In [74]:
# Report (rows, columns) of the filtered, tokenized frame.
movie_lines.shape

(13521, 7)