In [76]:
import pandas as pd

The data-loading code below is adapted from this Kaggle notebook:  
https://www.kaggle.com/shashankasubrahmanya/preprocessing-cornell-movie-dialogue-corpus

In [77]:
# Column labels handed to pd.read_csv (via `names=`) when loading movie_lines.txt.
movie_lines_features = [
    "LineID",
    "Character",
    "Movie",
    "Name",
    "Line",
]


In [78]:
# Load the raw dialogue file. Fields are separated by the literal text "+++$+++".
# The separator is a regular expression (which is why engine="python" is used),
# so it is written as a raw string: in a normal string literal "\+" and "\$" are
# invalid escape sequences that newer Python versions warn about. The pattern
# value itself is unchanged.
movie_lines = pd.read_csv(
    "data/movie_lines.txt",
    sep=r"\+\+\+\$\+\+\+",
    engine="python",
    encoding="ISO-8859-1",
    index_col=False,
    names=movie_lines_features,
)


In [79]:
# Continue with a 1% random sample of the lines (presumably to keep the
# row-by-row steps further down fast -- TODO confirm).
# A fixed random_state makes the sample, and therefore every downstream
# output, reproducible after a kernel restart.
movie_lines = movie_lines.sample(frac=0.01, random_state=42)

In [80]:
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line
53166,L328484,u1558,m104,BILL,"This is Louisiana, chief. How the hell do yo..."
78970,L422310,u2355,m151,ELLIS,How come're you doin that?
76880,L414781,u2298,m148,MR LANTZ,Krueger.
267202,L549501,u7883,m533,JACKIE,They came out of my <u>body</u>!
154291,L128194,u4460,m295,MASON,...though.


In [81]:
# Tidy the text columns before further processing:
#   * "LineID": remove leading/trailing whitespace
#   * "Line":   coerce every value to str, then lower-case it
movie_lines["LineID"] = movie_lines["LineID"].map(str.strip)
movie_lines["Line"] = movie_lines["Line"].map(lambda raw_text: str(raw_text).lower())

In [82]:
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line
53166,L328484,u1558,m104,BILL,"this is louisiana, chief. how the hell do yo..."
78970,L422310,u2355,m151,ELLIS,how come're you doin that?
76880,L414781,u2298,m148,MR LANTZ,krueger.
267202,L549501,u7883,m533,JACKIE,they came out of my <u>body</u>!
154291,L128194,u4460,m295,MASON,...though.


In [83]:
# Disabled: substring-based pronoun filter, kept for reference only.
# It would keep a line only when ' he ' or ' she ' occurs with a space on both
# sides, so a pronoun at the very start or end of a line, or one directly
# followed by punctuation or a contraction (e.g. "he's"), would not match.
# movie_lines = movie_lines[movie_lines["Line"].str.contains('|'.join([' he ', ' she ']))]

In [84]:
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line
53166,L328484,u1558,m104,BILL,"this is louisiana, chief. how the hell do yo..."
78970,L422310,u2355,m151,ELLIS,how come're you doin that?
76880,L414781,u2298,m148,MR LANTZ,krueger.
267202,L549501,u7883,m533,JACKIE,they came out of my <u>body</u>!
154291,L128194,u4460,m295,MASON,...though.


In [85]:
movie_lines.iloc[[15]]

Unnamed: 0,LineID,Character,Movie,Name,Line
53841,L312909,u1598,m105,ORDELL,i need a favor.


In [86]:
# NLTK setup for sentence segmentation (sent_tokenize) and word tokenization
# (word_tokenize), both of which are applied in later cells.
# Rows without pronouns are dropped further down, not in this cell.
import nltk
from nltk import sent_tokenize, word_tokenize
# Fetch the 'punkt' package; per the recorded output this only reports
# "already up-to-date" when the package is present locally.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nicoespinosadice/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [87]:
# Split each dialogue line into a list of sentences.
movie_lines["Segmented_Line"] = movie_lines["Line"].map(sent_tokenize)

In [88]:
# Start with an empty frame carrying the same column labels as movie_lines.
df = pd.DataFrame(columns=movie_lines.columns)

In [89]:
df.head()

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line


In [90]:
# Expand every dialogue line into one row per sentence.
#
# The rows are collected in a plain list and converted to a DataFrame once.
# DataFrame.append (used here previously) was removed in pandas 2.0, and
# calling it inside a loop copies the whole frame on every iteration.
sentence_rows = []
for _, source_row in movie_lines.iterrows():
    for sentence in source_row["Segmented_Line"]:
        sentence_rows.append({
            "LineID": source_row["LineID"],
            "Character": source_row["Character"],
            "Movie": source_row["Movie"],
            "Name": source_row["Name"],
            # The full original line is repeated on every sentence row.
            "Line": source_row["Line"],
            # From here on "Segmented_Line" holds a single sentence, not a list.
            "Segmented_Line": sentence,
        })

# Explicit columns keep the layout stable even when no sentences were produced.
df = pd.DataFrame(
    sentence_rows,
    columns=["LineID", "Character", "Movie", "Name", "Line", "Segmented_Line"],
)

In [91]:
df.head()

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line
0,L328484,u1558,m104,BILL,"this is louisiana, chief. how the hell do yo...","this is louisiana, chief."
1,L328484,u1558,m104,BILL,"this is louisiana, chief. how the hell do yo...",how the hell do you know who your daddy is?
2,L328484,u1558,m104,BILL,"this is louisiana, chief. how the hell do yo...",'cause your momma told you so... you're way ou...
3,L328484,u1558,m104,BILL,"this is louisiana, chief. how the hell do yo...","jim sighs, saddened."
4,L328484,u1558,m104,BILL,"this is louisiana, chief. how the hell do yo...",bill was one of his best men.


In [92]:
# Rebind movie_lines to the per-sentence frame. This is an alias, not a copy:
# columns added to movie_lines afterwards also appear on df.
movie_lines = df

In [75]:
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line


In [67]:
# Disabled helper, kept for reference: it would word-tokenize every sentence
# in a list of sentences. word_tokenize is instead applied directly to the
# "Segmented_Line" column in a later cell.
# def tokenize_sentence_list(sentence_list):
#     return [word_tokenize(sentence) for sentence in sentence_list]

In [93]:
# Break each sentence into a list of word/punctuation tokens.
movie_lines["Tokenized_Line"] = movie_lines["Segmented_Line"].map(word_tokenize)


In [94]:
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line,Tokenized_Line
0,L328484,u1558,m104,BILL,"this is louisiana, chief. how the hell do yo...","this is louisiana, chief.","[this, is, louisiana, ,, chief, .]"
1,L328484,u1558,m104,BILL,"this is louisiana, chief. how the hell do yo...",how the hell do you know who your daddy is?,"[how, the, hell, do, you, know, who, your, dad..."
2,L328484,u1558,m104,BILL,"this is louisiana, chief. how the hell do yo...",'cause your momma told you so... you're way ou...,"['cause, your, momma, told, you, so, ..., you,..."
3,L328484,u1558,m104,BILL,"this is louisiana, chief. how the hell do yo...","jim sighs, saddened.","[jim, sighs, ,, saddened, .]"
4,L328484,u1558,m104,BILL,"this is louisiana, chief. how the hell do yo...",bill was one of his best men.,"[bill, was, one, of, his, best, men, .]"


In [98]:
def get_pronoun(tokenized_line):
    """Report which third-person pronoun occurs in a tokenized sentence.

    Args:
        tokenized_line: Container of tokens, tested with the ``in`` operator.
            Matching is exact, so tokens must already be lower-cased.

    Returns:
        "she" if the token "she" is present, otherwise "he" if the token "he"
        is present, otherwise the string "none". When both pronouns occur,
        "she" takes precedence.
    """
    # Order matters: the first pronoun found in this tuple is the one returned.
    for candidate in ("she", "he"):
        if candidate in tokenized_line:
            return candidate
    return "none"

In [99]:
# Label each tokenized sentence with the pronoun reported by get_pronoun.
movie_lines["Pronoun"] = movie_lines["Tokenized_Line"].map(get_pronoun)

In [100]:
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line,Tokenized_Line,Pronoun
0,L328484,u1558,m104,BILL,"this is louisiana, chief. how the hell do yo...","this is louisiana, chief.","[this, is, louisiana, ,, chief, .]",none
1,L328484,u1558,m104,BILL,"this is louisiana, chief. how the hell do yo...",how the hell do you know who your daddy is?,"[how, the, hell, do, you, know, who, your, dad...",none
2,L328484,u1558,m104,BILL,"this is louisiana, chief. how the hell do yo...",'cause your momma told you so... you're way ou...,"['cause, your, momma, told, you, so, ..., you,...",none
3,L328484,u1558,m104,BILL,"this is louisiana, chief. how the hell do yo...","jim sighs, saddened.","[jim, sighs, ,, saddened, .]",none
4,L328484,u1558,m104,BILL,"this is louisiana, chief. how the hell do yo...",bill was one of his best men.,"[bill, was, one, of, his, best, men, .]",none


In [105]:
# Discard every sentence whose "Pronoun" label is "none".
movie_lines = movie_lines[movie_lines["Pronoun"].ne("none")]

In [106]:
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line,Tokenized_Line,Pronoun
42,L157024,u4771,m317,TOBY,he's my contact for chrissake! there's two cr...,he's my contact for chrissake!,"[he, 's, my, contact, for, chrissake, !]",he
58,L103969,u599,m38,ANTHONY,i don't think you're giving him enough credit...,i know sometimes he doesn't think an idea thro...,"[i, know, sometimes, he, does, n't, think, an,...",he
59,L103969,u599,m38,ANTHONY,i don't think you're giving him enough credit...,he gets too excited.,"[he, gets, too, excited, .]",he
85,L342343,u6271,m419,MR. GUMB,"no. wait... was she a great, fat person? i ma...","wait... was she a great, fat person?","[wait, ..., was, she, a, great, ,, fat, person...",she
113,L208339,u920,m60,DUKE,"if so -- well, we'll just have to cut his hea...",he'd report us at once to some kind of outback...,"[he, 'd, report, us, at, once, to, some, kind,...",he


In [107]:
movie_lines.describe()

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line,Tokenized_Line,Pronoun
count,328,328,328,328,328,328,328,328
unique,276,258,206,233,276,328,327,2
top,L480951,u7936,m537,JEFF,"jonah, this isn't fair. you don't know victo...",he's my contact for chrissake!,"[what, did, she, say, ?]",he
freq,5,6,6,6,5,1,2,224


In [109]:
movie_lines.shape

(328, 8)