In [1]:
import pandas as pd

Source for how to import data:  
https://www.kaggle.com/shashankasubrahmanya/preprocessing-cornell-movie-dialogue-corpus

In [2]:
movie_lines_features = ["LineID", "Character", "Movie", "Name", "Line"]


In [3]:
movie_lines = pd.read_csv("data/movie_lines.txt", sep = "\+\+\+\$\+\+\+", engine = "python", encoding='ISO-8859-1', index_col = False, names = movie_lines_features)


In [4]:
movie_lines = movie_lines.sample(frac=0.01)

In [5]:
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line
86417,L469228,u2557,m165,WELLES,Hearst looks down at the world at his feet Ev...
9153,L37362,u334,m20,EVELYN,"Oh God, I can't believe this."
255662,L501385,u7593,m513,JACKIE,"If you screw me up for tonight, I'll kill you..."
151938,L126716,u4421,m293,REGGIE,"Well, I guess that's it -- dead end."
194227,L258742,u5677,m376,FRANK,Play that sweet one you know. The one makes m...


In [6]:
# Strip the space from "LineID" for further usage and change the datatype of "Line"
movie_lines["LineID"] = movie_lines["LineID"].apply(str.strip)
movie_lines["Line"] = movie_lines["Line"].apply(str)
movie_lines["Line"] = movie_lines["Line"].apply(str.lower)

In [7]:
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line
86417,L469228,u2557,m165,WELLES,hearst looks down at the world at his feet ev...
9153,L37362,u334,m20,EVELYN,"oh god, i can't believe this."
255662,L501385,u7593,m513,JACKIE,"if you screw me up for tonight, i'll kill you..."
151938,L126716,u4421,m293,REGGIE,"well, i guess that's it -- dead end."
194227,L258742,u5677,m376,FRANK,play that sweet one you know. the one makes m...


In [8]:
# # Remove lines that do not contain he or she pronouns
# # Only include lines where ' he ' is in middle of sentence, surrounded by at least 1 space on either side
# movie_lines = movie_lines[movie_lines["Line"].str.contains('|'.join([' he ', ' she ']))]

In [9]:
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line
86417,L469228,u2557,m165,WELLES,hearst looks down at the world at his feet ev...
9153,L37362,u334,m20,EVELYN,"oh god, i can't believe this."
255662,L501385,u7593,m513,JACKIE,"if you screw me up for tonight, i'll kill you..."
151938,L126716,u4421,m293,REGGIE,"well, i guess that's it -- dead end."
194227,L258742,u5677,m376,FRANK,play that sweet one you know. the one makes m...


In [10]:
movie_lines.iloc[[15]]

Unnamed: 0,LineID,Character,Movie,Name,Line
154225,L129516,u4451,m295,BRYNNER,they took that life away and left me with not...


In [11]:
# Tokenize and segment
# Drop rows without pronouns
import nltk
from nltk import sent_tokenize, word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/michellelum/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
movie_lines["Segmented_Line"] = movie_lines["Line"].apply(sent_tokenize)

In [13]:
df = pd.DataFrame(columns=movie_lines.columns)

In [14]:
df.head()

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line


In [15]:
for row in movie_lines.iterrows():
    for sentence in row[1]["Segmented_Line"]:
        line_id = row[1]["LineID"]
        character = row[1]["Character"]
        movie = row[1]["Movie"]
        name = row[1]["Name"]
        line = row[1]["Line"]
        segmented = sentence
        new_row = {"LineID":line_id, "Character": character,
                   "Movie":movie,"Name":name,"Line":line,
                   "Segmented_Line":segmented}
        df = df.append(new_row, ignore_index=True)

In [16]:
df.head()

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line
0,L469228,u2557,m165,WELLES,hearst looks down at the world at his feet ev...,hearst looks down at the world at his feet ev...
1,L37362,u334,m20,EVELYN,"oh god, i can't believe this.","oh god, i can't believe this."
2,L501385,u7593,m513,JACKIE,"if you screw me up for tonight, i'll kill you...","if you screw me up for tonight, i'll kill you..."
3,L126716,u4421,m293,REGGIE,"well, i guess that's it -- dead end.","well, i guess that's it -- dead end."
4,L258742,u5677,m376,FRANK,play that sweet one you know. the one makes m...,play that sweet one you know.


In [17]:
movie_lines = df

In [18]:
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line
0,L469228,u2557,m165,WELLES,hearst looks down at the world at his feet ev...,hearst looks down at the world at his feet ev...
1,L37362,u334,m20,EVELYN,"oh god, i can't believe this.","oh god, i can't believe this."
2,L501385,u7593,m513,JACKIE,"if you screw me up for tonight, i'll kill you...","if you screw me up for tonight, i'll kill you..."
3,L126716,u4421,m293,REGGIE,"well, i guess that's it -- dead end.","well, i guess that's it -- dead end."
4,L258742,u5677,m376,FRANK,play that sweet one you know. the one makes m...,play that sweet one you know.


In [19]:
# def tokenize_sentence_list(sentence_list):
#     return [word_tokenize(sentence) for sentence in sentence_list]

In [20]:
movie_lines["Tokenized_Line"] = movie_lines["Segmented_Line"].apply(word_tokenize)


In [21]:
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line,Tokenized_Line
0,L469228,u2557,m165,WELLES,hearst looks down at the world at his feet ev...,hearst looks down at the world at his feet ev...,"[hearst, looks, down, at, the, world, at, his,..."
1,L37362,u334,m20,EVELYN,"oh god, i can't believe this.","oh god, i can't believe this.","[oh, god, ,, i, ca, n't, believe, this, .]"
2,L501385,u7593,m513,JACKIE,"if you screw me up for tonight, i'll kill you...","if you screw me up for tonight, i'll kill you...","[if, you, screw, me, up, for, tonight, ,, i, '..."
3,L126716,u4421,m293,REGGIE,"well, i guess that's it -- dead end.","well, i guess that's it -- dead end.","[well, ,, i, guess, that, 's, it, --, dead, en..."
4,L258742,u5677,m376,FRANK,play that sweet one you know. the one makes m...,play that sweet one you know.,"[play, that, sweet, one, you, know, .]"


In [22]:
def get_pronoun(tokenized_line):
    if "she" in tokenized_line:
        return "she"
    elif "he" in tokenized_line:
        return "he"
    else:
        return "none"

In [23]:
movie_lines["Pronoun"] = movie_lines["Tokenized_Line"].apply(get_pronoun)

In [24]:
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line,Tokenized_Line,Pronoun
0,L469228,u2557,m165,WELLES,hearst looks down at the world at his feet ev...,hearst looks down at the world at his feet ev...,"[hearst, looks, down, at, the, world, at, his,...",none
1,L37362,u334,m20,EVELYN,"oh god, i can't believe this.","oh god, i can't believe this.","[oh, god, ,, i, ca, n't, believe, this, .]",none
2,L501385,u7593,m513,JACKIE,"if you screw me up for tonight, i'll kill you...","if you screw me up for tonight, i'll kill you...","[if, you, screw, me, up, for, tonight, ,, i, '...",none
3,L126716,u4421,m293,REGGIE,"well, i guess that's it -- dead end.","well, i guess that's it -- dead end.","[well, ,, i, guess, that, 's, it, --, dead, en...",none
4,L258742,u5677,m376,FRANK,play that sweet one you know. the one makes m...,play that sweet one you know.,"[play, that, sweet, one, you, know, .]",none


In [25]:
movie_lines = movie_lines.loc[movie_lines["Pronoun"] != "none"]

In [26]:
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line,Tokenized_Line,Pronoun
22,L380296,u6576,m437,ISABELLA,angola. my mother was being a surgeon there....,she volunteered.,"[she, volunteered, .]",she
23,L380296,u6576,m437,ISABELLA,angola. my mother was being a surgeon there....,"mozambique and mali, too... ...those days, in...","[mozambique, and, mali, ,, too, ..., ..., thos...",she
24,L380296,u6576,m437,ISABELLA,angola. my mother was being a surgeon there....,she came from a bourgeois family with a house ...,"[she, came, from, a, bourgeois, family, with, ...",she
75,L198919,u5164,m342,JACK,"no, i, uh, left a dog here this morning. he n...",he needed some work on his mouth.,"[he, needed, some, work, on, his, mouth, .]",he
102,L212226,u5262,m348,JACK,"six months ago, bob's testicles were removed....",he developed bitch tits because his testostero...,"[he, developed, bitch, tits, because, his, tes...",he


In [27]:
movie_lines.describe()

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line,Tokenized_Line,Pronoun
count,346,346,346,346,346,346,346,346
unique,285,271,219,244,283,344,342,2
top,L497478,u2688,m437,CYNTHIA,"i was... eight years old. michael green, who ...",what did he say?,"[where, is, he, ?]",he
freq,5,6,7,6,5,2,3,231


In [28]:
movie_lines.shape

(346, 8)

In [33]:
nltk.download('universal_tagset')

def get_verb(tokenized_line):
    pronoun = get_pronoun(tokenized_line)
    pronoun_index = tokenized_line.index(pronoun)
    word_pos_pairs = nltk.pos_tag(tokenized_line[pronoun_index:], tagset='universal')
    pos_list = [pos for _, pos in word_pos_pairs]
    
    if 'VERB' not in pos_list:
        return 'none'
    
    verb_index = pos_list.index('VERB')
    # TODO: consider second verb after pronoun
    # ex: he's biking, 's and biking are both VERB
    return word_pos_pairs[verb_index][0]

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/michellelum/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [34]:
movie_lines["Verb"] = movie_lines["Tokenized_Line"].apply(get_verb)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_lines["Verb"] = movie_lines["Tokenized_Line"].apply(get_verb)


In [35]:
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line,Tokenized_Line,Pronoun,Verb
22,L380296,u6576,m437,ISABELLA,angola. my mother was being a surgeon there....,she volunteered.,"[she, volunteered, .]",she,volunteered
23,L380296,u6576,m437,ISABELLA,angola. my mother was being a surgeon there....,"mozambique and mali, too... ...those days, in...","[mozambique, and, mali, ,, too, ..., ..., thos...",she,was
24,L380296,u6576,m437,ISABELLA,angola. my mother was being a surgeon there....,she came from a bourgeois family with a house ...,"[she, came, from, a, bourgeois, family, with, ...",she,came
75,L198919,u5164,m342,JACK,"no, i, uh, left a dog here this morning. he n...",he needed some work on his mouth.,"[he, needed, some, work, on, his, mouth, .]",he,needed
102,L212226,u5262,m348,JACK,"six months ago, bob's testicles were removed....",he developed bitch tits because his testostero...,"[he, developed, bitch, tits, because, his, tes...",he,developed


In [36]:
movie_lines.describe()

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line,Tokenized_Line,Pronoun,Verb
count,346,346,346,346,346,346,346,346,346
unique,285,271,219,244,283,344,342,2,100
top,L497478,u2688,m437,CYNTHIA,"i was... eight years old. michael green, who ...",what did he say?,"[where, is, he, ?]",he,'s
freq,5,6,7,6,5,2,3,231,84
