In [1]:
import pandas as pd

Source for how to import data:  
https://www.kaggle.com/shashankasubrahmanya/preprocessing-cornell-movie-dialogue-corpus

In [2]:
# Column names for movie_lines.txt, in the order the fields appear in the file.
movie_lines_features = [
    "LineID",
    "Character",
    "Movie",
    "Name",
    "Line",
]


In [3]:
# Load the raw dialogue lines.
# Fields in the corpus are separated by the literal token "+++$+++". pandas
# treats a multi-character separator as a regular expression, so each "+" and
# "$" is escaped. The pattern is a raw string: in a normal string literal
# "\+" and "\$" are invalid escape sequences, which Python reports as a
# DeprecationWarning/SyntaxWarning.
# index_col=False stops pandas from using the first column as the index, and
# `names` supplies the header because the file has none.
movie_lines = pd.read_csv(
    "data/movie_lines.txt",
    sep=r"\+\+\+\$\+\+\+",
    engine="python",
    encoding="ISO-8859-1",
    index_col=False,
    names=movie_lines_features,
)


In [4]:
# Keep a 1% random sample of the lines to keep the rest of the notebook fast.
# random_state pins the draw so that Restart & Run All selects the same rows
# every time; without it each run produced a different sample.
# NOTE: this rebinds movie_lines, so re-running the cell samples the sample.
movie_lines = movie_lines.sample(frac=0.01, random_state=42)

In [5]:
# Peek at the first five rows of the sampled lines.
movie_lines.head(n=5)

Unnamed: 0,LineID,Character,Movie,Name,Line
141975,L95820,u4136,m276,GEORGE,But this is...
151314,L125213,u4404,m292,DIL,"Apologies, my sweet."
167184,L161309,u4816,m321,GALLAGHER,Fill it out. Standard issue mal- practice in...
215310,L356770,u6378,m425,BOB,"Ok, whatever you think... I'm lost."
124737,L33855,u3652,m240,STIFLER,"SUCK ME, BEAUTIFUL!"


In [6]:
# Normalise the text columns for later matching:
#   - LineID: trim the surrounding whitespace from every identifier.
#   - Line:   coerce each value to str, then lower-case it.
movie_lines["LineID"] = movie_lines["LineID"].map(str.strip)
movie_lines["Line"] = movie_lines["Line"].map(lambda text: str(text).lower())

In [7]:
# Re-inspect the first five rows after the LineID / Line clean-up.
movie_lines.head(n=5)

Unnamed: 0,LineID,Character,Movie,Name,Line
141975,L95820,u4136,m276,GEORGE,but this is...
151314,L125213,u4404,m292,DIL,"apologies, my sweet."
167184,L161309,u4816,m321,GALLAGHER,fill it out. standard issue mal- practice in...
215310,L356770,u6378,m425,BOB,"ok, whatever you think... i'm lost."
124737,L33855,u3652,m240,STIFLER,"suck me, beautiful!"


In [8]:
# # Remove lines that do not contain he or she pronouns
# # Only include lines where ' he ' is in middle of sentence, surrounded by at least 1 space on either side
# movie_lines = movie_lines[movie_lines["Line"].str.contains('|'.join([' he ', ' she ']))]

In [9]:
# Show the first five rows again. The pronoun filter in the previous cell is
# commented out, so the frame is unchanged at this point.
movie_lines.head(n=5)

Unnamed: 0,LineID,Character,Movie,Name,Line
141975,L95820,u4136,m276,GEORGE,but this is...
151314,L125213,u4404,m292,DIL,"apologies, my sweet."
167184,L161309,u4816,m321,GALLAGHER,fill it out. standard issue mal- practice in...
215310,L356770,u6378,m425,BOB,"ok, whatever you think... i'm lost."
124737,L33855,u3652,m240,STIFLER,"suck me, beautiful!"


In [10]:
# Spot-check a single row by position (the 16th row of the sample).
# The list inside iloc keeps the result a one-row DataFrame rather than a Series.
movie_lines.iloc[[15]]

Unnamed: 0,LineID,Character,Movie,Name,Line
21720,L122522,u676,m43,CHUCK,"may i ask, where are you bound?"


In [11]:
# Load NLTK and its sentence / word tokenizers, used by the segmentation and
# tokenization cells further down.
import nltk
from nltk import sent_tokenize, word_tokenize
# Fetch the 'punkt' package through the NLTK downloader.
# NOTE(review): presumably this is the data sent_tokenize / word_tokenize rely
# on — confirm the package name for the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/owner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# Split every dialogue line into its sentences; each cell of the new column
# holds the list returned by sent_tokenize for that line.
movie_lines["Segmented_Line"] = movie_lines["Line"].map(sent_tokenize)

In [13]:
# Start an empty frame with the same columns as movie_lines (which at this
# point includes Segmented_Line); it will hold one row per sentence.
df = pd.DataFrame(columns=movie_lines.columns)

In [14]:
# Confirm the new frame has the expected columns and no rows yet.
df.head(n=5)

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line


In [15]:
# Expand every dialogue line into one row per sentence.
#
# The rows are collected in a plain list and the frame is built once at the
# end. DataFrame.append was removed in pandas 2.0, and calling it inside the
# loop copied the whole frame for every sentence (quadratic time).
sentence_rows = []
for _, row in movie_lines.iterrows():
    for sentence in row["Segmented_Line"]:
        sentence_rows.append({
            "LineID": row["LineID"],
            "Character": row["Character"],
            "Movie": row["Movie"],
            "Name": row["Name"],
            "Line": row["Line"],  # the full original line is kept next to each sentence
            "Segmented_Line": sentence,
        })

# Naming the columns explicitly keeps the layout stable even when no sentence
# was produced (an empty list would otherwise give a frame with no columns).
df = pd.DataFrame(
    sentence_rows,
    columns=["LineID", "Character", "Movie", "Name", "Line", "Segmented_Line"],
)

In [16]:
# Check the first five per-sentence rows.
df.head(n=5)

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line
0,L95820,u4136,m276,GEORGE,but this is...,but this is...
1,L125213,u4404,m292,DIL,"apologies, my sweet.","apologies, my sweet."
2,L161309,u4816,m321,GALLAGHER,fill it out. standard issue mal- practice in...,fill it out.
3,L161309,u4816,m321,GALLAGHER,fill it out. standard issue mal- practice in...,standard issue mal- practice insurance.
4,L161309,u4816,m321,GALLAGHER,fill it out. standard issue mal- practice in...,all cops carry it.


In [17]:
# Rebind movie_lines to the per-sentence frame. No copy is made, so df and
# movie_lines refer to the same object from here on.
movie_lines = df

In [18]:
# movie_lines now refers to the per-sentence frame; show its first five rows.
movie_lines.head(n=5)

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line
0,L95820,u4136,m276,GEORGE,but this is...,but this is...
1,L125213,u4404,m292,DIL,"apologies, my sweet.","apologies, my sweet."
2,L161309,u4816,m321,GALLAGHER,fill it out. standard issue mal- practice in...,fill it out.
3,L161309,u4816,m321,GALLAGHER,fill it out. standard issue mal- practice in...,standard issue mal- practice insurance.
4,L161309,u4816,m321,GALLAGHER,fill it out. standard issue mal- practice in...,all cops carry it.


In [19]:
# def tokenize_sentence_list(sentence_list):
#     return [word_tokenize(sentence) for sentence in sentence_list]

In [20]:
# Break each sentence into word / punctuation tokens; every cell of the new
# column holds the token list returned by word_tokenize.
movie_lines["Tokenized_Line"] = movie_lines["Segmented_Line"].map(word_tokenize)


In [21]:
# Inspect the first five rows with the new Tokenized_Line column.
movie_lines.head(n=5)

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line,Tokenized_Line
0,L95820,u4136,m276,GEORGE,but this is...,but this is...,"[but, this, is, ...]"
1,L125213,u4404,m292,DIL,"apologies, my sweet.","apologies, my sweet.","[apologies, ,, my, sweet, .]"
2,L161309,u4816,m321,GALLAGHER,fill it out. standard issue mal- practice in...,fill it out.,"[fill, it, out, .]"
3,L161309,u4816,m321,GALLAGHER,fill it out. standard issue mal- practice in...,standard issue mal- practice insurance.,"[standard, issue, mal-, practice, insurance, .]"
4,L161309,u4816,m321,GALLAGHER,fill it out. standard issue mal- practice in...,all cops carry it.,"[all, cops, carry, it, .]"


In [22]:
def get_pronoun(tokenized_line):
    """Return which third-person pronoun occurs in a tokenized sentence.

    Args:
        tokenized_line: container of tokens, tested with the ``in`` operator.

    Returns:
        "she" if the token "she" is present, otherwise "he" if the token "he"
        is present, otherwise the string "none". When both pronouns occur,
        "she" wins because it is checked first.
    """
    # Candidates are listed in priority order; the first one found is returned.
    for candidate in ("she", "he"):
        if candidate in tokenized_line:
            return candidate
    return "none"

In [23]:
# Label every sentence with the pronoun found among its tokens
# ("she", "he" or "none").
movie_lines["Pronoun"] = movie_lines["Tokenized_Line"].map(get_pronoun)

In [24]:
# Inspect the first five rows with the new Pronoun column.
movie_lines.head(n=5)

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line,Tokenized_Line,Pronoun
0,L95820,u4136,m276,GEORGE,but this is...,but this is...,"[but, this, is, ...]",none
1,L125213,u4404,m292,DIL,"apologies, my sweet.","apologies, my sweet.","[apologies, ,, my, sweet, .]",none
2,L161309,u4816,m321,GALLAGHER,fill it out. standard issue mal- practice in...,fill it out.,"[fill, it, out, .]",none
3,L161309,u4816,m321,GALLAGHER,fill it out. standard issue mal- practice in...,standard issue mal- practice insurance.,"[standard, issue, mal-, practice, insurance, .]",none
4,L161309,u4816,m321,GALLAGHER,fill it out. standard issue mal- practice in...,all cops carry it.,"[all, cops, carry, it, .]",none


In [25]:
# Keep only the sentences that were labelled with a pronoun.
has_pronoun = movie_lines["Pronoun"].ne("none")
movie_lines = movie_lines[has_pronoun]

In [26]:
# First five rows that remain after the pronoun filter.
movie_lines.head(n=5)

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line,Tokenized_Line,Pronoun
45,L307448,u1427,m96,DAD,what did he talk to you about?,what did he talk to you about?,"[what, did, he, talk, to, you, about, ?]",he
52,L340500,u6258,m418,KELLY,"in his defense, every primitive culture known...","he's a mythology professor, he thinks crocs ar...","[he, 's, a, mythology, professor, ,, he, think...",he
58,L431457,u6953,m465,EDIE,he was looking at everybody the same way. ask...,he was looking at everybody the same way.,"[he, was, looking, at, everybody, the, same, w...",he
75,L623016,u8573,m581,SETH,she's not at that place you sent her?,she's not at that place you sent her?,"[she, 's, not, at, that, place, you, sent, her...",she
102,L412378,u2256,m145,DAY-DAY,he's inside the house?,he's inside the house?,"[he, 's, inside, the, house, ?]",he


In [27]:
# Per-column summary of the filtered frame (count / unique / top / freq);
# the Pronoun column shows how the sentences split between "he" and "she".
movie_lines.describe()

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line,Tokenized_Line,Pronoun
count,363,363,363,363,363,363,363,363
unique,297,278,224,247,297,363,362,2
top,L400191,u516,m32,RONNIE,there's this kid at school... martin brockett...,what did he talk to you about?,"[he, 's, <, u, >, dead, <, /u, >, .]",he
freq,4,6,6,6,4,1,2,241


In [28]:
# (rows, columns) of the filtered per-sentence frame.
movie_lines.shape

(363, 8)

In [29]:
# Fetch the 'universal_tagset' package, then smoke-test part-of-speech tagging
# on two words, keeping only the tag of each (word, tag) pair.
# NOTE(review): pos_tag presumably also needs a tagger model that this
# notebook never downloads — confirm it is installed on a fresh machine.
nltk.download('universal_tagset')
tagged_sample = nltk.pos_tag(['hi', 'I'], tagset='universal')
pos = [tag for _, tag in tagged_sample]

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/owner/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [30]:
# Display the universal tags produced for ['hi', 'I'].
pos

['NOUN', 'PRON']