In [16]:
import pandas as pd
import ast
import nltk
from nltk.tokenize import word_tokenize

## Function to read data

In [2]:
def process_file(filepath, headers):
    
    '''
    Takes a .txt file from the Cornell Movie Diaglogs Corpus 
    and returns a pandas dataframe
    
    '''

    lines = []

    with open(filepath, 'r', encoding = 'iso-8859-1') as f:
        for line in f:
            cols = line.split(' +++$+++ ')
            lines.append(cols)
            
    df = pd.DataFrame(lines, columns = headers)
    
    return df

## Read character metadata

In [3]:
headers = ['character_id', 'name', 'movie_id', 'movie_title', 'gender', 'position']

file = 'cornell_movie_dialogs_corpus/movie_characters_metadata.txt'

characters = process_file(file, headers)

In [4]:
characters.head()

Unnamed: 0,character_id,name,movie_id,movie_title,gender,position
0,u0,BIANCA,m0,10 things i hate about you,f,4\n
1,u1,BRUCE,m0,10 things i hate about you,?,?\n
2,u2,CAMERON,m0,10 things i hate about you,m,3\n
3,u3,CHASTITY,m0,10 things i hate about you,?,?\n
4,u4,JOEY,m0,10 things i hate about you,m,6\n


In [5]:
characters.groupby('gender').count()

Unnamed: 0_level_0,character_id,name,movie_id,movie_title,position
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
?,6020,6020,6020,6020,6020
F,45,45,45,45,45
M,150,150,150,150,150
f,921,921,921,921,921
m,1899,1899,1899,1899,1899


## Clean character metadata

In [6]:
characters.loc[characters.gender == 'F', 'gender'] = "f"
characters.loc[characters.gender == 'M', 'gender'] = "m"

In [7]:
characters.groupby('gender').count()

Unnamed: 0_level_0,character_id,name,movie_id,movie_title,position
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
?,6020,6020,6020,6020,6020
f,966,966,966,966,966
m,2049,2049,2049,2049,2049


## Read conversation metadata

In [8]:
file_convo = 'cornell_movie_dialogs_corpus/movie_conversations.txt'

header_convo = ['id_from', 'id_to', 'movie_id', 'turns']

convo = process_file(file_convo, header_convo)

In [9]:
convo.head()

Unnamed: 0,id_from,id_to,movie_id,turns
0,u0,u2,m0,"['L194', 'L195', 'L196', 'L197']\n"
1,u0,u2,m0,"['L198', 'L199']\n"
2,u0,u2,m0,"['L200', 'L201', 'L202', 'L203']\n"
3,u0,u2,m0,"['L204', 'L205', 'L206']\n"
4,u0,u2,m0,"['L207', 'L208']\n"


## Cleaning conversation metadata

In [12]:
# Convert 'turns' column from string into list 
convo['turns'] = convo['turns'].apply(lambda x: ast.literal_eval(x))

# Convert 'turn' column from wide to long format
convo = convo.set_index(['id_from', 'id_to', 'movie_id'])['turns'].apply(pd.Series).stack()
convo = convo.reset_index()
convo.columns = ['id_from','id_to','movie_id', 'turn','line_id']

In [13]:
convo.head()

Unnamed: 0,id_from,id_to,movie_id,turn,line_id
0,u0,u2,m0,0,L194
1,u0,u2,m0,1,L195
2,u0,u2,m0,2,L196
3,u0,u2,m0,3,L197
4,u0,u2,m0,0,L198


## Read lines metadata

In [19]:
file_lines = 'cornell_movie_dialogs_corpus/movie_lines.txt'

header_lines = ['line_id', 'char_id', 'movie_id', 'char_name','text']

lines = process_file(file_lines, header_lines)

In [15]:
lines.head()

Unnamed: 0,line_id,char_id,movie_id,char_name,text
0,L1045,u0,m0,BIANCA,They do not!\n
1,L1044,u2,m0,CAMERON,They do to!\n
2,L985,u0,m0,BIANCA,I hope so.\n
3,L984,u2,m0,CAMERON,She okay?\n
4,L925,u0,m0,BIANCA,Let's go.\n


In [20]:
# Tokenize the words from each text
lines['words'] = lines['text'].apply(lambda x: word_tokenize(x))

In [22]:
lines = lines.set_index(['line_id', 'char_id', 'movie_id', 'char_name', 'text'])['words'].apply(pd.Series).stack()
lines = lines.reset_index()
lines.columns = ['line_id', 'char_id', 'movie_id', 'char_name', 'text', 'word_pos', 'word']

In [23]:
lines.head()

Unnamed: 0,line_id,char_id,movie_id,char_name,text,word_pos,word
0,L1045,u0,m0,BIANCA,They do not!\n,0,They
1,L1045,u0,m0,BIANCA,They do not!\n,1,do
2,L1045,u0,m0,BIANCA,They do not!\n,2,not
3,L1045,u0,m0,BIANCA,They do not!\n,3,!
4,L1044,u2,m0,CAMERON,They do to!\n,0,They
