In [16]:
import pandas as pd
import ast
import nltk
from nltk.tokenize import word_tokenize

## Function to read data

In [33]:
def process_file(filepath, headers):
    
    '''
    Takes a .txt file from the Cornell Movie Diaglogs Corpus 
    and returns a pandas dataframe
    
    '''
    lines = []
    
    to_longform  = ['text', 'line_id', 'genre']
    idx_col, tgt_col = (headers[:-1], headers[-1])
    
    with open(filepath, 'r', encoding = 'iso-8859-1') as f:
        for line in f:
            cols = line.split(' +++$+++ ')
            lines.append(cols)
            
    df = pd.DataFrame(lines, columns = headers)
    
    if tgt_col in to_longform:
        if tgt_col == 'text':
            df[tgt_col] = df[tgt_col].apply(lambda x: word_tokenize(x))
        else:
            df[tgt_col] = df[tgt_col].apply(lambda x: ast.literal_eval(x))
        
        # Convert wide format column to it's own rows/long form
        df = df.set_index(idx_col)[tgt_col].apply(pd.Series).stack()
        df = df.reset_index()
        
        # Rename columns 
        add_idx_col = [tgt_col+'_idx', tgt_col]
        df.columns = idx_col + add_idx_col
        
    return df

## Read character metadata

In [3]:
headers = ['character_id', 'name', 'movie_id', 'movie_title', 'gender', 'position']

file = 'cornell_movie_dialogs_corpus/movie_characters_metadata.txt'

characters = process_file(file, headers)

In [4]:
characters.head()

Unnamed: 0,character_id,name,movie_id,movie_title,gender,position
0,u0,BIANCA,m0,10 things i hate about you,f,4\n
1,u1,BRUCE,m0,10 things i hate about you,?,?\n
2,u2,CAMERON,m0,10 things i hate about you,m,3\n
3,u3,CHASTITY,m0,10 things i hate about you,?,?\n
4,u4,JOEY,m0,10 things i hate about you,m,6\n


In [5]:
characters.groupby('gender').count()

Unnamed: 0_level_0,character_id,name,movie_id,movie_title,position
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
?,6020,6020,6020,6020,6020
F,45,45,45,45,45
M,150,150,150,150,150
f,921,921,921,921,921
m,1899,1899,1899,1899,1899


## Clean character metadata

In [6]:
characters.loc[characters.gender == 'F', 'gender'] = "f"
characters.loc[characters.gender == 'M', 'gender'] = "m"

In [7]:
characters.groupby('gender').count()

Unnamed: 0_level_0,character_id,name,movie_id,movie_title,position
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
?,6020,6020,6020,6020,6020
f,966,966,966,966,966
m,2049,2049,2049,2049,2049


## Read conversation metadata

In [34]:
file_convo = 'cornell_movie_dialogs_corpus/movie_conversations.txt'

header_convo = ['id_from', 'id_to', 'movie_id', 'line_id']

convo = process_file(file_convo, header_convo)

In [35]:
convo.head()

Unnamed: 0,id_from,id_to,movie_id,line_id_idx,line_id
0,u0,u2,m0,0,L194
1,u0,u2,m0,1,L195
2,u0,u2,m0,2,L196
3,u0,u2,m0,3,L197
4,u0,u2,m0,0,L198


## Read lines metadata

In [38]:
## NOTE: Original text is curently not saved, can modify the process_file if it's needed

file_lines = 'cornell_movie_dialogs_corpus/movie_lines.txt'

header_lines = ['line_id', 'char_id', 'movie_id', 'char_name','text']

lines = process_file(file_lines, header_lines)

In [39]:
lines.head()

Unnamed: 0,line_id,char_id,movie_id,char_name,text_idx,text
0,L1045,u0,m0,BIANCA,0,They
1,L1045,u0,m0,BIANCA,1,do
2,L1045,u0,m0,BIANCA,2,not
3,L1045,u0,m0,BIANCA,3,!
4,L1044,u2,m0,CAMERON,0,They


## Read title metadata

In [41]:
file_titles = 'cornell_movie_dialogs_corpus/movie_titles_metadata.txt'

header_titles = ['movie_id', 'movie_title', 'movie_year', 'imdb_rating', 'imdb_vote', 'genre']

titles = process_file(file_titles, header_titles)

In [42]:
titles.head()

Unnamed: 0,movie_id,movie_title,movie_year,imdb_rating,imdb_vote,genre_idx,genre
0,m0,10 things i hate about you,1999,6.9,62847,0,comedy
1,m0,10 things i hate about you,1999,6.9,62847,1,romance
2,m1,1492: conquest of paradise,1992,6.2,10421,0,adventure
3,m1,1492: conquest of paradise,1992,6.2,10421,1,biography
4,m1,1492: conquest of paradise,1992,6.2,10421,2,drama
