In [1]:
import string
import pandas as pd
import numpy as np
import ast
from timeit import timeit

import multiprocessing as mp

import nltk
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk.corpus import names 
from nltk import wordpunct_tokenize, ngrams
from nltk import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize
from nltk import pos_tag

from sklearn.base import BaseEstimator, TransformerMixin

### REQUIRED (RUN ONCE/DOWNLOAD): word tag file for lemmatization
#nltk.download('averaged_perceptron_tagger')
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('names')

## Function to read data

In [2]:
def process_file(filepath, headers):
    
    '''
    Takes a .txt file from the Cornell Movie Diaglogs Corpus 
    and returns a pandas dataframe
    
    '''
    lines = []
    
    to_longform  = ['text', 'line_id', 'genre']
    idx_col, tgt_col = (headers[:-1], headers[-1])
    
    with open(filepath, 'r', encoding = 'iso-8859-1') as f:
        for line in f:
            cols = line.split(' +++$+++ ')
            lines.append(cols)
            
    df = pd.DataFrame(lines, columns = headers)
    
    if tgt_col in to_longform:
        if tgt_col != 'text':
            df[tgt_col] = df[tgt_col].apply(lambda x: ast.literal_eval(x))
            
            # Convert wide format column to its own rows/long form
            df = df.set_index(idx_col)[tgt_col].apply(pd.Series).stack()
            df = df.reset_index()
        
            # Rename columns 
            add_idx_col = [tgt_col+'_idx', tgt_col]
            df.columns = idx_col + add_idx_col
        
        else:
            # Copy full text to another column 
            df['words'] = df[tgt_col]
            idx_col += [tgt_col]
            tgt_col = 'words'
            
            # Tokenize and Lemmatize the lines
            prepos = text_preprocess()
            with mp.Pool() as pool:
                df[tgt_col] = pool.map(prepos.transform, df[tgt_col])
                df[tgt_col] = df[tgt_col].apply(lambda x: ' '.join(x))
            
    return df

In [3]:
# Helper func to read list of pronouns
def read_pronoun(file):
    lines = []
    with open(file, 'r', encoding = 'iso-8859-1') as f:
        for line in f:
            cols = line.replace('\n','').split('\t')
            lines += cols
        return lines 

In [4]:
class text_preprocess(BaseEstimator, TransformerMixin):

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.lower      = lower
        self.strip      = strip
        self.pronouns   = read_pronoun('cornell_movie_dialogs_corpus/pronouns.txt')
        # Stopwords DOES NOT filter pronouns (He, she, etc.)
        self.stopwords  = set(sw.words('english')).difference(self.pronouns)
        self.punct      = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, sent, y=None):
        return self

    # def inverse_transform(self, sent):
    #    return [" ".join(w) for w in sent]

    def transform(self, sent):
        return list(self.tokenize(sent))
        
    
    def tokenize(self, sent):
        # Break the sentence into part of speech tagged tokens
        # print(pos_tag(wordpunct_tokenize(sent)))
        for token, tag in pos_tag(wordpunct_tokenize(sent)):
            # Apply preprocessing to the token
            token = token.lower() if self.lower else token
            token = token.strip() if self.strip else token
            token = token.strip('_') if self.strip else token
            token = token.strip('*') if self.strip else token
            
            # If stopword, ignore token and continue
            if token in self.stopwords:
                continue

            # If punctuation, ignore token and continue
            if all(char in self.punct for char in token):
                continue

            # Lemmatize the token and yield
            lemma = self.lemmatize(token, tag)
            yield lemma
    
    # Takes the word tag to access the appropriate WordNet dict to lemmatize word token,
    # if lemmatized word cannot be found in the folders, funct will treat it as a noun
    def lemmatize(self, token, tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)

In [5]:
# Testing the text_preprocess class
test_text = "Harry This ain't ...blonde she babble. his like, boring myself."
a = text_preprocess()
b = a.transform(test_text) # The second elements of the tuple are the tags

In [6]:
b

['harry', 'blonde', 'she', 'babble', 'his', 'like', 'bore', 'myself']

## Read character metadata

In [12]:
headers = ['character_id', 'name', 'movie_id', 'movie_title', 'gender', 'position']

file = 'cornell_movie_dialogs_corpus/movie_characters_metadata.txt'

characters = process_file(file, headers)

In [13]:
characters.head()

Unnamed: 0,character_id,name,movie_id,movie_title,gender,position
0,u0,BIANCA,m0,10 things i hate about you,f,4\n
1,u1,BRUCE,m0,10 things i hate about you,?,?\n
2,u2,CAMERON,m0,10 things i hate about you,m,3\n
3,u3,CHASTITY,m0,10 things i hate about you,?,?\n
4,u4,JOEY,m0,10 things i hate about you,m,6\n


## Clean character metadata

In [17]:
characters.groupby('gender').count()

Unnamed: 0_level_0,character_id,name,movie_id,movie_title,position
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
?,3791,3791,3791,3791,3791
F,45,45,45,45,45
M,150,150,150,150,150
f,1597,1597,1597,1597,1597
m,3452,3452,3452,3452,3452


^ Originally missing gender for 6,020 characters

In [18]:
# Classify and reduce unknown ('?') gender using nltk names corpus 
characters.loc[np.logical_and(characters.name.str.title().isin(names.words('male.txt')), characters.gender == '?'), 'gender'] = 'm'
characters.loc[np.logical_and(characters.name.str.title().isin(names.words('female.txt')), characters.gender == '?'), 'gender'] = 'f'

In [19]:
characters.groupby('gender').count()

Unnamed: 0_level_0,character_id,name,movie_id,movie_title,position
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
?,3791,3791,3791,3791,3791
F,45,45,45,45,45
M,150,150,150,150,150
f,1597,1597,1597,1597,1597
m,3452,3452,3452,3452,3452


In [20]:
characters.loc[characters.gender == 'F', 'gender'] = "f"
characters.loc[characters.gender == 'M', 'gender'] = "m"

In [21]:
characters.groupby('gender').count()

Unnamed: 0_level_0,character_id,name,movie_id,movie_title,position
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
?,3791,3791,3791,3791,3791
f,1642,1642,1642,1642,1642
m,3602,3602,3602,3602,3602


## Read conversation metadata

In [13]:
file_convos = 'cornell_movie_dialogs_corpus/movie_conversations.txt'

header_convos = ['char_id_from', 'char_id_to', 'movie_id', 'line_id']

convos = process_file(file_convos, header_convos)

In [14]:
len(convos)

304713

In [15]:
convos.head()

Unnamed: 0,char_id_from,char_id_to,movie_id,line_id_idx,line_id
0,u0,u2,m0,0,L194
1,u0,u2,m0,1,L195
2,u0,u2,m0,2,L196
3,u0,u2,m0,3,L197
4,u0,u2,m0,0,L198


## Read lines metadata

In [16]:
file_lines = 'cornell_movie_dialogs_corpus/movie_lines.txt'

header_lines = ['line_id', 'char_id', 'movie_id', 'char_name','text']

lines = process_file(file_lines, header_lines)

In [17]:
len(lines)

304713

In [18]:
lines.head()

Unnamed: 0,line_id,char_id,movie_id,char_name,text,words
0,L1045,u0,m0,BIANCA,They do not!\n,they
1,L1044,u2,m0,CAMERON,They do to!\n,they
2,L985,u0,m0,BIANCA,I hope so.\n,i hope
3,L984,u2,m0,CAMERON,She okay?\n,she okay
4,L925,u0,m0,BIANCA,Let's go.\n,let go


## Read title metadata

In [19]:
file_titles = 'cornell_movie_dialogs_corpus/movie_titles_metadata.txt'

header_titles = ['movie_id', 'movie_title', 'movie_year', 'imdb_rating', 'imdb_vote', 'genre']

titles = process_file(file_titles, header_titles)

In [20]:
titles.head()

Unnamed: 0,movie_id,movie_title,movie_year,imdb_rating,imdb_vote,genre_idx,genre
0,m0,10 things i hate about you,1999,6.9,62847,0,comedy
1,m0,10 things i hate about you,1999,6.9,62847,1,romance
2,m1,1492: conquest of paradise,1992,6.2,10421,0,adventure
3,m1,1492: conquest of paradise,1992,6.2,10421,1,biography
4,m1,1492: conquest of paradise,1992,6.2,10421,2,drama


# Joining tables

### Joining character metadata and conversations

In [21]:
char_cols = ['character_id', 'movie_id', 'gender']
conv_cols = ['char_id_from', 'char_id_to', 'line_id']

convo_gender = pd.merge(characters[char_cols], convos[conv_cols], left_on = 'character_id', right_on = 'char_id_from')
convo_gender.rename(columns={'gender':'gender_from'}, inplace=True)

convo_gender = pd.merge(characters[['character_id', 'gender']], convo_gender, left_on = 'character_id', right_on = 'char_id_to')
convo_gender.rename(columns={'gender':'gender_to'}, inplace=True)
convo_gender.drop(['character_id_x', 'character_id_y'], axis=1, inplace = True)

### Joining with words

In [22]:
line_cols = ['line_id', 'words']

convo_gender_words = pd.merge(convo_gender, lines[line_cols], on = 'line_id')

### Join with movie metadata

In [23]:
title_cols = ['movie_id', 'movie_year', 'genre']

#use the first genre only
main_genre = titles[titles.genre_idx == 0]

movies = pd.merge(convo_gender_words, main_genre[title_cols], on = 'movie_id')

In [24]:
movies.head(10)

Unnamed: 0,gender_to,movie_id,gender_from,char_id_from,char_id_to,line_id,words,movie_year,genre
0,m,m0,f,u0,u2,L194,we make quick roxanne korrine andrew barrett i...,1999,comedy
1,m,m0,f,u0,u2,L195,well i think we start pronunciation okay you,1999,comedy
2,m,m0,f,u0,u2,L196,hacking gagging spit part please,1999,comedy
3,m,m0,f,u0,u2,L197,okay bout we try french cuisine saturday night,1999,comedy
4,m,m0,f,u0,u2,L198,you ask me cute your name,1999,comedy
5,m,m0,f,u0,u2,L199,forget it,1999,comedy
6,m,m0,f,u0,u2,L200,it my fault we proper introduction,1999,comedy
7,m,m0,f,u0,u2,L201,cameron,1999,comedy
8,m,m0,f,u0,u2,L202,thing cameron i mercy particularly hideous bre...,1999,comedy
9,m,m0,f,u0,u2,L203,seems like she could get date easy enough,1999,comedy


### Pickle final table

In [25]:
import pickle

pickle.dump(movies, open('movies.p', 'wb'))

df = pickle.load(open("movies.p", 'rb'))

In [26]:
movies_small = movies[:1700000]

pickle.dump(movies_small, open('data/movies_small.p', 'wb'))

In [27]:
df.head()

Unnamed: 0,gender_to,movie_id,gender_from,char_id_from,char_id_to,line_id,words,movie_year,genre
0,m,m0,f,u0,u2,L194,we make quick roxanne korrine andrew barrett i...,1999,comedy
1,m,m0,f,u0,u2,L195,well i think we start pronunciation okay you,1999,comedy
2,m,m0,f,u0,u2,L196,hacking gagging spit part please,1999,comedy
3,m,m0,f,u0,u2,L197,okay bout we try french cuisine saturday night,1999,comedy
4,m,m0,f,u0,u2,L198,you ask me cute your name,1999,comedy
