In [1]:
import string
import pandas as pd
import numpy as np
import ast
from timeit import timeit

import multiprocessing as mp

import nltk
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk.corpus import names 
from nltk import wordpunct_tokenize, ngrams
from nltk import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize
from nltk import pos_tag

from sklearn.base import BaseEstimator, TransformerMixin

### REQUIRED (RUN ONCE/DOWNLOAD): word tag file for lemmatization
#nltk.download('averaged_perceptron_tagger')
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('names')

## Function to read data

In [2]:
def process_file(filepath, headers):
    
    '''
    Takes a .txt file from the Cornell Movie Diaglogs Corpus 
    and returns a pandas dataframe
    
    '''
    lines = []
    
    to_longform  = ['text', 'line_id', 'genre']
    idx_col, tgt_col = (headers[:-1], headers[-1])
    
    with open(filepath, 'r', encoding = 'iso-8859-1') as f:
        for line in f:
            cols = line.split(' +++$+++ ')
            lines.append(cols)
            
    df = pd.DataFrame(lines, columns = headers)
    
    if tgt_col in to_longform:
        if tgt_col != 'text':
            df[tgt_col] = df[tgt_col].apply(lambda x: ast.literal_eval(x))
            
            # Convert wide format column to its own rows/long form
            df = df.set_index(idx_col)[tgt_col].apply(pd.Series).stack()
            df = df.reset_index()
        
            # Rename columns 
            add_idx_col = [tgt_col+'_idx', tgt_col]
            df.columns = idx_col + add_idx_col
        
        else:
            # Copy full text to another column 
            df['words'] = df[tgt_col]
            idx_col += [tgt_col]
            tgt_col = 'words'
            
            # Tokenize and Lemmatize the lines
            prepos = text_preprocess()
            with mp.Pool() as pool:
                df[tgt_col] = pool.map(prepos.transform, df[tgt_col])
                df[tgt_col] = df[tgt_col].apply(lambda x: ' '.join(x))
            
    return df

In [3]:
# Helper func to read list of pronouns
def read_pronoun(file):
    lines = []
    with open(file, 'r', encoding = 'iso-8859-1') as f:
        for line in f:
            cols = line.replace('\n','').split('\t')
            lines += cols
        return lines 

In [4]:
class text_preprocess(BaseEstimator, TransformerMixin):

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True, with_pro=True):
        self.lower      = lower
        self.strip      = strip
        self.with_pro   = with_pro
        self.pronouns   = read_pronoun('../cornell_movie_dialogs_corpus/pronouns.txt')
        # If pron=True stopwords DOES NOT include pronouns (i.e., pronouns are included in corpus) 
        self.stopwords  = [set(sw.words('english')).difference(self.pronouns) if self.with_pro else set(sw.words('english'))][0]
        self.punct      = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, sent, y=None):
        return self

    # def inverse_transform(self, sent):
    #    return [" ".join(w) for w in sent]

    def transform(self, sent):
        return list(self.tokenize(sent))
        
    
    def tokenize(self, sent):
        # Break the sentence into part of speech tagged tokens
        # print(pos_tag(wordpunct_tokenize(sent)))
        for token, tag in pos_tag(wordpunct_tokenize(sent)):
            # Apply preprocessing to the token
            token = token.lower() if self.lower else token
            token = token.strip() if self.strip else token
            token = token.strip('_') if self.strip else token
            token = token.strip('*') if self.strip else token
            
            # If stopword, ignore token and continue
            if token in self.stopwords:
                continue

            # If punctuation, ignore token and continue
            if all(char in self.punct for char in token):
                continue

            # Lemmatize the token and yield
            lemma = self.lemmatize(token, tag)
            yield lemma
    
    # Takes the word tag to access the appropriate WordNet dict to lemmatize word token,
    # if lemmatized word cannot be found in the folders, funct will treat it as a noun
    def lemmatize(self, token, tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)

In [5]:
# Testing the text_preprocess class
test_text = "Harry This ain't ...blonde she babble. his like, boring myself."
a = text_preprocess()
b = a.transform(test_text) # The second elements of the tuple are the tags

In [6]:
b

['harry', 'blonde', 'she', 'babble', 'his', 'like', 'bore', 'myself']

## Read character metadata

In [7]:
headers = ['character_id', 'name', 'movie_id', 'movie_title', 'gender', 'position']

file = '../cornell_movie_dialogs_corpus/movie_characters_metadata.txt'

characters = process_file(file, headers)

In [8]:
characters.head()

Unnamed: 0,character_id,name,movie_id,movie_title,gender,position
0,u0,BIANCA,m0,10 things i hate about you,f,4\n
1,u1,BRUCE,m0,10 things i hate about you,?,?\n
2,u2,CAMERON,m0,10 things i hate about you,m,3\n
3,u3,CHASTITY,m0,10 things i hate about you,?,?\n
4,u4,JOEY,m0,10 things i hate about you,m,6\n


## Clean character metadata

In [9]:
characters.groupby('gender').count()

Unnamed: 0_level_0,character_id,name,movie_id,movie_title,position
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
?,6020,6020,6020,6020,6020
F,45,45,45,45,45
M,150,150,150,150,150
f,921,921,921,921,921
m,1899,1899,1899,1899,1899


^ Originally missing gender for 6,020 characters

In [10]:
import nltk
nltk.download('names')

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\JoanWang\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\names.zip.


True

In [11]:
# Classify and reduce unknown ('?') gender using nltk names corpus 
characters.loc[np.logical_and(characters.name.str.title().isin(names.words('male.txt')), characters.gender == '?'), 'gender'] = 'm'
characters.loc[np.logical_and(characters.name.str.title().isin(names.words('female.txt')), characters.gender == '?'), 'gender'] = 'f'

In [12]:
characters.groupby('gender').count()

Unnamed: 0_level_0,character_id,name,movie_id,movie_title,position
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
?,3791,3791,3791,3791,3791
F,45,45,45,45,45
M,150,150,150,150,150
f,1597,1597,1597,1597,1597
m,3452,3452,3452,3452,3452


In [13]:
characters.loc[characters.gender == 'F', 'gender'] = "f"
characters.loc[characters.gender == 'M', 'gender'] = "m"

In [14]:
characters.groupby('gender').count()

Unnamed: 0_level_0,character_id,name,movie_id,movie_title,position
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
?,3791,3791,3791,3791,3791
f,1642,1642,1642,1642,1642
m,3602,3602,3602,3602,3602


## Read conversation metadata

In [15]:
file_convos = '../cornell_movie_dialogs_corpus/movie_conversations.txt'

header_convos = ['char_id_from', 'char_id_to', 'movie_id', 'line_id']

convos = process_file(file_convos, header_convos)

In [16]:
len(convos)

304713

In [None]:
convos.head()

## Read lines metadata

In [None]:
file_lines = '../cornell_movie_dialogs_corpus/movie_lines.txt'

header_lines = ['line_id', 'char_id', 'movie_id', 'char_name','text']

lines = process_file(file_lines, header_lines)

In [None]:
len(lines)

In [None]:
lines.head()

## Read title metadata

In [None]:
file_titles = '../cornell_movie_dialogs_corpus/movie_titles_metadata.txt'

header_titles = ['movie_id', 'movie_title', 'movie_year', 'imdb_rating', 'imdb_vote', 'genre']

titles = process_file(file_titles, header_titles)

In [None]:
titles.head()

# Joining tables

### Joining character metadata and conversations

In [None]:
char_cols = ['character_id', 'movie_id', 'gender']
conv_cols = ['char_id_from', 'char_id_to', 'line_id']

convo_gender = pd.merge(characters[char_cols], convos[conv_cols], left_on = 'character_id', right_on = 'char_id_from')
convo_gender.rename(columns={'gender':'gender_from'}, inplace=True)

convo_gender = pd.merge(characters[['character_id', 'gender']], convo_gender, left_on = 'character_id', right_on = 'char_id_to')
convo_gender.rename(columns={'gender':'gender_to'}, inplace=True)
convo_gender.drop(['character_id_x', 'character_id_y'], axis=1, inplace = True)

### Joining with words

In [None]:
line_cols = ['line_id', 'words']

convo_gender_words = pd.merge(convo_gender, lines[line_cols], on = 'line_id')

### Join with movie metadata

In [None]:
title_cols = ['movie_id', 'movie_year', 'genre']

#use the first genre only
main_genre = titles[titles.genre_idx == 0]

movies = pd.merge(convo_gender_words, main_genre[title_cols], on = 'movie_id')

In [None]:
movies.head(10)

### Pickle final table

In [3]:
import pickle

#pickle.dump(movies, open('../data/movies.p', 'wb'))
df = pickle.load(open("../data/movies.p", 'rb'))

In [None]:
#movies_small = movies[:1700000]

#pickle.dump(movies, open('data/movies.p', 'wb'))

### Split data into training and holdout
All work will be done on training set  
Analysis at the end, after modeling and scoring mechanism is built, will be performed on holdout set

In [26]:
df.shape

(304354, 9)

In [47]:
from sklearn.model_selection import train_test_split

unique_movies = pd.DataFrame(df.movie_id.unique(), columns = ['movie_id'])
movies_train, movies_holdout = train_test_split(unique_movies, test_size=0.33, random_state = 0)

In [48]:
movies_train.shape

(412, 1)

In [49]:
movies_holdout.shape

(204, 1)

In [52]:
movies_lines_train = movies_train.merge(df, how='left', on='movie_id')
movies_lines_train.shape

(202394, 9)

In [53]:
movies_lines_holdout = movies_holdout.merge(df, how='left', on='movie_id')
movies_lines_holdout.shape

(101960, 9)

In [54]:
pickle.dump(movies_train, open('../data/movies_train.p', 'wb'))
pickle.dump(movies_test, open('../data/movies_holdout.p', 'wb'))

pickle.dump(movies_lines_train, open('../data/movies_lines_train.p', 'wb'))
pickle.dump(movies_test, open('../data/movies_lines_holdout.p', 'wb'))