# Movie Dialogue: Base Task

## Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import math
nltk.download('punkt')
nltk.download('universal_tagset')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nicoespinosadice/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/nicoespinosadice/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nicoespinosadice/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nicoespinosadice/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Importing Dataset

Source for how to import data:  
https://www.kaggle.com/shashankasubrahmanya/preprocessing-cornell-movie-dialogue-corpus

In [2]:
# Column names handed to read_csv (names=...) when loading data/movie_lines.txt.
movie_lines_features = ["LineID", "Character", "Movie", "Name", "Line"]


In [3]:
# Fields in the corpus are delimited by the literal token "+++$+++". The
# separator is a regular expression, so every character is escaped; a raw
# string keeps those backslashes without relying on invalid string escapes.
movie_lines = pd.read_csv(
    "data/movie_lines.txt",
    sep=r"\+\+\+\$\+\+\+",
    engine="python",
    encoding="ISO-8859-1",
    index_col=False,
    names=movie_lines_features,
)


In [4]:
# Column names handed to read_csv (names=...) when loading data/movie_titles_metadata.txt.
movie_titles_features = ["ID", "Title", "Year", "Rating", "No. of Votes", "Genre"]


In [5]:
# Fields in the corpus are delimited by the literal token "+++$+++". The
# separator is a regular expression, so every character is escaped; a raw
# string keeps those backslashes without relying on invalid string escapes.
movie_titles = pd.read_csv(
    "data/movie_titles_metadata.txt",
    sep=r"\+\+\+\$\+\+\+",
    engine="python",
    encoding="ISO-8859-1",
    index_col=False,
    names=movie_titles_features,
)


In [6]:
movie_titles.head()

Unnamed: 0,ID,Title,Year,Rating,No. of Votes,Genre
0,m0,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"
1,m1,1492: conquest of paradise,1992,6.2,10421,"['adventure', 'biography', 'drama', 'history']"
2,m2,15 minutes,2001,6.1,25854,"['action', 'crime', 'drama', 'thriller']"
3,m3,2001: a space odyssey,1968,8.4,163227,"['adventure', 'mystery', 'sci-fi']"
4,m4,48 hrs.,1982,6.9,22289,"['action', 'comedy', 'crime', 'drama', 'thril..."


## Data Cleaning

In [7]:
# Keep a random half of the dialogue lines. No random_state is passed, so a
# different subset is drawn every time this cell runs.
movie_lines = movie_lines.sample(frac=0.5)

In [8]:
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line
162735,L149524,u4689,m311,ALBRECHT,"You're bleeding, man. You can't make it."
134272,L68618,u3904,m258,MARY BOONE,"You have a bunch of them, don't you? Albert M..."
17616,L97469,u574,m36,JEFFREY,Here's to Ben.
95963,L516162,u2837,m186,MECKLEN,"Buddy, they revoked the deal, they pulled it..."
181843,L212421,u5262,m348,JACK,"You see, when you travel, everything is small..."


In [9]:
# Trim surrounding whitespace from the "LineID" and "Movie" identifiers, and
# turn every "Line" value into a lowercase string.
movie_lines["LineID"] = movie_lines["LineID"].map(str.strip)
movie_lines["Movie"] = movie_lines["Movie"].map(str.strip)
movie_lines["Line"] = movie_lines["Line"].map(lambda text: str(text).lower())

In [10]:
def parse_year(year_str):
    """Return the four-digit year at the start of ``year_str`` as an int.

    Surrounding whitespace is removed first, and only the first four
    characters of the stripped text are parsed. This means a padded value
    such as ``" 1999 "`` or a suffixed one such as ``"1989/I"`` both yield
    the year.

    Args:
        year_str: Raw year text.

    Returns:
        The year as an ``int``.

    Raises:
        ValueError: If the first four characters are not an integer.
    """
    # Slice the *stripped* text: slicing the raw string would count leading
    # spaces against the year digits.
    stripped = str.strip(year_str)
    return int(stripped[:4])

In [11]:
# Trim whitespace around the movie IDs and convert the raw year text to ints.
movie_titles["ID"] = movie_titles["ID"].map(str.strip)
movie_titles["Year"] = movie_titles["Year"].map(parse_year)

In [12]:
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line
162735,L149524,u4689,m311,ALBRECHT,"you're bleeding, man. you can't make it."
134272,L68618,u3904,m258,MARY BOONE,"you have a bunch of them, don't you? albert m..."
17616,L97469,u574,m36,JEFFREY,here's to ben.
95963,L516162,u2837,m186,MECKLEN,"buddy, they revoked the deal, they pulled it..."
181843,L212421,u5262,m348,JACK,"you see, when you travel, everything is small..."


In [13]:
# # Remove lines that do not contain he or she pronouns
# # Only include lines where ' he ' is in middle of sentence, surrounded by at least 1 space on either side
# movie_lines = movie_lines[movie_lines["Line"].str.contains('|'.join([' he ', ' she ']))]

In [14]:
movie_lines.head()

Unnamed: 0,LineID,Character,Movie,Name,Line
162735,L149524,u4689,m311,ALBRECHT,"you're bleeding, man. you can't make it."
134272,L68618,u3904,m258,MARY BOONE,"you have a bunch of them, don't you? albert m..."
17616,L97469,u574,m36,JEFFREY,here's to ben.
95963,L516162,u2837,m186,MECKLEN,"buddy, they revoked the deal, they pulled it..."
181843,L212421,u5262,m348,JACK,"you see, when you travel, everything is small..."


### Tokenization and Segmentation

In [15]:
# Tokenize and segment
# Drop rows without pronouns

In [16]:
# Split each (lowercased) line into a list of sentences with NLTK's sent_tokenize.
movie_lines["Segmented_Line"] = movie_lines["Line"].apply(sent_tokenize)

In [17]:
# Empty frame with the same columns as movie_lines; it is the container for
# the one-row-per-sentence version of the data.
df = pd.DataFrame(columns=movie_lines.columns)

In [18]:
df.head()

Unnamed: 0,LineID,Character,Movie,Name,Line,Segmented_Line


In [None]:
# Expand every dialogue line into one row per sentence: the line's metadata is
# repeated and "Segmented_Line" holds a single sentence instead of the list.
# All rows are collected first and the frame is built once; appending to a
# DataFrame row by row copies the whole frame on every iteration, and
# DataFrame.append no longer exists in pandas 2.x.
sentence_rows = [
    {
        "LineID": row["LineID"],
        "Character": row["Character"],
        "Movie": row["Movie"],
        "Name": row["Name"],
        "Line": row["Line"],
        "Segmented_Line": sentence,
    }
    for _, row in movie_lines.iterrows()
    for sentence in row["Segmented_Line"]
]
df = pd.DataFrame(sentence_rows, columns=movie_lines.columns)

In [None]:
df.head()

In [None]:
# Continue the analysis on the per-sentence frame.
movie_lines = df

In [None]:
movie_lines.head()

In [None]:
# Split each sentence into word tokens with NLTK's word_tokenize.
movie_lines["Tokenized_Line"] = movie_lines["Segmented_Line"].apply(word_tokenize)


In [None]:
movie_lines.head()

### Extracting Pronouns

In [None]:
def get_pronoun(tokenized_line):
    """Return the pronoun present in ``tokenized_line``.

    "she" is checked before "he", so a sentence containing both is labelled
    "she". The string "none" is returned when neither token is present.
    """
    for candidate in ("she", "he"):
        if candidate in tokenized_line:
            return candidate
    return "none"

In [None]:
# Label each tokenized sentence with "she", "he" or "none" via get_pronoun.
movie_lines["Pronoun"] = movie_lines["Tokenized_Line"].apply(get_pronoun)

In [None]:
movie_lines.head()

In [None]:
# Discard the sentences in which neither pronoun was found.
movie_lines = movie_lines.loc[movie_lines["Pronoun"] != "none"]

In [None]:
movie_lines.head()

In [None]:
movie_lines.describe()

In [None]:
movie_lines.shape

### Extracting Verb

In [None]:
def get_verb(tokenized_line):
    """Return the first verb at or after the sentence's pronoun.

    The pronoun is chosen by ``get_pronoun``. Only the tokens from its first
    occurrence onward are POS-tagged, using the universal tagset, and the
    first token tagged ``VERB`` is returned.

    Args:
        tokenized_line: List of word tokens for one sentence.

    Returns:
        The verb token, or the string ``'none'`` when the sentence has no
        "he"/"she" pronoun or no verb follows it.
    """
    pronoun = get_pronoun(tokenized_line)
    if pronoun == "none":
        # Without this guard, index() would raise ValueError, or would match
        # a literal "none" token in the sentence.
        return 'none'

    tokens_from_pronoun = tokenized_line[tokenized_line.index(pronoun):]
    word_pos_pairs = nltk.pos_tag(tokens_from_pronoun, tagset='universal')

    # TODO: consider second verb after pronoun
    # ex: he's biking, 's and biking are both VERB
    for word, pos in word_pos_pairs:
        if pos == 'VERB':
            return word
    return 'none'

In [None]:
# Record, for each sentence, the verb returned by get_verb ('none' when no verb is tagged).
movie_lines["Verb"] = movie_lines["Tokenized_Line"].apply(get_verb)

In [None]:
movie_lines.head()

In [None]:
movie_lines.describe()

### Lemmatization

In [None]:
def get_lemmatized_word(word):
    """Return the lemma of ``word`` treated as a verb.

    Relies on the module-level ``lemmatizer`` object, which must exist
    before this function is called.
    """
    verb_lemma = lemmatizer.lemmatize(word, pos="v")
    return verb_lemma

In [None]:
# LEMMATIZATION
# Create the shared WordNet lemmatizer that get_lemmatized_word uses, then
# replace every extracted verb with its lemma.

lemmatizer = WordNetLemmatizer()

movie_lines["Verb"] = movie_lines["Verb"].map(get_lemmatized_word)

## Extracting Year

In [None]:
def get_movie_year(movie_id):
    """Return the "Year" of the first ``movie_titles`` row whose "ID" equals ``movie_id``.

    Raises:
        IndexError: If no row has that ID.
    """
    matching_titles = movie_titles[movie_titles["ID"] == movie_id]
    first_match = matching_titles.iloc[0]
    return first_match["Year"]

In [None]:
# Look up the year of each sentence's movie via get_movie_year.
movie_lines["Year"] = movie_lines["Movie"].apply(get_movie_year)

In [None]:
movie_lines.head()

In [None]:
movie_lines["Year"].describe()

## Extracting Genre

In [None]:
def get_genre(movie_id):
    """Return the list of genre names for ``movie_id``.

    The "Genre" cell of the first matching ``movie_titles`` row is text such
    as ``"['comedy', 'romance']"``. Quotes and the surrounding brackets are
    removed and the remainder is split on ``", "``.

    Returns:
        A list of genre strings. It is empty when the cell holds no genres,
        e.g. ``"[]"``.

    Raises:
        IndexError: If no row has that ID.
    """
    genre_str = movie_titles.loc[movie_titles["ID"] == movie_id].iloc[0]["Genre"]
    inner = genre_str.strip().replace("'", "").strip('[]')
    # "".split(", ") yields [""], so drop empty names. Otherwise an empty
    # genre list would surface as a genre called "".
    return [name for name in inner.split(", ") if name]

In [None]:
# Attach the list of genres of each sentence's movie via get_genre.
movie_lines["Genre"] = movie_lines["Movie"].apply(get_genre)

In [None]:
movie_lines["Genre"].describe() 

# DECADE ANALYSIS

## Counts

In [None]:
def get_df_decade(decade):
    """Return the rows of ``movie_lines`` whose "Year" falls in the given decade.

    Args:
        decade: First year of the decade, e.g. ``1990``.

    Returns:
        The rows with ``decade <= Year < decade + 10``.
    """
    years = movie_lines["Year"]
    # The lower bound is inclusive so the decade's first year (e.g. 1990)
    # is not dropped from every decade.
    in_decade = (years >= decade) & (years < decade + 10)
    return movie_lines.loc[in_decade]

In [None]:
def get_df_genre(genre):
    """Return the rows of ``movie_lines`` whose "Genre" list contains ``genre``.

    Args:
        genre: Genre name to look for in each row's genre list.

    Returns:
        A DataFrame with the matching rows, the same columns as
        ``movie_lines`` and a fresh 0..n-1 index.
    """
    # Select with one boolean mask instead of appending row by row:
    # DataFrame.append copies the frame on every call and was removed in
    # pandas 2.0. The explicit bool dtype keeps an empty mask a valid mask.
    has_genre = pd.Series(
        [genre in genre_list for genre_list in movie_lines["Genre"]],
        index=movie_lines.index,
        dtype=bool,
    )
    return movie_lines.loc[has_genre].reset_index(drop=True)

In [None]:
def get_counts(df):
    """Tally verbs overall and per pronoun.

    Args:
        df: DataFrame with "Verb" and "Pronoun" columns.

    Returns:
        A tuple ``(counts, he_counts, she_counts, num_hes, num_shes)``: a
        Counter of all verbs, Counters of the verbs on "he" rows and on
        "she" rows, and the number of "he" rows and "she" rows.
    """
    def verbs_for(pronoun):
        # Verbs of the rows labelled with the given pronoun, in row order.
        return df.loc[df["Pronoun"] == pronoun, "Verb"].tolist()

    he_list = verbs_for("he")
    she_list = verbs_for("she")
    return (
        Counter(df["Verb"].tolist()),
        Counter(he_list),
        Counter(she_list),
        len(he_list),
        len(she_list),
    )

In [None]:
# counts, he_counts, she_counts, num_hes, num_shes = get_counts(movie_lines)

#### Laplace Smoothing

In [None]:
def laplace(counts, he_counts, she_counts):
    """Add one to the "he" and "she" count of every verb in ``counts``.

    Both counters are modified in place and the same objects are returned
    as ``(he_counts, she_counts)``.
    """
    vocabulary = tuple(counts)
    for tally in (he_counts, she_counts):
        for verb in vocabulary:
            tally[verb] += 1
    return he_counts, she_counts

In [None]:
# he_counts, she_counts = laplace(counts, he_counts, she_counts)

#### Relative Frequency

In [None]:
def get_freqs(counts, he_counts, she_counts, num_hes, num_shes, num_verbs_plot):
    """Pick the most "he"-leaning and most "she"-leaning verbs with a signed score.

    For every verb in ``counts`` the relative frequencies
    ``he_counts[verb] / num_hes`` and ``she_counts[verb] / num_shes`` are
    compared. The signed score is ``1 - he/she`` when the "he" relative
    frequency is strictly larger, otherwise ``she/he - 1``.

    Args:
        counts: Mapping whose keys are the verbs to consider.
        he_counts: Verb counts for "he" sentences.
        she_counts: Verb counts for "she" sentences.
        num_hes: Denominator for the "he" relative frequencies.
        num_shes: Denominator for the "she" relative frequencies.
        num_verbs_plot: How many verbs to keep on each side.

    Returns:
        ``(he_verbs, she_verbs, he_freqs, she_freqs)``: the verbs with the
        largest he/she ratio, the verbs with the largest she/he ratio, and
        the signed scores of those verbs in the same order.

    Raises:
        ZeroDivisionError: If a denominator or a relative frequency is zero.
    """
    signed_score = {}
    he_over_she = Counter()
    she_over_he = Counter()

    for verb in counts:
        he_share = he_counts[verb] / num_hes
        she_share = she_counts[verb] / num_shes
        he_over_she[verb] = he_share / she_share
        she_over_he[verb] = she_share / he_share
        if he_share > she_share:
            # "he"-leaning verbs get a negative score.
            signed_score[verb] = -(he_share / she_share) + 1
        else:
            # "she"-leaning (or balanced) verbs get a non-negative score.
            signed_score[verb] = (she_share / he_share) - 1

    he_verbs = [verb for verb, _ in he_over_she.most_common(num_verbs_plot)]
    she_verbs = [verb for verb, _ in she_over_he.most_common(num_verbs_plot)]
    he_freqs = [signed_score[verb] for verb in he_verbs]
    she_freqs = [signed_score[verb] for verb in she_verbs]

    return he_verbs, she_verbs, he_freqs, she_freqs

In [None]:
# he_verbs, she_verbs, he_freqs, she_freqs = get_freqs(counts, he_counts, she_counts, num_hes, num_shes)

#### Plotting

In [None]:
def plot(he_verbs, she_verbs, he_freqs, she_freqs, num_verbs_plot, flex):
    """Draw a horizontal bar chart of signed verb ratios and save it as a PNG.

    The "he" verbs are listed first, followed by the "she" verbs, reading
    top to bottom. The figure is written to ``"<flex>.png"`` in the current
    working directory.

    Args:
        he_verbs: Verbs for the "he" side.
        she_verbs: Verbs for the "she" side.
        he_freqs: Bar values for ``he_verbs``.
        she_freqs: Bar values for ``she_verbs``.
        num_verbs_plot: Unused; kept so existing calls keep working.
        flex: The decade (an integer) or genre (anything else) being
            plotted. It is used in the title and the output file name.
    """
    verbs = he_verbs + she_verbs
    freqs = he_freqs + she_freqs

    plt.rcdefaults()
    fig, ax = plt.subplots()

    y_pos = np.arange(len(verbs))
    ax.barh(y_pos, freqs, align="center")
    ax.set_yticks(y_pos, labels=verbs)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('Relative Frequency Ratio (He <0; She >0) ')

    # This function is called with both decades (ints) and genre names, so
    # name the grouping accordingly instead of always saying "Decade".
    grouping = 'Decade' if isinstance(flex, (int, np.integer)) else 'Genre'
    graph_title = ('Rel. Freq. Ratio of Verbs Following Pronouns {She, He} for '
                   + grouping + ' ' + str(flex))
    ax.set_title(graph_title)

    title = str(flex) + ".png"
    plt.savefig(title)

## Decades

In [None]:
movie_lines["Year"].min(), movie_lines["Year"].max()

In [None]:
# First year of every decade to analyse: 1930, 1940, ..., 2000.
decades = list(range(1930, 2010, 10))

In [None]:
# Produce one chart per decade: count verbs per pronoun, smooth the counts,
# rank the verbs by their he/she ratio and save the plot as "<decade>.png".
num_verbs_plot = 15

for decade in decades:
    df = get_df_decade(decade)

    counts, he_counts, she_counts, num_hes, num_shes = get_counts(df)
    # Same guard as the genre loop: a decade with no "he" (or no "she")
    # sentences would otherwise divide by zero inside get_freqs.
    if num_hes == 0:
        num_hes += 1
    if num_shes == 0:
        num_shes += 1

    he_counts, she_counts = laplace(counts, he_counts, she_counts)

    he_verbs, she_verbs, he_freqs, she_freqs = get_freqs(counts, he_counts, she_counts, num_hes, num_shes, num_verbs_plot)

    plot(he_verbs, she_verbs, he_freqs, she_freqs, num_verbs_plot, decade)

## Genre

In [None]:
# Collect the per-sentence genre lists (one list per row of movie_lines).
genres = list(movie_lines["Genre"].values)

In [None]:
# Flatten the per-row genre lists into the distinct genre names.
genres = list({genre for genre_list in genres for genre in genre_list})

In [None]:
# Produce one chart per genre: count verbs per pronoun, smooth the counts,
# rank the verbs by their he/she ratio and save the plot as "<genre>.png".
for genre in genres:
    num_verbs_plot = 15

    df = get_df_genre(genre)
    counts, he_counts, she_counts, num_hes, num_shes = get_counts(df)

    # Use 1 in place of a zero count so get_freqs never divides by zero.
    num_hes = max(num_hes, 1)
    num_shes = max(num_shes, 1)

    he_counts, she_counts = laplace(counts, he_counts, she_counts)
    he_verbs, she_verbs, he_freqs, she_freqs = get_freqs(
        counts, he_counts, she_counts, num_hes, num_shes, num_verbs_plot
    )
    plot(he_verbs, she_verbs, he_freqs, she_freqs, num_verbs_plot, genre)