# Movie Dialogue: Base Task

## Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import math
nltk.download('punkt')
nltk.download('universal_tagset')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

## Importing Dataset

Source for how to import data:  
https://www.kaggle.com/shashankasubrahmanya/preprocessing-cornell-movie-dialogue-corpus

In [None]:
# Column names, in order, assigned to the fields read from the movie-lines file.
movie_lines_features = [
    "LineID",
    "Character",
    "Movie",
    "Name",
    "Line",
]


In [None]:
# Fields are split on the literal token "+++$+++". The pattern is written as a
# raw string so the backslashes reach the regex engine untouched: in a normal
# string "\+" and "\$" are invalid escape sequences and warn on recent Pythons.
# A regex separator requires the (slower) python parsing engine.
movie_lines = pd.read_csv(
    "data/movie_lines.txt",
    sep=r"\+\+\+\$\+\+\+",
    engine="python",
    encoding="ISO-8859-1",
    index_col=False,
    names=movie_lines_features,
)


## Data Cleaning

In [None]:
# Continue with a random 10% of the rows (frac=0.1) instead of the full table.
# NOTE(review): no random_state is passed, so the subset presumably differs
# from run to run -- confirm whether reproducibility matters here.
movie_lines = movie_lines.sample(frac=0.1)

In [None]:
movie_lines.head()

In [None]:
# Normalise the text columns used later on: trim surrounding whitespace from
# every "LineID", and turn every "Line" into a lower-case string.
movie_lines["LineID"] = movie_lines["LineID"].apply(str.strip)
movie_lines["Line"] = movie_lines["Line"].apply(lambda text: str(text).lower())

In [None]:
movie_lines.head()

In [None]:
# # Remove lines that do not contain he or she pronouns
# # Only include lines where ' he ' is in middle of sentence, surrounded by at least 1 space on either side
# movie_lines = movie_lines[movie_lines["Line"].str.contains('|'.join([' he ', ' she ']))]

In [None]:
movie_lines.head()

### Tokenization and Segmentation

In [None]:
# Tokenize and segment
# Drop rows without pronouns

In [None]:
movie_lines["Segmented_Line"] = movie_lines["Line"].apply(sent_tokenize)

In [None]:
df = pd.DataFrame(columns=movie_lines.columns)

In [None]:
df.head()

In [None]:
# Expand every dialogue line into one row per sentence: each sentence listed in
# "Segmented_Line" becomes its own row that carries the parent line's metadata.
# Lines whose sentence list is empty contribute no rows.
#
# The rows are gathered in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and calling it per row copied the
# whole frame each time (quadratic in the number of sentences).
sentence_rows = [
    {
        "LineID": row["LineID"],
        "Character": row["Character"],
        "Movie": row["Movie"],
        "Name": row["Name"],
        "Line": row["Line"],
        "Segmented_Line": sentence,
    }
    for _, row in movie_lines.iterrows()
    for sentence in row["Segmented_Line"]
]
df = pd.DataFrame(sentence_rows, columns=movie_lines.columns)

In [None]:
df.head()

In [None]:
movie_lines = df

In [None]:
movie_lines.head()

In [None]:
movie_lines["Tokenized_Line"] = movie_lines["Segmented_Line"].apply(word_tokenize)


In [None]:
movie_lines.head()

### Extracting Pronouns

In [None]:
def get_pronoun(tokenized_line):
    """Return which tracked pronoun occurs in ``tokenized_line``.

    The candidates are checked in priority order: "she" first, then "he".
    The first one contained in ``tokenized_line`` is returned; if neither is
    present the string "none" is returned.
    """
    for candidate in ("she", "he"):
        if candidate in tokenized_line:
            return candidate
    return "none"

In [None]:
movie_lines["Pronoun"] = movie_lines["Tokenized_Line"].apply(get_pronoun)

In [None]:
movie_lines.head()

In [None]:
# Keep only the sentences in which a "she"/"he" token was found. The explicit
# copy makes the result an independent frame, so the columns assigned to it
# afterwards do not trigger pandas' SettingWithCopyWarning.
movie_lines = movie_lines.loc[movie_lines["Pronoun"] != "none"].copy()

In [None]:
movie_lines.head()

In [None]:
movie_lines.describe()

In [None]:
movie_lines.shape

In [None]:
# For use in other notebooks
%store movie_lines

### Extracting Verb

In [None]:
def get_verb(tokenized_line):
    """Return the first verb found at or after the line's pronoun.

    The tokens from the pronoun chosen by ``get_pronoun`` through the end of
    the line are POS-tagged with the universal tagset, and the first token
    tagged ``VERB`` is returned.

    Returns the string 'none' when the line contains no "she"/"he" token, or
    when no token in the tagged span is a verb.
    """
    pronoun = get_pronoun(tokenized_line)
    if pronoun == "none":
        # Nothing to anchor on. Without this guard, index() below would raise
        # ValueError -- or silently anchor on a literal "none" token.
        return 'none'

    pronoun_index = tokenized_line.index(pronoun)
    word_pos_pairs = nltk.pos_tag(tokenized_line[pronoun_index:], tagset='universal')

    # TODO: consider second verb after pronoun
    # ex: he's biking, 's and biking are both VERB
    for word, pos in word_pos_pairs:
        if pos == 'VERB':
            return word
    return 'none'

In [None]:
movie_lines["Verb"] = movie_lines["Tokenized_Line"].apply(get_verb)

In [None]:
movie_lines.head()

In [None]:
movie_lines.describe()

### Lemmatization

In [None]:
def get_lemmatized_word(word):
    """Lemmatize ``word`` as a verb using the module-level ``lemmatizer``."""
    verb_lemma = lemmatizer.lemmatize(word, pos="v")
    return verb_lemma

In [None]:
# LEMMATIZATION

lemmatizer = WordNetLemmatizer()

movie_lines["Verb"] = movie_lines["Verb"].apply(get_lemmatized_word)

## Counts

In [None]:
counts = Counter(list(movie_lines["Verb"].values))

In [None]:
hes = movie_lines.loc[movie_lines["Pronoun"] == "he"]
num_hes = hes.shape[0]

In [None]:
shes = movie_lines.loc[movie_lines["Pronoun"] == "she"]
num_shes = shes.shape[0]

In [None]:
he_list = list(hes["Verb"].values)

In [None]:
she_list = list(shes["Verb"].values)

In [None]:
he_counts = Counter(he_list)

In [None]:
she_counts = Counter(she_list)

In [None]:
# she_counts

In [None]:
# she_most_common["does"]

In [None]:
# he_most_common - she_most_common

In [None]:
# he_most_common.most_common(5)

In [None]:
# Verbs that occur with both pronouns. The intersection is computed from each
# side, and the final expression displays whether the two results are equal.
intersection_he = set(he_list) & set(she_list)
intersection_she = set(she_list) & set(he_list)
intersection = set(he_list) & set(she_list)
intersection_he == intersection_she

In [None]:
he_only = set(he_list) - set(she_list)

In [None]:
she_only = set(she_list) - set(he_list)

In [None]:
he_counts.get("should", 0)

In [None]:
Counter()

In [None]:
he_only_counts = Counter({k: he_counts.get(k) for k in he_only})
she_only_counts = Counter({k: she_counts.get(k) for k in she_only})

In [None]:
he_only_counts.most_common(3)

In [None]:
he_counts["could"]

#### Laplace Smoothing

In [None]:
# Laplace (add-one) smoothing: every verb in the overall counter gets one extra
# occurrence in both per-pronoun counters, so neither counter holds a zero for
# a verb that was seen with the other pronoun.
for verb in counts:
    he_counts[verb] += 1
    she_counts[verb] += 1

#### Relative Frequency

In [None]:
verbs = list(counts.keys())

he_freqs = []
she_freqs = []
freqs = []

# Normalise each counter by its own total. After add-one smoothing the counts
# sum to N + V (sentences + vocabulary size), not to num_hes / num_shes, so
# dividing by the unsmoothed sentence totals would give frequencies that do
# not sum to 1 and would skew the he/she ratio.
he_total = sum(he_counts.values())
she_total = sum(she_counts.values())

for verb in verbs:
    he_rel_freq = he_counts[verb] / he_total
    she_rel_freq = she_counts[verb] / she_total

    he_over_she = he_rel_freq / she_rel_freq
    she_over_he = she_rel_freq / he_rel_freq
    he_freqs.append(he_over_she)
    she_freqs.append(she_over_he)

    # Signed ratio centred on 0: verbs relatively more frequent after "he" get
    # 1 - he/she (negative); all others get she/he - 1 (zero or positive).
    if he_rel_freq > she_rel_freq:
        freqs.append(1 - he_over_she)
    else:
        freqs.append(she_over_he - 1)

verb_freqs = dict(zip(verbs, freqs))

In [None]:
# Map each verb to its he/she ratio and to its she/he ratio, respectively.
he_verb_freqs = dict(zip(verbs, he_freqs))
she_verb_freqs = dict(zip(verbs, she_freqs))

In [None]:
he_freqs_counter = Counter(he_verb_freqs)
she_freqs_counter = Counter(she_verb_freqs)

In [None]:
num_verbs_plot = 15

In [None]:
he_verbs = [verb for (verb, _) in he_freqs_counter.most_common(num_verbs_plot)]
she_verbs = [verb for (verb, _) in she_freqs_counter.most_common(num_verbs_plot)]
he_freqs = [verb_freqs[verb] for verb in he_verbs]
she_freqs = [verb_freqs[verb] for verb in she_verbs]

In [None]:
verbs = he_verbs + she_verbs
freqs = he_freqs + she_freqs

#### Plotting

In [None]:
# Horizontal bar chart of the signed ratio in `freqs`, one bar per entry of
# `verbs`.
# Reset matplotlib's rc settings to their defaults before drawing.
plt.rcdefaults()
fig, ax = plt.subplots()

# Bar i is drawn at y = i with length freqs[i] and labelled verbs[i].
y_pos = np.arange(len(verbs))
ax.barh(y_pos, freqs, align="center")
ax.set_yticks(y_pos, labels=verbs)
ax.invert_yaxis()  # labels read top-to-bottom
# Per the axis label, negative bars lean towards "he" and positive towards "she".
ax.set_xlabel('Relative Frequency Ratio (He <0; She >0) ')
ax.set_title('Relative Frequency Ratio of Verbs Following Certain Pronouns {She, He}')

plt.show()