## Author: OMKAR ANANT BARE

In [None]:
import string
import re
import numpy as np
from numpy.linalg import norm
import pandas as pd
from collections import Counter, OrderedDict

import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


%matplotlib inline
pd.options.display.max_colwidth=500

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/omkarbare/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/omkarbare/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/omkarbare/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/omkarbare/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# Read the training and test files. Both are parsed as tab-separated text and
# blank lines are skipped by the parser.
train_path = 'training.csv'
test_path = 'test.csv'

all_train_data = pd.read_csv(train_path, sep="\t", skip_blank_lines=True)
test_data = pd.read_csv(test_path, sep="\t", skip_blank_lines=True)

# Render the training frame as this cell's output
display(all_train_data)

Unnamed: 0,Episode,Scene,Scene_info,Character_name,Line,Gender
0,1350,1,DESERTED CAR PARK EXT NIGHT,SHIRLEY,"Look at ya, not a mark on ya. And you think you're an unlucky man.",FEMALE
1,1350,1,DESERTED CAR PARK EXT NIGHT,OTHER,Shirl...,MALE
2,1350,2,R&R INT NIGHT,JACK,Oi. Where have you been? Huh? What were the texts about?,MALE
3,1350,2,R&R INT NIGHT,RONNIE,Nothing. Nothing. I'll be with you in two minutes yeah?,FEMALE
4,1350,2,R&R INT NIGHT,JACK,"Well I've got mates here I wanted to have a chat with them, instead I've been serving behind the bar.",MALE
...,...,...,...,...,...,...
15314,1399,55,SQUARE EXT DAY LIGHT,OTHER,"Dad? Okay ... alright, just one drink alright. But that's all. It doesn't mean anything. It's just a drink.",MALE
15315,1399,55,SQUARE EXT DAY LIGHT,MAX,Thanks Bradley. Thanks mate... It means the world to me...,MALE
15316,1399,55,SQUARE EXT DAY LIGHT,OTHER,You alright...,MALE
15317,1399,55,SQUARE EXT DAY LIGHT,MAX,"Yeah, yeah, yeah. I'm fine.",MALE


In [None]:
# Split the labelled data into training and held-out validation rows (9:1).
# The split is made over "episode-scene" keys rather than over single rows, so
# every line of a given scene lands on the same side of the split.
from random import shuffle, seed

seed(0) # set a seed for reproducibility so same split is used each time

epsiode_scene_column = all_train_data.Episode.astype(str) + "-" + all_train_data.Scene.astype(str)
all_train_data['episode_scene'] = epsiode_scene_column
episode_scenes = sorted(set(epsiode_scene_column.values)) # need to sort to ensure same initial order

shuffle(episode_scenes)

print(len(episode_scenes))
episode_split = int(0.9*len(episode_scenes))
training_ep_scenes = episode_scenes[:episode_split]
test_ep_scenes = episode_scenes[episode_split:]
print(len(training_ep_scenes), len(test_ep_scenes))

# Membership is tested once per data row below; a set makes each test O(1)
# instead of a linear scan through the list of training scene keys.
_training_ep_scene_set = frozenset(training_ep_scenes)

def train_or_heldout_eps(val):
    """Label an episode-scene key as belonging to the training or held-out split.

    ::val:: an "episode-scene" key as stored in the 'episode_scene' column
    Returns "training" if the key is one of training_ep_scenes, else "heldout".
    """
    if val in _training_ep_scene_set:
        return "training"
    return "heldout"

all_train_data['train_heldout'] = all_train_data['episode_scene'].apply(train_or_heldout_eps)

1394
1254 140


In [None]:
# Select the rows of each split using the 'train_heldout' label column
is_training_row = all_train_data['train_heldout'] == 'training'
is_heldout_row = all_train_data['train_heldout'] == 'heldout'
train_data = all_train_data[is_training_row]
val_data = all_train_data[is_heldout_row]

# Report (rows, columns) for the full frame and for each split
print('Raw Data: ', all_train_data.shape)
print('Train set: ', train_data.shape)
print('Validation set: ', val_data.shape)

Raw Data:  (15319, 8)
Train set:  (13638, 8)
Validation set:  (1681, 8)


## adding scene info

In [None]:
# Create one document per character
def create_character_document_from_dataframe(df, max_line_count):
    """Build one text document and one scene-info dict per character.

    Rows are read in order from the columns ``Line``, ``Character_name`` and
    ``Scene_info`` of ``df``. For each character, at most ``max_line_count``
    lines are used; later rows for that character are skipped entirely, so
    their scene info is not recorded either.

    ::df:: an object exposing ``Line``, ``Character_name`` and ``Scene_info``
           as equal-length iterables (e.g. a pandas DataFrame)
    ::max_line_count:: the maximum number of lines to be added per character

    Returns a 2-tuple ``(character_docs, scene_information)``:
    - ``character_docs``: dict mapping character name to their lines joined
      into a single string, each line followed by the marker " _EOL_ "
    - ``scene_information``: dict mapping character name to a dict of
      ``{scene_info: True}`` for every scene info seen on a used line
    """
    character_docs = {}
    character_line_count = {}
    scene_information = {}

    for line, name, s_info in zip(df.Line, df.Character_name, df.Scene_info):
        # First time this character is seen: start empty accumulators
        if name not in character_docs:
            character_docs[name] = ""
            character_line_count[name] = 0
            scene_information[name] = {}

        # Character already has its full quota of lines: ignore this row
        if character_line_count[name] == max_line_count:
            continue

        # str() so that non-string cell values can still be concatenated
        character_docs[name] += str(line) + " _EOL_ "  # adding an end-of-line token
        character_line_count[name] += 1

        # Record the scene this used line belongs to
        scene_information[name][s_info] = True

    return character_docs, scene_information

In [None]:
# Build one document per character from at most 360 training lines each, then
# report the number of whitespace-separated words in every document (the
# _EOL_ markers are part of the documents and are counted too).
train_character_docs, train_scene_info = create_character_document_from_dataframe(train_data, max_line_count=360)

print('Num. Characters: ', len(train_character_docs), "\n")
total_words = 0
for name, document in train_character_docs.items():
    word_count = len(document.split())
    print(name, 'Number of Words: ', word_count)
    total_words += word_count

print("\nTotal words:", total_words)

Num. Characters:  16 

SHIRLEY Number of Words:  3848
OTHER Number of Words:  3244
JACK Number of Words:  4435
RONNIE Number of Words:  3442
TANYA Number of Words:  3786
SEAN Number of Words:  3637
ROXY Number of Words:  3838
HEATHER Number of Words:  4098
MAX Number of Words:  4363
IAN Number of Words:  4332
JANE Number of Words:  3648
STACEY Number of Words:  3913
PHIL Number of Words:  3635
MINTY Number of Words:  4005
CHRISTIAN Number of Words:  3738
CLARE Number of Words:  4344

Total words: 62306


In [None]:
# Switches read by pre_process(); this is the combination selected as best.
preprocessing_switches = dict(
    convert_numbers=False,       # replace digit runs with the string 'NUMBER'
    separate_punctuation=True,   # put a space between words and adjacent punctuation
    lowercase=False,             # lower-case every token
    remove_punctuation=True,     # keep only purely alphabetic tokens
    apply_lemmatization=True,    # lemmatize tokens with WordNetLemmatizer
    remove_stopwords=True,       # drop NLTK English stop words
)

In [None]:
def pre_process(character_text):
    '''
    Pre-process all the concatenated lines of a character and return a list
    of tokens. Each step is enabled by the matching boolean entry of the
    module-level ``preprocessing_switches`` dict:

    - convert_numbers: convert any run of digits to the string 'NUMBER'
    - separate_punctuation: insert a space between a word character and
      punctuation directly before or after it
    - lowercase: lower-case every token
    - remove_punctuation: keep only tokens for which str.isalpha() is true
      (this also drops the "_EOL_" markers, which contain underscores)
    - apply_lemmatization: lemmatize each token with WordNetLemmatizer
    - remove_stopwords: remove tokens found in NLTK's English stop word list

    ::character_text:: a string with all of one character's lines
    '''

    # convert any numbers to string 'NUMBER' (raw string: '\d' is not a valid
    # escape sequence in a normal string literal)
    if preprocessing_switches['convert_numbers']:
        character_text = re.sub(r'\d+', 'NUMBER', character_text)

    if preprocessing_switches['separate_punctuation']:
        # separates punctuation at end of strings
        spaced_text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", character_text)
        # separates punctuation at beginning of strings
        spaced_text = re.sub(r"([.,;:!?'\"“\(\)])(\w)", r"\1 \2", spaced_text)
    else:
        spaced_text = character_text

    # simple tokenization on white space; str.split() never produces the empty
    # leading/trailing tokens that re.split(r"\s+", ...) would
    tokens = spaced_text.split()

    # normalisation - lower casing
    if preprocessing_switches['lowercase']:
        tokens = [token.lower() for token in tokens]

    # remove punctuation: keep tokens made of alphabetic characters only
    if preprocessing_switches['remove_punctuation']:
        tokens = [token for token in tokens if token.isalpha()]

    # lemmatize every token
    if preprocessing_switches['apply_lemmatization']:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # remove stop words; the comparison is case-sensitive, exactly as written
    # NOTE(review): with 'lowercase' off, capitalised forms are compared as-is
    # against the stop word list - confirm this is intended.
    if preprocessing_switches['remove_stopwords']:
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]

    return tokens

In [None]:
# Pair every character (in alphabetical order) with their pre-processed
# token list, and keep the character names as the label list.
training_corpus = [
    (character, pre_process(train_character_docs[character]))
    for character in sorted(train_character_docs)
]
train_labels = [label for label, _tokens in training_corpus]

In [None]:
def pos_tagging(tokens):
    """Tag a token list with nltk.pos_tag and fuse each pair into one string.

    ::tokens:: a list of token strings
    Returns a list of "word-TAG" strings, one per input token, in order.
    """
    return ["-".join((word, tag)) for word, tag in nltk.pos_tag(tokens)]

In [None]:
# best setting
# _WEIGHT_ selects the weighting branch taken in to_feature_vector_dictionary,
# which compares it against "binary", "counts" and "weighted".
_WEIGHT_ = "counts"  
# _N_ is the highest n-gram order: to_feature_vector_dictionary loops n over
# range(1, _N_+1).
_N_ = 2

##  `to_feature_vector_dictionary` function edited for question 4

In [None]:
def to_feature_vector_dictionary(character_doc, scene_info, extra_features):
    '''
    Turn one character's token list into a feature dictionary of POS-tagged
    n-grams plus scene-information features.

    For every n in 1.._N_ the tokens are padded with n-1 "<s>" markers and one
    "</s>" marker, POS-tagged via pos_tagging(), and every n-gram of tagged
    tokens is counted under the key "<n>@<tagged tokens joined by spaces>".
    The module-level _WEIGHT_ then selects the feature values:

    - "binary":   1 for every n-gram that occurs
    - "weighted": n-gram count divided by (number of tokens + 1)
    - "counts" (and any other value): the raw n-gram count

    Previously the "counts" branch replaced the n-gram counts with plain token
    counts, so the n-gram/POS features were computed and then discarded.

    ::character_doc:: list of pre-processed tokens for one character
    ::scene_info:: dict of scene features for that character; merged into the
                   result (its entries win on a key clash)
    ::extra_features:: currently unused; kept for interface compatibility

    Returns a dict mapping feature name to value.
    '''
    ngram_counts = Counter()

    # collect the counts for all n in range (1, N)
    for n in range(1, _N_+1):
        padded_tokens = ["<s>"]*(n-1) + character_doc + ["</s>"]

        # perform pos tagging (on the padded sequence, once per n)
        tagged_tokens = pos_tagging(padded_tokens)

        for end in range(n-1, len(tagged_tokens)):
            raw_ngram = " ".join(tagged_tokens[end-(n-1):end+1])
            ngram_counts["{}@{}".format(n, raw_ngram)] += 1

    if _WEIGHT_ == "binary":
        # binary counts (1 if present)
        feature_vector_dict = {ngram: 1 for ngram in ngram_counts}
    elif _WEIGHT_ == "weighted":
        # counts normalised by document length (+1 for the "</s>" marker)
        doc_length = len(character_doc) + 1
        feature_vector_dict = {ngram: count/doc_length for ngram, count in ngram_counts.items()}
    else:
        # "counts": keep the raw n-gram counts collected above
        feature_vector_dict = dict(ngram_counts)

    feature_vector_dict.update(scene_info)
    return feature_vector_dict

In [None]:
corpusVectorizer = DictVectorizer()   # corpusVectorizor which will just produce sparse vectors from feature dicts
# Any matrix transformers (e.g. tf-idf transformers) should be initialized here


def create_document_matrix_from_corpus(corpus, scene_info, fitting=False):
    """Vectorize a corpus into a document-feature matrix.

    Every document is converted to a feature dict with
    to_feature_vector_dictionary() and the dicts are turned into a matrix by
    the module-level corpusVectorizer.

    ::corpus:: a list of (class_label, document) pairs.
    ::scene_info:: a dict mapping each class_label in the corpus to the scene
                   feature dict passed on to to_feature_vector_dictionary
    ::fitting:: a boolean indicating whether to fit/train the vectorizers (should be true on training data)

    Returns the matrix produced by corpusVectorizer.transform, one row per
    corpus entry in corpus order.
    """
    # Build the feature dicts once: this is the expensive step (it runs POS
    # tagging) and the same dicts serve both fitting and transforming.
    feature_dicts = [to_feature_vector_dictionary(doc, scene_info[name], []) for name, doc in corpus]

    # uses the global variable of the corpus Vectorizer to improve things
    if fitting:
        corpusVectorizer.fit(feature_dicts)
    return corpusVectorizer.transform(feature_dicts)

training_feature_matrix = create_document_matrix_from_corpus(training_corpus, train_scene_info, fitting=True)

In [None]:
# Show the repr of the training matrix (shape, dtype, stored elements)
training_feature_matrix

<16x4476 sparse matrix of type '<class 'numpy.float64'>'
	with 13138 stored elements in Compressed Sparse Row format>

In [None]:
# Build the held-out documents (at most 40 lines per character) and report
# the number of whitespace-separated words in each of them.
val_character_docs, val_scene_info = create_character_document_from_dataframe(val_data, max_line_count=40)
print('Num. Characters: ', len(val_character_docs), "\n")
total_words = 0
for name, document in val_character_docs.items():
    word_count = len(document.split())
    print(name, 'Num of Words: ', word_count)
    total_words += word_count
print("total words", total_words)

# Pair every character (in alphabetical order) with their pre-processed
# token list, and keep the character names as the label list.
val_corpus = [
    (character, pre_process(val_character_docs[character]))
    for character in sorted(val_character_docs)
]
val_labels = [label for label, _tokens in val_corpus]

Num. Characters:  16 

TANYA Num of Words:  438
MAX Num of Words:  737
SEAN Num of Words:  366
SHIRLEY Num of Words:  329
OTHER Num of Words:  357
STACEY Num of Words:  412
RONNIE Num of Words:  464
JACK Num of Words:  351
PHIL Num of Words:  475
IAN Num of Words:  508
JANE Num of Words:  458
ROXY Num of Words:  392
HEATHER Num of Words:  411
MINTY Num of Words:  470
CHRISTIAN Num of Words:  489
CLARE Num of Words:  405
total words 7062


In [None]:
# Vectorize the validation corpus with fitting=False, i.e. reuse the feature
# vocabulary the vectorizer learnt from the training corpus instead of refitting.
val_feature_matrix = create_document_matrix_from_corpus(val_corpus, val_scene_info, fitting=False)

In [None]:
# Show the repr of the validation matrix (shape, dtype, stored elements)
val_feature_matrix

<16x4476 sparse matrix of type '<class 'numpy.float64'>'
	with 2359 stored elements in Compressed Sparse Row format>

In [None]:
def compute_cosine_similarity(v1, v2):
    """Takes a pair of vectors v1 and v2 (1-d arrays e.g. [0, 0.5, 0.5])
    returns the cosine similarity between the vectors:
    dot(v1, v2) / (norm(v1) * norm(v2)).

    If either vector has zero norm the cosine is undefined (0/0); 0.0 is
    returned in that case instead of NaN so that callers can still sort and
    average the scores.
    """
    denominator = norm(v1) * norm(v2)
    if denominator == 0:
        return 0.0

    # compute cosine similarity manually
    return np.dot(v1, v2) / denominator

In [None]:
def compute_IR_evaluation_scores(train_feature_matrix, test_feature_matrix, train_labels, test_labels, display=False):
    """
    Computes an information retrieval based on training data feature matrix and test data feature matrix.
    Every test document is compared with every training document by cosine
    similarity and the training documents are ranked by that score.

    returns 4-tuple:
    ::mean_rank:: mean of the ranking of the target document in terms of similarity to the query/test document
    1 is the best possible score.
    ::mean_cosine_similarity:: mean cosine similarity score for the target document vs. the test document of the same class
    ::accuracy:: proportion of test documents correctly classified (target ranked first)
    ::df:: a data frame with columns 'doc1' (test label), 'doc2' (train label) and
    'similarity' holding all the similarity measures of the test documents vs. train documents

    params:
    ::train_feature_matrix:: an N x M matrix exposing .toarray(), N = number of characters M = number of features
    ::test_feature_matrix::  an N x M matrix exposing .toarray(), N = number of characters M = number of features
    ::train_labels:: a list of character names for the training data in order consistent with train_feature_matrix
    ::test_labels:: a list of character names for the test data in order consistent with test_feature_matrix
    ::display:: if True, print mean rank, mean cosine similarity and accuracy
    """
    # Convert to dense arrays once, rather than once per compared pair
    train_vectors = train_feature_matrix.toarray()
    test_vectors = test_feature_matrix.toarray()

    rankings = []
    all_cosine_similarities = []
    pairwise_cosine_similarity = []
    pairs = []
    correct = 0
    for i, target in enumerate(test_labels):
        query_vector = test_vectors[i]
        all_sims = {}
        for j, other in enumerate(train_labels):
            similarity = compute_cosine_similarity(query_vector, train_vectors[j])
            pairs.append((target, other))
            pairwise_cosine_similarity.append(similarity)
            if other == target:
                # similarity to the training document of the same character
                all_cosine_similarities.append(similarity)
            all_sims[other] = similarity

        # rank training documents from most to least similar (rank 1 = best)
        sorted_similarities = sorted(all_sims.items(), key=lambda x: x[1], reverse=True)
        ranking = {key[0]: rank for rank, key in enumerate(sorted_similarities, 1)}
        if ranking[target] == 1:
            correct += 1
        rankings.append(ranking[target])

    mean_rank = np.mean(rankings)
    mean_cosine_similarity = np.mean(all_cosine_similarities)
    accuracy = correct/len(test_labels)

    if display == True:
        print("mean rank", mean_rank)
        print("mean cosine similarity", mean_cosine_similarity)
        print(correct, "correct out of", len(test_labels), "/ accuracy:", accuracy)

    # get a dataframe showing all the similarity scores of training vs test docs
    df = pd.DataFrame({'doc1': [x[0] for x in pairs], 'doc2': [x[1] for x in pairs],
                       'similarity': pairwise_cosine_similarity})

    return (mean_rank, mean_cosine_similarity, accuracy, df)

In [None]:
def plot_heat_map_similarity(df):
    """Takes a dataframe with header 'doc1, doc2, similarity'
    Plots a heatmap based on the similarity scores.

    One row is drawn per distinct 'doc1' value (sorted), and within a row the
    scores are ordered by 'doc2'. The matrix is padded with a border of 1.0
    values and every cell is annotated with its value rounded to 3 decimals.
    The same (doc1-derived) labels are used on both axes.
    """
    # Sort once; the per-row mask below is built from this same frame so it
    # lines up with it directly.
    ordered = df.sort_values(['doc1', 'doc2'])
    test_labels = sorted(set(df['doc1']))

    # add padding 1.0 values to either side
    padded_width = len(test_labels) + 2
    cm = [[1.0] * padded_width]
    for target in test_labels:
        row_scores = ordered.loc[ordered['doc1'] == target, 'similarity']
        cm.append([1.0] + list(row_scores) + [1.0])
    cm.append([1.0] * padded_width)

    labels = [""] + test_labels + [""]
    fig = plt.figure(figsize=(20,20))
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    plt.title('Similarity matrix between documents as vectors')
    fig.colorbar(cax)
    ax.set_xticks(np.arange(len(labels)))
    ax.set_yticks(np.arange(len(labels)))
    ax.set_xticklabels( labels, rotation=45)
    ax.set_yticklabels( labels)

    # write each cell's value on top of it (iterate the actual row width
    # rather than assuming the matrix is square)
    for i, row in enumerate(cm):
        for j, value in enumerate(row):
            ax.text(j, i, round(value, 3),
                    ha="center", va="center", color="w")

    plt.xlabel('Training Vector Doc')
    plt.ylabel('Test Vector Doc')
    plt.show()

### results after adding scene information

In [None]:
# Score the validation documents against the training documents; the returned
# df holds one row per (validation doc, training doc) pair with its similarity.
mean_rank, mean_cosine_simliarity, acc, df = compute_IR_evaluation_scores(training_feature_matrix, 
                                                                          val_feature_matrix, 
                                                                          train_labels, 
                                                                          val_labels,
                                                                          display=True)  # set display = True to print results          

mean rank 1.375
mean cosine similarity 0.8049988059079553
13 correct out of 16 / accuracy: 0.8125


## Trying different matrix transformation techniques

- Trying **TF-IDF** matrix transformation technique

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

corpusVectorizer = DictVectorizer()   # corpusVectorizor which will just produce sparse vectors from feature dicts
# Any matrix transformers (e.g. tf-idf transformers) should be initialized here
tfidf = TfidfTransformer()


def create_document_matrix_from_corpus(corpus, scene_info, fitting=False):
    """Vectorize a corpus into a tf-idf weighted document-feature matrix.

    Every document is converted to a feature dict with
    to_feature_vector_dictionary(), the dicts are turned into a matrix by the
    module-level corpusVectorizer, and the matrix is re-weighted by the
    module-level tfidf transformer.

    Both the vectorizer and the tf-idf transformer are fitted only when
    ``fitting`` is True. With ``fitting=False`` the vocabulary and IDF weights
    learnt from the training data are reused, so validation/test documents are
    mapped into the same weighted space as the training documents and never
    influence the weights themselves.

    ::corpus:: a list of (class_label, document) pairs.
    ::scene_info:: a dict mapping each class_label in the corpus to the scene
                   feature dict passed on to to_feature_vector_dictionary
    ::fitting:: a boolean indicating whether to fit/train the vectorizers (should be true on training data)

    Returns the tf-idf weighted matrix, one row per corpus entry in corpus order.
    """
    # Build the feature dicts once: this is the expensive step (it runs POS
    # tagging) and the same dicts serve both fitting and transforming.
    feature_dicts = [to_feature_vector_dictionary(doc, scene_info[name], []) for name, doc in corpus]

    # uses the global variable of the corpus Vectorizer to improve things
    if fitting:
        corpusVectorizer.fit(feature_dicts)
    doc_feature_matrix = corpusVectorizer.transform(feature_dicts)

    # converts a matrix to tf-idf representation; IDF is learnt on the
    # fitting (training) corpus only and merely applied otherwise
    if fitting:
        return tfidf.fit_transform(doc_feature_matrix)
    return tfidf.transform(doc_feature_matrix)

#### results after improving vectorization method 

In [None]:
# Rebuild the whole pipeline with the create_document_matrix_from_corpus
# defined in the cell above, then evaluate on the held-out split.

# get the training data - only 360 lines used for each character
train_character_docs, train_scene_info = create_character_document_from_dataframe(train_data, max_line_count=360)
# create list of pairs of (character name, pre-processed tokens), sorted by name
training_corpus = [(name, pre_process(doc)) for name, doc in sorted(train_character_docs.items())]
train_labels = [name for name, doc in training_corpus]

# fitting=True: the vectorizers are fitted on the training corpus here
training_feature_matrix = create_document_matrix_from_corpus(training_corpus, train_scene_info, fitting=True)

# get the validation data- only 40 lines used for each character
val_character_docs, val_scene_info = create_character_document_from_dataframe(val_data, max_line_count=40)
# create list of pairs of (character name, pre-processed character) 
val_corpus = [(name, pre_process(doc)) for name, doc in sorted(val_character_docs.items())]
val_labels = [name for name, doc in val_corpus]

# Just transform the val_feature_matrix, don't fit
val_feature_matrix = create_document_matrix_from_corpus(val_corpus, val_scene_info, fitting=False)

print('\nResults on validation set:')
# df holds one row per (validation doc, training doc) pair with its similarity
mean_rank, mean_cosine_simliarity, acc, df = compute_IR_evaluation_scores(training_feature_matrix, 
                                                                          val_feature_matrix, 
                                                                          train_labels, 
                                                                          val_labels, 
                                                                          display=True) # set display = True to print results  


Results on validation set:
mean rank 1.125
mean cosine similarity 0.576010719865794
15 correct out of 16 / accuracy: 0.9375


## Best system: trained on all of the training data (first 400 lines per character), with final testing on the test file (first 40 lines per character)

In [None]:
# best possible combination of preprocessing switches
# (read by pre_process(); same values as the switches set earlier in the notebook)
preprocessing_switches = {
        'convert_numbers'      : False,
        'separate_punctuation' : True, 
        'lowercase'            : False,
        'remove_punctuation'   : True,
        'apply_lemmatization'  : True,
        'remove_stopwords'     : True
    }
# best setting
# _WEIGHT_ picks the weighting branch and _N_ the highest n-gram order used by
# to_feature_vector_dictionary
_WEIGHT_ = "counts"  
_N_ = 2

In [None]:
# redo on all training data with the first 400 character lines used
# (all_train_data contains both the training and the held-out rows)
train_character_docs, train_scene_info = create_character_document_from_dataframe(all_train_data, max_line_count=400)
print('Num. Characters: ',len(train_character_docs.keys()),"\n")
total_words = 0
for name in train_character_docs.keys():
    print(name, 'Number of Words: ',len(train_character_docs[name].split()))
    total_words += len(train_character_docs[name].split())
print("total words", total_words)

# NOTE(review): unlike the earlier cells, the items are not sorted by name
# here, so documents follow the dict's insertion order.
training_corpus = [(name, pre_process(doc)) for name, doc in train_character_docs.items()]
train_labels = [name for name, doc in training_corpus]

# Fresh, unfitted vectorizer and tf-idf transformer for the final model
corpusVectorizer = DictVectorizer()   # initialize a corpusVectorizor which will output sparse vectors from dicts
# Any matrix transformers (e.g. tf-idf transformers) should be initialized here
tfidf = TfidfTransformer()


training_feature_matrix = create_document_matrix_from_corpus(training_corpus, train_scene_info, fitting=True)

# get the test data using 40 lines per character
# NOTE(review): the test scene info is stored in val_scene_info, overwriting
# the validation scene info from the earlier cells - consider a separate name.
test_character_docs, val_scene_info = create_character_document_from_dataframe(test_data, max_line_count=40)
print('Num. Characters: ',len(test_character_docs.keys()),"\n")
total_words = 0
for name in test_character_docs.keys():
    print(name, 'Number of Words: ',len(test_character_docs[name].split()))
    total_words += len(test_character_docs[name].split())
print("total words", total_words)

# create list of pairs of (character name, pre-processed character) 
test_corpus = [(name, pre_process(doc)) for name, doc in test_character_docs.items()]
test_labels = [name for name, doc in test_corpus]


# Just transform the test feature matrix, don't fit
test_feature_matrix = create_document_matrix_from_corpus(test_corpus, val_scene_info, fitting=False)


# Score the test documents against the training documents and print the results
mean_rank, mean_cosine_simliarity, acc, df = compute_IR_evaluation_scores(training_feature_matrix, test_feature_matrix, train_labels, test_labels, display=True)

Num. Characters:  16 

SHIRLEY Number of Words:  4233
OTHER Number of Words:  3606
JACK Number of Words:  4786
RONNIE Number of Words:  3872
TANYA Number of Words:  4214
SEAN Number of Words:  4026
ROXY Number of Words:  4200
HEATHER Number of Words:  4504
MAX Number of Words:  5107
IAN Number of Words:  4863
JANE Number of Words:  4117
STACEY Number of Words:  4325
PHIL Number of Words:  4103
MINTY Number of Words:  4391
CHRISTIAN Number of Words:  4250
CLARE Number of Words:  4844
total words 69441
Num. Characters:  16 

SHIRLEY Number of Words:  373
OTHER Number of Words:  453
HEATHER Number of Words:  451
PHIL Number of Words:  406
SEAN Number of Words:  466
TANYA Number of Words:  465
MAX Number of Words:  494
JACK Number of Words:  412
IAN Number of Words:  509
JANE Number of Words:  414
STACEY Number of Words:  634
ROXY Number of Words:  392
RONNIE Number of Words:  390
CHRISTIAN Number of Words:  629
MINTY Number of Words:  428
CLARE Number of Words:  368
total words 7284
mean 