In [1]:
import json
import random

# Number of single-speaker utterances spoken by each character, over all ten seasons.
characters = {}

for season_number in range(1, 11):
    # Season files are named friends_season_01.json ... friends_season_10.json
    season_path = 'json_data/friends_season_' + str(season_number).zfill(2) + '.json'
    # The context manager closes the handle as soon as the JSON has been parsed,
    # instead of leaving ten open files behind for the garbage collector.
    with open(season_path, 'r') as season_file:
        season = json.load(season_file)

    # Walk season -> episodes -> scenes -> utterances
    for episode in season['episodes']:
        for scene in episode['scenes']:
            for utterance in scene['utterances']:
                speakers = utterance['speakers']
                # Only utterances attributed to exactly one speaker are counted
                if len(speakers) == 1:
                    name = speakers[0]
                    characters[name] = characters.get(name, 0) + 1

# The six characters with the most utterances each get their own class;
# every other character is folded into the 'Others' class.
ranked = sorted(characters.items(), key=lambda kv: kv[1], reverse=True)
entities = [name for name, _count in ranked[:6]]
# Two extra classes: 'Others' (any remaining character) and 'None' (no character referenced)
entities += ['Others', 'None']
entities

['Rachel Green',
 'Ross Geller',
 'Chandler Bing',
 'Monica Geller',
 'Joey Tribbiani',
 'Phoebe Buffay',
 'Others',
 'None']

In [2]:
# Fixed seed so that the shuffle, and therefore the train/test split, is the
# same after every Restart Kernel -> Run All.
SPLIT_SEED = 42

data = []
labels = []

# Separate out data and labels: one example per annotated sentence.
for season_number in range(1, 11):
    # Season files are named friends_season_01.json ... friends_season_10.json
    season_path = 'json_data/friends_season_' + str(season_number).zfill(2) + '.json'
    # Close the file as soon as it has been parsed
    with open(season_path, 'r') as season_file:
        season = json.load(season_file)

    for episode in season['episodes']:
        for scene in episode['scenes']:
            for utterance in scene['utterances']:
                speaker = utterance['speakers']
                # Only utterances with exactly one speaker are used
                if len(speaker) != 1:
                    continue

                try:
                    tokens = utterance['tokens']
                    character_entities = utterance['character_entities']
                    # Speakers outside the main classes are mapped to 'Others'
                    speaker_label = speaker[0] if speaker[0] in entities else 'Others'

                    # tokens[k] is the token list of sentence k and
                    # character_entities[k] the mention spans annotated on it.
                    for sentence_index in range(len(tokens)):
                        mentions = character_entities[sentence_index]
                        # Sentences without any annotated mention are left out
                        if not mentions:
                            continue

                        sentence_tokens = tokens[sentence_index]
                        target = ['None'] * len(sentence_tokens)
                        for mention in mentions:
                            # A mention is read as [start, end, name, ...]; the
                            # half-open range start..end-1 is labelled with the
                            # first listed name, or 'Others' if not a main class.
                            for token_index in range(mention[0], mention[1]):
                                if mention[2] in entities:
                                    target[token_index] = mention[2]
                                else:
                                    target[token_index] = 'Others'

                        # Insert data
                        data.append({'speaker': speaker_label, 'tokens': sentence_tokens})
                        labels.append(target)
                except (KeyError, IndexError, TypeError):
                    # Best-effort extraction: an utterance with a missing field or an
                    # annotation that does not line up with its tokens is abandoned
                    # (sentences already appended are kept). Unlike a bare `except`,
                    # this no longer swallows KeyboardInterrupt or unrelated bugs.
                    continue

# Shuffling the data (examples and labels together so they stay aligned)
paired = list(zip(data, labels))
random.seed(SPLIT_SEED)
random.shuffle(paired)
data, labels = zip(*paired)

# 80% train / 20% test
train_thresh = int(len(data)*0.8)

train_data = data[:train_thresh]
train_labels = labels[:train_thresh]

test_data = data[train_thresh:]
test_labels = labels[train_thresh:]

print(len(train_data))
print(len(train_labels))
print(len(test_data))
print(len(test_labels))

21121
21121
5281
5281


In [3]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors

import re
import string
import numpy as np
import os.path

# Setup:
#   1. Create a directory 'pretrained_embeds/' next to this notebook.
#   2. Download the twitter embeddings from http://nlp.stanford.edu/data/glove.twitter.27B.zip
#   3. Unzip the archive and put 'glove.twitter.27B.25d.txt' into 'pretrained_embeds/'.
#
# The 25-dimensional vectors are used here; the higher-dimensional files from
# the same archive can be tried as well.

glove_txt_path = "pretrained_embeds/glove.twitter.27B.25d.txt"
word2vec_txt_path = "pretrained_embeds/gensim_glove_vectors.txt"

# Convert the GloVe file to word2vec text format once; later runs reuse the converted file
if not os.path.isfile(word2vec_txt_path):
    glove2word2vec(glove_input_file=glove_txt_path, word2vec_output_file=word2vec_txt_path)

glove_model = KeyedVectors.load_word2vec_format(word2vec_txt_path, binary=False)

def get_embed(word):
    """Return the embedding of ``word`` from the module-level ``glove_model``.

    The lookup is case-insensitive: ``word`` is lower-cased first. A word that
    is not in the vocabulary falls back to the vector stored for ``'unk'``.

    Only ``KeyError`` (the out-of-vocabulary signal of ``get_vector``) triggers
    the fallback; any other failure propagates instead of being silently
    reported as an unknown word.
    """
    # Case folding
    word = word.lower()
    try:
        return glove_model.get_vector(word)
    except KeyError:
        return glove_model.get_vector('unk')


In [7]:
def _embed_example(example):
    """Turn one ``{'speaker': ..., 'tokens': [...]}`` example into a 2-D array.

    Row 0 is the speaker embedding: the mean of the embeddings of the
    whitespace-separated parts of the speaker name (e.g. first and last name).
    The remaining rows are the token embeddings, in order.
    """
    token_vectors = np.asarray([get_embed(token) for token in example['tokens']])
    name_vectors = np.asarray([get_embed(part) for part in example['speaker'].split()])
    speaker_vector = np.mean(name_vectors, axis=0).reshape(1, -1)
    # Speaker row first, then one row per token
    return np.vstack((speaker_vector, token_vectors))


def _encode_labels(token_labels):
    """Map per-token label names to their positions in ``entities``.

    A leading 'None' label is added so the result lines up with the extra
    speaker row that ``_embed_example`` puts in front of the token rows.
    """
    padded = ['None'] + list(token_labels)
    return np.asarray([entities.index(label) for label in padded])


# Embeddings for the train and test examples
train_input = [_embed_example(example) for example in train_data]
test_input = [_embed_example(example) for example in test_data]

# Label indices for the train and test examples
train_label_index = [_encode_labels(token_labels) for token_labels in train_labels]
test_label_index = [_encode_labels(token_labels) for token_labels in test_labels]

# Every input row (speaker + tokens) must have exactly one label
assert all(len(x) == len(y) for x, y in zip(train_input, train_label_index))
assert all(len(x) == len(y) for x, y in zip(test_input, test_label_index))

print(len(train_input))
print(len(train_label_index))
print(len(test_input))
print(len(test_label_index))

21121
21121
5281
5281


In [10]:
def _as_object_array(arrays):
    """Pack a sequence of differently-sized arrays into a 1-D object array.

    The examples have different lengths, so the lists are ragged. Recent NumPy
    versions refuse to convert a ragged list implicitly, so the container is
    built explicitly, one element at a time (which also avoids accidental
    broadcasting when several elements happen to share a shape).
    """
    packed = np.empty(len(arrays), dtype=object)
    for position, array in enumerate(arrays):
        packed[position] = array
    return packed


data_path = 'data/'
# np.save does not create missing directories.
# `os` is bound by the notebook's earlier `import os.path`.
os.makedirs(data_path, exist_ok=True)

# Label map to get one hot vector from labels
np.save(data_path + 'entities.npy', entities)
# Object arrays are pickled by np.save; read them back with
# np.load(path, allow_pickle=True).
# Train input embeddings
np.save(data_path + 'train_input.npy', _as_object_array(train_input))
# Train labels as indexes into the entity map
np.save(data_path + 'train_label_index.npy', _as_object_array(train_label_index))
# Test input embeddings
np.save(data_path + 'test_input.npy', _as_object_array(test_input))
# Test labels as indexes into the entity map
np.save(data_path + 'test_label_index.npy', _as_object_array(test_label_index))