In [None]:
#you may need this:#
!pip install spacy

In [None]:
#And this#
!python -m spacy download en

In [None]:
#One more#
!pip install nltk

In [None]:
from convokit import Corpus, User, Utterance, Prominence
import json
from collections import Counter

In [None]:
# File names of the per-season JSON transcripts, seasons 01 through 10.
input_files = ['friends_season_{:02d}.json'.format(number)
               for number in range(1, 11)]

In [None]:
#for the purposes of the notebook, here's a means of downloading the corpus virtually.#
import requests
import json

# Read every season's JSON file from the web.
# Each response is parsed inside the loop: parsing after the loop only ever
# saw the final response, so the other downloads were thrown away.
seasons = []
for file in input_files:
    link = 'https://raw.githubusercontent.com/emorynlp/character-mining/master/json/' + file
    r = requests.get(link)
    # Stop on an HTTP error instead of trying to parse an error page as JSON.
    r.raise_for_status()
    seasons.append(json.loads(r.text))

#loading as seasons#
# `season` is still bound to the last season downloaded, as before;
# `seasons` holds all of them in input_files order.
season = seasons[-1]

**USER CORPUS**
Developing a robust user corpus is a priority of my project given its focus on characterization. Knowing this, I spent the bulk of my work generating a few additional types of metadata to include with each user. Another priority of the code is to retain season-level information within the grander context of the series as a whole.

In future versions of this dataset, I would be interested in including episode-level and scene-level information about users. In terms of sourcing outside metadata, I think that Gender metadata could be sourced from crosslisting character names with IMDB.

Below, I offer an example of the current pipeline to give an overview of the conversion process. Since the two functions it calls have not been defined yet, it is not functional as is.

In [None]:
# Overview of the pipeline using local copies of the season files:
# one [season_id, season-level counts] entry per season, then the
# series-level matrix built from all of them.
character_distribution = []
for name in input_files:
    with open(name) as data:
        season = json.load(data)
    season_id, episodes = season['season_id'], season['episodes']
    season_counts = season_speaking_users(episodes)
    character_distribution.append([season_id, season_counts])
character_matrix = series_speaking_users(character_distribution)

The pipeline above stresses the production of a season-level _character distribution_ and a series-level _character matrix_. I divide the dataset as such in order to better account for the distribution of user dialogue and reference across different scales of narrative. The function **season_speaking_users** works to divide and count users into two main roles, speakers and figures of reference, and assign the quality of being a nonspeaker, a nonspeaking user who is referenced. Nonspeaking users fascinatingly are typically either famous guest stars (like Ed Begley Jr.) or entirely generic onscreen figures, like a silent airplane steward.

While seasons one through four feature an additional category called "character_entities" that refers to the characters mentioned in or around the conversation, this convention is dropped from season five on. That is, it's not possible using the current dataset to track nonspeaking users throughout the duration of the season. I do believe, however, it would be worthwhile to see: 1.) if nonspeaking characters recur or become a type of trope (Are there silent characters who show up on screen and shrug for a laugh? Is it usually poorly-disguised celebrities who fill these roles to produce a sight gag?) 2.) if the characters who are referenced by main characters in earlier seasons end up getting more speaking roles as the series progresses.

In [None]:
def season_speaking_users(episodes):
    """Count speakers and referenced characters across one season.

    Args:
        episodes: Iterable of episode dicts. Each episode has a 'scenes'
            list, each scene has an 'utterances' list, and each utterance
            has a 'speakers' list and, optionally, 'character_entities'
            (a list of groups, each group a list of entries whose item at
            index 2 is counted as the referenced name).

    Returns:
        A three-item list ``[speaking, referenced, non_speaking]``.
        ``speaking`` is a Counter of how often each name appears in a
        'speakers' list, ``referenced`` is a Counter of the names taken
        from 'character_entities', and ``non_speaking`` is a list of the
        referenced names that never appear as a speaker, in the order
        they were first referenced.
    """
    speaking = Counter()
    referenced = Counter()
    for episode in episodes:
        for scene in episode['scenes']:
            for utterance in scene['utterances']:
                ##1.) See markup below##
                # Every listed speaker is counted, including characters
                # who speak in unison.
                speaking.update(utterance['speakers'])
                for mentions in utterance.get('character_entities', ()):
                    # NOTE(review): groups holding exactly one entry are
                    # skipped, as in the original code. Confirm whether
                    # ">= 1" was intended.
                    if len(mentions) > 1:
                        referenced.update(mention[2] for mention in mentions)

    ##finding non-speaking users##
    # Counter membership is a hash lookup, so this stays linear in the
    # number of referenced names.
    non_speaking = [name for name in referenced if name not in speaking]
    return [speaking, referenced, non_speaking]


The code above uses counters to measure the number of utterances and references each character makes. It also features a small amount of code to discover nonspeaking users. Ultimately, the code produces two dictionaries that take the names of characters for keys and return the number of utterances and references that respectively occur within the season. It also returns a list of nonspeaking users.

1.) One of the larger peculiarities of the dataset was instances where characters would talk in unison. I'm not sure if there's a good way to parse this - should the collective be treated as a single user? - though it would be interesting to see if there are certain combinations that happen frequently throughout the seasons.

In [None]:
def series_speaking_users(character_distribution):
    """Combine season-level counts into per-character series statistics.

    Args:
        character_distribution: Sequence of ``[season_id, stats]`` pairs,
            where ``stats[0]`` maps names to spoken counts, ``stats[1]``
            maps names to reference counts, and ``stats[2]`` is an
            iterable of non-speaking names. The two mappings may be
            Counters or plain dicts; a missing name counts as 0.

    Returns:
        A dict keyed by character name. Each value is
        ``[total_spoken, season_spoken, total_referenced,
        season_referenced]``, where the two ``season_*`` items are lists
        of ``[season_id, count]`` pairs in the order the seasons were
        given.
    """
    ## 1.##
    # A dict serves as an insertion-ordered set, so each name is recorded
    # once, in first-seen order, without rescanning a list every time.
    all_characters = {}
    for season in character_distribution:
        stats = season[1]
        for group in (stats[0], stats[1], stats[2]):
            for name in group:
                all_characters.setdefault(name, None)

    ## 2. ##
    characters_tagged = {}
    for character in all_characters:
        season_spoken = []
        season_referenced = []
        for season in character_distribution:
            season_id = season[0]
            speakers, referees = season[1][0], season[1][1]
            # .get(..., 0) gives the same answer as Counter indexing and
            # also works when a season's counts are a plain dict.
            season_spoken.append([season_id, speakers.get(character, 0)])
            season_referenced.append([season_id, referees.get(character, 0)])
        total_spoken = sum(count for _, count in season_spoken)
        total_referenced = sum(count for _, count in season_referenced)
        characters_tagged[character] = [
            total_spoken,
            season_spoken,
            total_referenced,
            season_referenced,
        ]
    return characters_tagged

1.) In order to collect series-level statistics from the season-level this function's first half collects the names of all users across the two qualities and nonspeaking condition.

2.) That list is then fed into and combined with the season-level dictionaries created by the previous function. Two **total** qualities track all of the utterances and references that occur for a user within the series, while season_spoken and season_reference return each season and its counts. Part of the intuition for this approach is to see how much a character's share and quantity of dialogue changes through the course of several seasons.

In [None]:
# Build the season-level distributions from the hosted JSON files, then
# fold them into the series-level character matrix.
character_distribution = []
for file in input_files:
    link = 'https://raw.githubusercontent.com/emorynlp/character-mining/master/json/' + file
    r = requests.get(link)
    season = json.loads(r.text)
    season_id, episodes = season['season_id'], season['episodes']
    season_counts = season_speaking_users(episodes)
    character_distribution.append([season_id, season_counts])
character_matrix = series_speaking_users(character_distribution)

Let's see an example of how the character_matrix works!

In [None]:
# Show one character's entry: [total spoken, per-season spoken,
# total referenced, per-season referenced].
character_matrix['Rachel Green']

In [None]:
# Per-character metadata taken from the series-level character matrix.
user_meta = {}
for user, stats in character_matrix.items():
    total_spoken, spoken_by_season, total_referenced, referenced_by_season = stats
    user_meta[user] = {
        # The name itself; this field used to hold the whole stats list.
        "character_name": user,
        "total_utterances": total_spoken,
        "utterances_per_season": spoken_by_season,
        "total_references": total_referenced,
        "references_per_season": referenced_by_season,
    }

##making Corpus##
corpus_users = {k: User(name=k, meta=v) for k, v in user_meta.items()}

In this version of the User Corpus, each user ends up with five characteristics. I intend to use the proportions of these characteristics to determine what role utterance and reference quantity play in separating major and minor characters across seasons.

In terms of adding future metadata, I would be interested in seeing the average size of the conversations each character participates in per season and the average size of the groups those conversations take place in. When thinking about major and minor characters, it would be curious to see if speaking-but-not-major characters tend to be relegated to certain group sizes or shorter conversational instances.

What I'd love most in terms of metadata, however, would be a way to parse the "transcript with note" subcategory in each utterance for character information/set directions. It would be fascinating to see which physical behaviors and details get assigned to which character.

**UTTERANCE CORPUS**
Given my project's interest in users, I've made significantly fewer modifications in generating metadata for the utterance corpus. Nevertheless, I believe that in future iterations of this code, the utterance corpus will make some of the user-processing accommodations I made above redundant. So it goes with version 1!



In [None]:
# Build one ConvoKit Utterance per spoken line, keyed by utterance id.
# Each scene is treated as a single conversation.
utterance_corpus = {}
for file in input_files:
    link = 'https://raw.githubusercontent.com/emorynlp/character-mining/master/json/' + file
    r = requests.get(link)
    season = json.loads(r.text)
    season_id = season['season_id']
    episodes = season['episodes']
    #############################
    for episode in episodes:
        scenes = episode['scenes']
        for scene in scenes:
            # Both are reset per scene so that replies and roots never
            # cross a scene boundary.
            root = None
            previous_id = None
            for utterance in scene['utterances']:
                # Named utterance_id rather than `id`, so the builtin is not
                # shadowed and the episode loop variable is not reused.
                utterance_id = utterance['utterance_id']

                ##1.)Some utterances involve multiple speakers stored as a list. Only the last listed speaker is kept as the##
                ##utterance's user, though it is worth contesting whether two characters speaking in unison is a singular utterance##
                speakers = utterance["speakers"]
                if not speakers:
                    # No speaker to attribute this entry to. It used to
                    # inherit the user left over from the previous
                    # utterance, so it is skipped instead.
                    continue
                user = User(speakers[-1])

                ##2.)The root is the first spoken utterance of the scene, so it always refers to an utterance that is##
                ##actually in the corpus##
                if root is None:
                    root = utterance_id

                ##3.)There is a 'character_entities' subsection of each utterance that features all characters involved and or referenced.##
                ##Designating a reply from this data is theoretically more accurate than going to the previous utterance, but not all seasons##
                ##contain this metadata##
                # reply_to must be an utterance id (not a speaker name);
                # it is None for the first utterance of the scene.
                reply_to = previous_id
                timestamp = None

                ##there's a tokenized version available in the data set too##
                text = utterance["transcript"]
                utterance_corpus[utterance_id] = Utterance(utterance_id, user, root, reply_to, timestamp, text)
                previous_id = utterance_id

To expand on the comments above:

2.) In thinking more about the qualities of syndicated television, many scenes and conversations begin in medias res or correspond to a cliffhanger from before the commercial break. I can't think of a better way to determine where a conversation starts, but I do think it's important to measure the composition and similarity of conversations across scenes.

3.) In measuring replies, the code-as-is assumes that the current utterance is responding to the one immediately before it. Honestly, I don't feel super comfortable with this assumption. For one, _Friends_ is known for having at least one or two characters who introduce non sequiturs into conversation. **A distinction should be made between being a part of a scene and being a part of a conversation, especially in a comedy**. I would be interested to see how well the character_entities data corresponds to assuming conversationality in a linear representation of a scene.

In [None]:
# Flatten the id -> Utterance mapping into a plain list of utterances.
utterance_list = list(utterance_corpus.values())

In [None]:
# Build the ConvoKit Corpus from the collected utterances.
series_corpus = Corpus(utterances=utterance_list, version=1)

In [None]:
# Collect the conversation ids reported by the corpus.
convo_ids = series_corpus.get_conversation_ids()

Above are the three bits of code from the tutorial. Since not much worthwhile additional metadata exists to add to this corpus, I have forgone it. If I were able to link IMDB to characters, however, I would also be able to get average ratings for each episode. This could be interesting in seeing if there's a generally positive response to certain cohorts of users.

In [None]:
# Report how many conversation ids the corpus contains.
print("number of conversations in the dataset = {}".format(len(series_corpus.get_conversation_ids())))

In [None]:
# Print the corpus's own summary statistics.
series_corpus.print_summary_stats()

**Transformations and Parsing**

This section involves parsing conversations in the series corpus. It takes quite a long time.

In [None]:
from convokit import Parser

In [None]:
# Create the ConvoKit Parser with its default settings.
annotator = Parser()

In [None]:
# Run the parser over the corpus; series_corpus is rebound to the result.
# This is the slow step mentioned above.
series_corpus = annotator.fit_transform(series_corpus)

**Custom Transformer: User Prominence**

This transformer separates and tags users into a series of tiers: Major, Major-Aspirational, and Minor. These classifications rely on five metrics:

1.) Politeness Complexity: In a random sampling of conversations, how varied is the user's politeness.
2.) Utterances-per-conversation: In a random sampling of conversations, does the user's share of remarks account for more than the number of users/number of utterances per scene.
3.) First/Last Word: In a random sampling of conversations, how often did the user start or finish a conversation
4.) Spoken-of: Count of how many users use the user's name in speech
5.) Raw-count: User's number of utterances/ all utterances.


**Dalliances with Politeness**

In this section, I explore a few ways that politeness could be correlated with a character's longevity in the show.  Another way of framing this is do characters who have similar compositions of impolite|polite conversation to major characters have a higher likelihood of remaining on the show?

In [None]:
# This might be necessary on a fresh environment: fetch NLTK's 'punkt'
# resource before running the politeness cells below.
import nltk
nltk.download('punkt')

In [None]:
#this certainly is necessary#
from convokit import PolitenessStrategies
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

In [None]:
# Create the PolitenessStrategies transformer, passing verbose=100.
ps = PolitenessStrategies(verbose=100)

In [None]:
# Apply the politeness transformer to the parsed series corpus.
politeness_corpus = ps.transform(series_corpus)

**Prominence Transformer**


In [None]:
from convokit import Prominence

In [None]:
# Create the Prominence transformer with its default settings.
pro = Prominence()

In [None]:
# Apply the Prominence transformer to the politeness-annotated corpus.
pro_corpus = pro.transform(politeness_corpus)

**k_means Clustering**

In [None]:
import pandas as pd
import numpy as np

In [None]:
# One row per character, holding that user's metadata values in the
# order the meta mapping yields them; columns are positional (0, 1, ...).
characters = pro_corpus.get_usernames()
rows = [list(pro_corpus.get_user(character).meta.values())
        for character in characters]
character_prom = pd.DataFrame(rows, index=characters)
    

In [None]:
# Display the per-character metadata table.
character_prom

In [None]:
# Scale each of the first four columns by its own maximum.
for column in range(4):
    character_prom[column] = character_prom[column] / character_prom[column].max()

In [None]:
import sklearn

In [None]:
# `import sklearn` on its own is not guaranteed to load the `cluster`
# subpackage, so import it explicitly before using sklearn.cluster.KMeans.
import sklearn.cluster

# Cluster the characters into three groups on the scaled metadata values.
mat = character_prom.values
km = sklearn.cluster.KMeans(n_clusters=3)
km.fit(mat)
labels = km.labels_
# Two-column table pairing each character name with its cluster label.
results = pd.DataFrame([character_prom.index, labels]).T

**Metrics**

In [None]:
# (name, politeness complexity, utterance count) for every character.
characters = pro_corpus.get_usernames()
to_graph = []
for character in characters:
    utterances = len(series_corpus.get_user(character).get_utterance_ids())
    complexity = pro_corpus.get_user(character).meta["politeness_complexity"]
    to_graph.append((character, complexity, utterances))

In [None]:
# Scatter politeness complexity (x) against utterance count (y), save it
# as fig1.png, then display it.
x = [complexity for _, complexity, _ in to_graph]
y = [count for _, _, count in to_graph]
plt.scatter(x, y)
plt.title('Utterances to Politeness Complexity (All Characters)')
plt.xlabel('Complexity')
plt.ylabel('Utterances')
plt.savefig('fig1.png', dpi=200)
plt.show()

In [None]:
# (name, politeness complexity, utterance count), keeping only characters
# with fewer than 500 utterances.
characters = pro_corpus.get_usernames()
to_graph = []
for character in characters:
    utterances = len(series_corpus.get_user(character).get_utterance_ids())
    if utterances >= 500:
        continue
    complexity = pro_corpus.get_user(character).meta["politeness_complexity"]
    to_graph.append((character, complexity, utterances))

In [None]:
# Same scatter as fig1, drawn from the filtered list; saved as fig2.png.
x = [complexity for _, complexity, _ in to_graph]
y = [count for _, _, count in to_graph]
plt.scatter(x, y)
plt.title('Utterances to Politeness Complexity(Major Removed)')
plt.xlabel('Complexity')
plt.ylabel('Utterances')
plt.savefig('fig2.png', dpi=200)
plt.show()

In [None]:
# (name, utterances per conversation, utterance count) for every character.
characters = pro_corpus.get_usernames()
to_graph2 = []
for character in characters:
    utterances = len(series_corpus.get_user(character).get_utterance_ids())
    share = pro_corpus.get_user(character).meta["utterance_per_conversation"]
    to_graph2.append((character, share, utterances))

In [None]:
# Scatter utterances-per-conversation (x) against utterance count (y).
x = [i[1] for i in to_graph2]
y = [i[2] for i in to_graph2]
plt.xlabel('Utterances Per Conversation')
plt.ylabel('Utterances')
plt.title('Utterances to Share of Conversation(All Characters)')
plt.scatter(x, y)
# Save only after the points are drawn; saving first wrote fig3.png
# without any data in it.
plt.savefig('fig3.png', dpi=200)
plt.show()

In [None]:
# (name, utterances per conversation, utterance count), keeping only
# characters with fewer than 500 utterances.
characters = pro_corpus.get_usernames()
to_graph2 = []
for character in characters:
    utterances = len(series_corpus.get_user(character).get_utterance_ids())
    if utterances >= 500:
        continue
    share = pro_corpus.get_user(character).meta["utterance_per_conversation"]
    to_graph2.append((character, share, utterances))

In [None]:
# Scatter utterances-per-conversation (x) against utterance count (y)
# for the filtered list.
x = [i[1] for i in to_graph2]
y = [i[2] for i in to_graph2]
plt.xlabel('Utterances Per Conversation')
plt.ylabel('Utterances')
plt.title('Utterances to Share of Conversation(Major Removed)')
plt.scatter(x, y)
# Save only after the points are drawn; saving first wrote fig4.png
# without any data in it.
plt.savefig('fig4.png', dpi=200)
plt.show()

In [None]:
# (name, first/last-word metric, utterance count) for every character.
characters = pro_corpus.get_usernames()
to_graph3 = []
for character in characters:
    utterances = len(series_corpus.get_user(character).get_utterance_ids())
    first_last = pro_corpus.get_user(character).meta["first_last_word"]
    to_graph3.append((character, first_last, utterances))

In [None]:
# Scatter the first/last-word metric (x) against utterance count (y).
x = [i[1] for i in to_graph3]
y = [i[2] for i in to_graph3]
# Labels added so this figure matches the labelled figures 1-4.
plt.xlabel('First/Last Word')
plt.ylabel('Utterances')
plt.title('Utterances to First/Last Word (All Characters)')
plt.scatter(x, y)
# Save only after the points are drawn; saving first wrote fig5.png
# without any data in it.
plt.savefig('fig5.png', dpi=200)
plt.show()

In [None]:
# (name, first/last-word metric, utterance count), keeping only
# characters with fewer than 500 utterances.
characters = pro_corpus.get_usernames()
to_graph3 = []
for character in characters:
    utterances = len(series_corpus.get_user(character).get_utterance_ids())
    if utterances >= 500:
        continue
    first_last = pro_corpus.get_user(character).meta["first_last_word"]
    to_graph3.append((character, first_last, utterances))

In [None]:
# Scatter the first/last-word metric (x) against utterance count (y)
# for the filtered list.
x = [i[1] for i in to_graph3]
y = [i[2] for i in to_graph3]
# Labels added so this figure matches the labelled figures 1-4.
plt.xlabel('First/Last Word')
plt.ylabel('Utterances')
plt.title('Utterances to First/Last Word (Major Removed)')
plt.scatter(x, y)
# Save only after the points are drawn; saving first wrote fig6.png
# without any data in it.
plt.savefig('fig6.png', dpi=200)
plt.show()