Shuyang Li, “INTERVIEW: NPR Media Dialog Transcripts.” Kaggle, doi: 10.34740/KAGGLE/DS/590180.

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import string
from altair import Chart, X, Y, Color, Scale
import altair as alt
from vega_datasets import data
import matplotlib
import matplotlib.pyplot as plt
import psycopg2
import wordcloud
import textatistic
import seaborn as sbn
import requests
import json
from bs4 import BeautifulSoup
from collections import Counter
import re
import nltk
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')
nltk.download('stopwords') 
nltk.download('punkt')

matplotlib.style.use('ggplot')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/nanamathis/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nanamathis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nanamathis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Reading in data
#### IGNORE THIS RN - If you want to skip reading, cleaning, and merging, go to the header labeled: [Download Ready To Go Data](#Download-Ready-To-Go-Data)

### First reading the .csv files:
- episodes: information on each episode by id, includes the program it's under, title, and date which is parsed as a date
- utter2: record of episode content when there are only 2 people in speaking in the episode
- utter: record of episode content when there are any number of people in speaking in the episode

In [2]:
episodes = pd.read_csv('interview-npr-media-dialog-transcripts/episodes.csv',parse_dates=['episode_date'])
# utter2 = pd.read_csv('interview-npr-media-dialog-transcripts/utterances-2sp.csv')
utter = pd.read_csv('interview-npr-media-dialog-transcripts/utterances.csv')

### Now reading the .json files:
- host_ids: host names with their id number
- host_map: by host id, includes name, list of episodes id's, and list of programs 
- test_train_valid: the data split into train, test, validate for any modeling

In [3]:
host_ids = pd.read_json('interview-npr-media-dialog-transcripts/host_id.json',orient='index')
host_map = pd.read_json('interview-npr-media-dialog-transcripts/host-map.json',orient='index')
test_train_valid = pd.read_json('interview-npr-media-dialog-transcripts/splits-ns2.json',orient='index')

# Cleaning the data
Right now we don't need the train_test_valid dataset so we will leave it alone. We will be cleaning and organizing the rest of the data.

In the episodes dataset we will rename 'id' to be 'episode_id' to be clear since the hosts have ids as well. 

In [4]:
episodes.rename(columns={'id':'episode_id'},inplace=True)
# there's only one episodes in 1999 and we will use the id to remove it from the other data
nineteen99 = episodes[episodes['episode_date'].dt.year == 1999]
# removing episodes during 1999 since there's only one and the years jump from 
# 1999 to 2004 and then it's consecutive by year from 2004 until 2019
episodes = episodes[~(episodes['episode_date'].dt.year == 1999)]
episodes.head()

Unnamed: 0,episode_id,program,title,episode_date
0,98814,Morning Edition,Senate Ushers In New Year With 'Fiscal Cliff' ...,2013-01-01
1,98824,Morning Edition,Cheap Bubbly Or Expensive Sparkling Wine? Look...,2012-12-31
2,98821,Morning Edition,U.S. Gas Prices Reach Record Level In 2012,2013-01-01
3,98806,Morning Edition,House Approves 'Fiscal Cliff' Measure,2013-01-02
4,98823,Morning Edition,The Fiscal Cliff: A Love Story,2012-12-31


I don't know if I'll use the utter2 dataset since the utter dataset covers the episodes in it as well but we will change the name of the column 'id' to 'episode_id' to match the other dataset variables. We will also clean the text of what they said so it's all uniform.

In [5]:
utter.rename(columns={'episode':'episode_id'},inplace=True)
# utter2.rename(columns={'episode':'episode_id'},inplace=True)
# remove all data from the one episode that took place in 1999 with the episode_id of 141179
utter = utter[~(utter['episode_id'] == nineteen99.iloc[0]['episode_id'])]
# utter2 = utter2[~(utter2['episode_id'] == nineteen99.iloc[0]['episode_id'])]

In [6]:
# convert all text to lower case
utter['utterance'] = utter.utterance.str.lower()
# utter2['utterance'] = utter2.utterance.str.lower()

In [7]:
# remove strange character
utter['utterance'] = utter.utterance.str.replace('\ufeff','') 
# utter2['utterance'] = utter2.utterance.str.replace('\ufeff','')

In [8]:
# eliminate whitespace from beginning and end
utter['utterance'] = utter.utterance.str.strip() 
# utter2['utterance'] = utter2.utterance.str.strip()

In [9]:
# clean up speaker column to make all host names be lower case as well as
# remove the text ", host" so it will match with the hosts datasets
utter['speaker'] = utter.speaker.str.lower()
utter['speaker'] = utter.speaker.str.replace(', host','',regex=False)

### Time to clean the text even more! It might have been more efficient to do this before I split and grouped the data in the previous two cells but I'm not sure due to the format it was in as a list of sentences rather than a string. I might try it later but for now we will do it this way.

In [18]:
def replacePunct(S):
    """
    Replaces punctuation with whitespace, eliminating 
    punctuation in a string. Returns the updated string.
    input:
    S (string) - a string that has punctuation characters
    output:
    (string) - new string with punctuation replaced with a space
    """ 
    return str(S).translate(str.maketrans(
        string.punctuation, ' '*len(string.punctuation)))

sw = set(stopwords.words('english'))

# def preprocess(S):
#     """
#     """
#     return [word for word in nltk.word_tokenize(text) 
#             if word not in sw and not word.isdigit()]

In [11]:
# replace punctuation with spaces
utter['utterance'] = utter.utterance.apply(replacePunct)

In [12]:
# splits the text into a list of words/strings
# add a new column, called utter_list, to our dataframe,
# by applying the function nltk.word_tokenize to the text column.
utter['utter_list'] = utter.utterance.apply(nltk.word_tokenize)

In [13]:
# get stopwords, i.e. is, am, etc.

# this keeps only regular English words and removes common words such as is, am, etc.
utter['utter_list'] = utter.utter_list.apply(
    lambda x: [y for y in x if y not in sw])

Cleaning up host_map so it makes more sense and organized. Also rename the name column to be speaker so that we can merge it with the episodes dataset. We don't really need this since we create basically the same thing later on but in synch with the rest of the data lists...

In [14]:
# the index actually corresponds to the host_id so we are
# sorting by index and then reseting the index to be a column
host_map.sort_index(axis=0,inplace=True)
host_map.reset_index(inplace=True)
# renaming index column to be host_id and reordering the columns
host_map.rename(columns={'index':'host_id','name':'speaker'},inplace=True)
host_map = host_map[['speaker','host_id','episodes','programs']]

In [15]:
# TODO: clean up names for hosts, it's an issue in both host_ids and host_map
# there are double counts like "text melissa block" and "melissa block" and
# "mr. neal conan" vs. "neal conan"
# some aren't even included in the host data, i.e. mara liasson, byline is listed as guest
# x = host_map.name.str.extractall('(global)(\s\w+)',flags=re.IGNORECASE)

Cleaning up host_ids to be sorted by their id, we also renamed the name column to be speaker so that we can merge it with the episodes dataset.

In [16]:
host_ids.sort_values(by=0,ascending=True,inplace=True)
host_ids.reset_index(inplace=True)
host_ids.rename(columns={'index':'speaker',0:'host_id'},inplace=True)

# Merging datasets

epScript is a dataframe merging episodes and utter so that we have all the information of an episode and the script. We will also add the host_id to this frame and fill guest speakers' id or any unfilled values as -1. Then we will sort the dataframe by episode_id following episode order. Then we sort the episodes by date and if there are multiple episodes on a day, by their episode_id and also the order of the script, episode_order. We will also keep track of the number of words in a given utterance with 'word_count.'

In [19]:
epScript = pd.merge(utter,episodes,on='episode_id',how='left')
epScript = pd.merge(epScript,host_ids,on='speaker',how="left")
epScript.fillna(-1,inplace=True)
epScript.sort_values(by=['episode_date','episode_id','episode_order'],inplace=True,ascending=True)
epScript.reset_index(drop=True,inplace=True)
epScript['word_count'] = epScript.utter_list.str.len()
epScript.head()

Unnamed: 0,episode_id,episode_order,speaker,utterance,utter_list,program,title,episode_date,host_id,word_count
0,85414,0,liane hansen,on friday cia director george tenet announced...,"[friday, cia, director, george, tenet, announc...",Weekend Edition Sunday,Iraq WMD Questioned,2004-01-25,117.0,88
1,85414,1,david kay,good morning i m happy to be with you hanse...,"[good, morning, happy, hansen, since, deliveri...",Weekend Edition Sunday,Iraq WMD Questioned,2004-01-25,-1.0,72
2,85414,2,liane hansen,so prior to last year s invasion and your repo...,"[prior, last, year, invasion, report, october,...",Weekend Edition Sunday,Iraq WMD Questioned,2004-01-25,117.0,9
3,85414,3,david kay,not very much i think that s true,"[much, think, true]",Weekend Edition Sunday,Iraq WMD Questioned,2004-01-25,-1.0,3
4,85414,4,liane hansen,have you determined that you re never going to...,"[determined, never, going, find, clear, eviden...",Weekend Edition Sunday,Iraq WMD Questioned,2004-01-25,117.0,9


# break

In [20]:
# I found out that the way I aggregate the utterances together to be a huge list of words, there is an extra 
# [ and ' in the string of the first index and ' and ] in the string of the last index.
def remExtraBrackets(L):
    """
    Removes the extra apostrophes and brackets 
    at the first and last index of a list. 
    Returns the updated list as a string.
    input:
    L (list) - a list of strings with the first and last elements having 2 extra characters in the front and back respectively
    output:
    (string) - string form of the changed list
    """ 
    L[0] = L[0][2:]
    L[-1] = L[-1][:-2]
    # return a string because later we will make a list of the important words
    return ' '.join(L)

Creating a dataframe, byEpisode, that has been grouped by the episode_id and the non-repeating valued columns will then be concatenated to a list. We know that the index of one list, say at index 0 in host_id, correlates to the speaker name at index 0. We also will add a row to keep the total count of words said in a given episode with 'total_count.'

In [21]:
byEpisode = epScript.groupby(['episode_id','program','title','episode_date']
                             )['speaker','utterance','utter_list','host_id','word_count'].agg(list).reset_index()
byEpisode.sort_values(by=['episode_date','episode_id'],inplace=True,ascending=True)
byEpisode.reset_index(drop=True,inplace=True)
byEpisode['total_count'] = byEpisode.word_count.apply(lambda l: np.nansum(l))
byEpisode.head()

Unnamed: 0,episode_id,program,title,episode_date,speaker,utterance,utter_list,host_id,word_count,total_count
0,85414,Weekend Edition Sunday,Iraq WMD Questioned,2004-01-25,"[liane hansen, david kay, liane hansen, david ...",[on friday cia director george tenet announce...,"[[friday, cia, director, george, tenet, announ...","[117.0, -1.0, 117.0, -1.0, 117.0, -1.0, 117.0,...","[88, 72, 9, 3, 9, 64, 7, 38, 12, 50, 13, 13, 3...",1197
1,85620,Weekend Edition Saturday,Iraqis Vote for Local Council,2004-02-14,"[scott simon, scott simon, emily harris report...",[in iraq earlier today guerrillas attacked a ...,"[[iraq, earlier, today, guerrillas, attacked, ...","[12.0, 12.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....","[28, 58, 11, 2, 3, 31, 17, 48, 24, 38, 3, 25, ...",454
2,135286,Morning Edition,Study Sheds Light on Compulsive Hoarding,2004-06-07,"[steve inskeep, michelle trudeau reporting, ri...",[a new study focuses on the brains of compulsi...,"[[new, study, focuses, brains, compulsive, hoa...","[16.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....","[31, 18, 19, 37, 5, 4, 13, 19, 11, 4, 16, 11, ...",650
3,134619,Morning Edition,Toni Morrison's 'Good' Ghosts,2004-09-20,"[_no_speaker, steve inskeep, steve inskeep, st...","[nan, nan, this is morning edition from npr ne...","[[nan], [nan], [morning, edition, npr, news, s...","[-1.0, 16.0, 16.0, 16.0, 16.0, 16.0, 5.0, 5.0,...","[1, 1, 6, 1, 33, 1, 1, 34, 1, 19, 1, 30, 1, 2,...",627
4,84651,Talk of the Nation,"Calls for States' Rights, from the Left",2004-11-30,"[neal conan, neal conan, neal conan, neal cona...",[this is talk of the nation i m neal conan i...,"[[talk, nation, neal, conan, washington], [yea...","[7.0, 7.0, 7.0, 7.0, 7.0, 7.0, -1.0, 7.0, -1.0...","[5, 75, 41, 39, 16, 3, 2, 20, 34, 20, 30, 4, 4...",3375


Creating a dataframe, byHost, that has been grouped by the speaker who is a host (has a host_id of not -1) and the non-repeating valued columns will then be concatenated to a list. We know that the index of one list, say at index 0 in episode_id, correlates to the program at index 0. We also will add a row to keep the total count of words said in a given episode with 'total_count.'

In [None]:
# # we already know non hosts have a host_id of -1 since we set that above
# byHost = epScript[~(epScript['host_id'] == -1)].groupby(['speaker','host_id']
#                                                        )['episode_id','episode_order','utterance','utter_list',
#                                                          'program','title','episode_date','word_count'
#                                                         ].agg(list).reset_index()
# byHost.sort_values(by=['speaker','host_id'],inplace=True,ascending=True)
# byHost.reset_index(drop=True,inplace=True)
# byHost['total_count'] = byHost.word_count.apply(lambda l: np.nansum(l))
# byHost.head()

# Preparing DataFrames

#### Grouping the episodes by year where utterance has all scripts from that year as a string and avg_epCount is the average length of an episode that year.

In [22]:
# gathering text by year
textByYear = byEpisode.groupby(byEpisode['episode_date'].dt.year
                               ).agg({'utterance': lambda l:
                                      [word for sentence in l for word in str(sentence).split()],
                                      'utter_list': lambda l: 
                                        [item for sublist in l for item in sublist],
                                      'total_count': 'mean',
                                     }).reset_index()
textByYear['utterance'] = textByYear.utterance.apply(remExtraBrackets)
textByYear['utter_list'] = textByYear.utter_list.apply(lambda l: [item for sublist in l for item in sublist])
textByYear.rename(columns={'total_count':'avg_epCount'},inplace=True)
textByYear.head()

Unnamed: 0,episode_date,utterance,utter_list,avg_epCount
0,2004,on friday cia director george tenet announced ...,"[friday, cia, director, george, tenet, announc...",1064.285714
1,2005,this is talk of the nation i m neal conan in w...,"[talk, nation, neal, conan, washington, month,...",587.587842
2,2006,from npr news this is weekend edition i m lian...,"[npr, news, weekend, edition, liane, hansen, v...",577.478093
3,2007,from npr news this is news notes i m farai chi...,"[npr, news, news, notes, farai, chideya, outse...",570.710231
4,2008,from npr news this is news notes i m farai chi...,"[npr, news, news, notes, farai, chideya, first...",588.16344


#### Grouping the episodes by program where utterance has all scripts from that program as a string and avg_epCount is the average length of an episode for that program

In [None]:
# textByProg = byEpisode.groupby(byEpisode['program']
#                                ).agg({'utterance': lambda l:
#                                       [word for sentence in l for word in str(sentence).split()],
#                                       'utter_list': lambda l: 
#                                         [item for sublist in l for item in sublist],
#                                       'total_count': 'mean',
#                                      }).reset_index()
# textByProg['utterance'] = textByProg.utterance.apply(remExtraBrackets)
# textByProg['utter_list'] = textByProg.utter_list.apply(lambda l: [item for sublist in l for item in sublist])
# textByProg.rename(columns={'total_count':'avg_epCount'},inplace=True)

# textByProg.head()

Grouping the episodes by host where utterance has all scripts from that host as a string and avg_epCount is the average length of an episode for that host

In [None]:
# textByHost = byHost.groupby(byHost['speaker']
#                                ).agg({'utterance': lambda l:
#                                         [word for sentence in l for word in str(sentence).split()],
#                                       'utter_list': lambda l: 
#                                         [item for sublist in l for item in sublist],
#                                       'total_count': 'mean',
#                                      }).reset_index()
# textByHost['utterance'] = textByHost.utterance.apply(remExtraBrackets)
# textByHost['utter_list'] = textByHost.utter_list.apply(lambda l: [item for sublist in l for item in sublist])
# textByHost.rename(columns={'total_count':'avg_epCount'},inplace=True)
# textByHost.head()

# IGNORE - Download Ready To Go Data
You can use this to get the DataFrames without doing the work above. First run the cell below and then if you want to skip more cleaning and prepping then you can skip to: [Download More Ready To Go Data](#Download-More-Ready-To-Go-Data!)

In [None]:
# epScript = pd.read_csv('from-notebook/transcripts-divided.csv',parse_dates=['episode_date'])
# bySpeaker = pd.read_csv('from-notebook/transcripts-by-speaker.csv',parse_dates=['episode_date'])
# byEpisode = pd.read_csv('from-notebook/transcripts-by-episode.csv',parse_dates=['episode_date'])
# textByYear = pd.read_csv('from-notebook/transcripts-by-year.csv',index_col=0)
# textByProg = pd.read_csv('from-notebook/transcripts-by-program.csv',index_col=0)
# textByHost = pd.read_csv('from-notebook/transcripts-by-host.csv',index_col=0)

# More Prepping

Some functions to help with charting and plotting the frequency of words

In [None]:
# def createCounter(df):
#     """
#     Creates and returns a Counter for df on the words in 'utter_list'
#     input:
#     df (DataFrame) - DataFrame you want to count
#     output:
#     (Counter) - a Counter that has summed the occurences of each word 
#     in 'utter_list' of the DataFrame
#     """ 
#     return Counter(df.utter_list.sum())
# def dictOfCounters(df,L):
#     for e in L:
#         counter = createCounter(df)

In [None]:
# yearCounters = {}
# for year in textByYear.episode_date:
#     counter = createCounter(textByYear)
#     yearCounters[year] = counter
#     print('finished: ',year)

In [None]:
# progCounters = {}
# for program in textByProg.program:
#     counter = createCounter(textByProg)
#     progCounters[program] = counter
#     print('finished: ',program)

In [None]:
# # this doesn't represent the entire dataset since only using a portion of 
# # the hosts (top 10 hosts that speak the most) and excluding the guest speakers
# hostCounters = {}
# top10Hosts = textByHost.sort_values(by='avg_epCount',inplace=False,ascending=False)[:10]
# for host in top10Hosts.speaker:
#     counter = createCounter(textByHost)
#     hostCounters[host] = counter
#     print('finished: ',host)

# IGNORE - Download More Ready To Go Data!

In [None]:
# with open('from-notebook/yearCounters.txt', 'r') as file:
#     yearCounters = json.load(file)
# with open('from-notebook/progCounters.txt', 'r') as file:
#     progCounters = json.load(file)
# with open('from-notebook/hostCounters.txt', 'r') as file:
#     hostCounters = json.load(file)

In [None]:
# def toDictOfCounters(dictionary):
#     """
#     Converts a dictionary of dictionarys with words 
#     and counts and their respective keys and values 
#     to a dictionary of Counters
#     input:
#     dictionary (dict) - dictionary of dictionarys 
#     whose keys and values are words and their counts
#     output:
#     (dict) - a dictionary of Counters
#     """
#     for key in dictionary.keys():
#         dictionary[key] = Counter(dictionary[key])
#     return dictionary

In [None]:
# yearCounters = toDictOfCounters(yearCounters)
# progCounters = toDictOfCounters(progCounters)
# hostCounters = toDictOfCounters(hostCounters)

# Analyzing Data

### Frequency Analysis

In [None]:
# def countTopics(df, topics):
#     """
#     Creates and returns a Counter for df on the words in 'utter_list'
#     input:
#     df (DataFrame) - DataFrame you want to count
#     topics (list) - a list of strings that are topics to count
#     output:
#     nothing - this function counts the occurences of each string in
#     topics in df.utterance column and adds that count to a column 
#     with the name of the topic. instead of returning a new DataFrame,
#     it changes in place the given df
#     """
#     for t in topics:
#         df[t] = df.utterance.str.count(t)

In [None]:
# topics1 = ['women','mexico','united states',
#           'violence', 'health', 'education',
#           'border', 'president', 'isis',
#           'sexual assault', 'trump', 'terrorism']
# topics2 = ['women','politics','united states',
#           'violence', 'global warming', 'education',
#           'border', 'climate change', 'isis',
#           'sexual assault', 'trump', 'terrorism']
# topics3 = ['women','men','assault','reproduction',
#           'violence', 'education','marriage','trump',
#           'sexual assault','love', 'fact','sorry']

In [None]:
# year_summ = textByYear.groupby('episode_date', as_index=False)['utterance'].sum()
# countTopics(year_summ, topics1)
# year_summ = year_summ[['episode_date']+topics1]

# # Altair works better when data is in wide_form so we can easily 
# # convert the long form DataFrame using melt().
# year_summ = year_summ.melt('episode_date', var_name='word', value_name='count')

In [None]:
# # Currently looking at the words by years with each having its own graph
# alt.Chart(year_summ).mark_line().encode(
#     x='episode_date:O',
#     y='count:Q',
#     color='word:N'
# ).properties(
#     width=180,
#     height=180
# ).facet(
#     facet='word:N',
#     columns=3
# ).interactive()

Combine the graphs to show all the words over all the years.

In [None]:
# # 6. Make a graph to show the frequency with which various topics are discussed over the years. For example, ‘peace’
# # is consistently a popular word as is ‘freedom’ and ‘human rights’. What about ‘HIV’ or ‘terrorism’ or ‘global
# # warming’. Compare two phrases like ‘global warming’ and ‘climate change’.
# yearChart = alt.Chart(year_summ).mark_line().encode(
#     x='episode_date:O',
#     y='count:Q',
#     color='word:N'
# )
# yearChart

In [None]:
# prog_summ = textByProg.groupby('program', as_index=False)['utterance'].sum()
# countTopics(prog_summ, topics2)
# prog_summ = prog_summ[['program']+topics2]

# # Altair works better when data is in wide_form so we can easily 
# # convert the long form DataFrame using melt().
# prog_summ = prog_summ.melt('program', var_name='word', value_name='count')

In [None]:
# progChart = alt.Chart(prog_summ).mark_line().encode(
#     x='program:N',
#     y='count:Q',
#     color='word:N'
# )

In [None]:
# top10Hosts = textByHost.sort_values(by='avg_epCount',inplace=False,ascending=False)[:10]
# host_summ = top10Hosts.groupby('speaker', as_index=False)['utterance'].sum()
# countTopics(host_summ, topics3)
# host_summ = host_summ[['speaker']+topics3]

# # Altair works better when data is in wide_form so we can easily 
# # convert the long form DataFrame using melt().
# host_summ = host_summ.melt('speaker', var_name='word', value_name='count')

In [None]:
# hostChart = alt.Chart(host_summ).mark_line().encode(
#     x='speaker:N',
#     y='count:Q',
#     color='word:N'
# )

#### WordClouds reflecting frequency

In [None]:
# # make WordCloud from the counter
# def createWordCloud(counter):
#     import wordcloud
#     wc = wordcloud.WordCloud(background_color ='white')
#     # generate word cloud
#     wc.generate_from_frequencies(frequencies=counter)
#     return wc

In [None]:
# fig = plt.figure()
# i=0
# for prog in progCounters.keys():
#     ax = fig.add_subplot(2,1,i+1)
#     wordcloud = createWordCloud(progCounters[prog])
#     ax.imshow(wordcloud)
#     ax.axis('off')
#     i+=1

In [None]:
#     plt.imshow(wc, interpolation="bilinear")
#     plt.axis("off")
#     plt.show()
# createWordCloud(progCounters['All Things Considered']

#### Charting average words per episode

In [23]:
avgWordsPerEp = byEpisode['total_count'].mean()

# I created a dataframe that just has each year and the avergage
# length of all speeches called source
source = pd.DataFrame({
  'year': textByYear.episode_date,
  'avg': avgWordsPerEp
})
# so that I can create a single horizontal graph that I can
# overlay it with the byYeargraph!
avg = alt.Chart(source).mark_rule(color='red').encode(
    y='avg'
)

In [30]:
avg

In [29]:
alt.Chart(textByYear).mark_bar().encode(
    x='episode_date:O',
    y='avg_epCount:Q',
    color='episode_date:O'
)

In [None]:
# # bars for each year's own average speech length
# byYearBar = alt.Chart(textByYear).mark_bar().encode(
#     x='episode_date:O',
#     y='avg_epCount:Q',
#     color='episode_date:O'
# )

# # compare each country to the average speech length
# compare = (byYearBar+avg).properties(width=2000)
# compare

In [None]:
# # bars for each year's own average speech length
# byProgBar = alt.Chart(textByProg).mark_bar().encode(
#     x='program:N',
#     y='avg_epCount:Q',
#     color='program:N'
# )

# # compare each country to the average speech length
# compare = (byProgBar+avg).properties(width=2000)
# compare

In [None]:
# # bars for each year's own average speech length
# byHostBar = alt.Chart(textByHost).mark_bar().encode(
#     x='speaker:N',
#     y='avg_epCount:Q',
#     color='speaker:N'
# )

# # compare each country to the average speech length
# compare = (byHostBar+avg).properties(width=2000)
# compare

### Sentiment Analysis

In [None]:
# analyzer = SentimentIntensityAnalyzer()

# # credit goes to Google/Runestone for creating this function
# def score_text(text):
#     """
#     Calculates and returns the sentiment score of a text string
#     input:
#     text (string) - text you're analyzing the sentiment of
#     output:
#     (int) - a score between -1 and 1. -1 is the most negative
#     sentiment score a text can get and +1 is the most positive 
#     """ 
#     sentence_list = tokenize.sent_tokenize(text)
#     cscore = 0.0
#     for sent in sentence_list:
#         ss = analyzer.polarity_scores(sent)['compound']
#         cscore += ss
#     return cscore / len(sentence_list)

In [None]:
# epScript['sentiment'] = undf.text.map(lambda t : score_text(t))

In [None]:
# alt.data_transformers.enable('json')
# alt.Chart(epScript).mark_bar().encode(x=X('sentiment', bin=True), y='count()')

In [None]:
# sentByEpisode = epScript.groupby(['episode_id','program','title','episode_date'])['sentiment'].mean
# sentBySpeaker = epScript.groupby(['speaker','host_id'])['sentiment'].mean()
# sentByHost = pd.merge(host_ids,bySpeaker,on=['speaker','host_id'],how='left')
# sentByYear = epScript.groupby(['episode_date'].dt.year)['sentiment'].mean()
# sentByProgram = epScript.groupby(['program'])['sentiment'].mean()

#### Charting Sentiment

In [None]:
# # find avg sentiment
# avgSent = epScript.sentiment.mean()

# # I created a dataframe that just has each year and the avergage
# # sentiment in speeches
# source = pd.DataFrame({
#   'year': sentByYear.year,
#   'avg': avgSent
# })
# # so that I can create a single horizontal graph that I can
# # overlay it with the byYear graph
# avg = alt.Chart(source).mark_rule(color='red').encode(
#     y='avg'
# )

In [None]:
# # average sentiment over the years
# byYear = alt.Chart(sentByYear).mark_line().encode(
#     x='year:O',
#     y='sentiment:Q',
# )

# # compare each year's sentiment to the average
# compare = (byYear+avg)
# compare

In [None]:
# a chart that shows where regions are in terms of starting and mid-career salaries
# alt.Chart(typeAndRegion).mark_circle().encode(
#     x='Starting Median Salary',
#     y='Mid-Career Median Salary',
#     color='School Type:N',
#     tooltip=['School Name']
# ).properties(
#     width=400,
#     height=300,
#     title='Change in Salary by School Type'
# ).interactive()

# creates multiple heatmaps for each column in variables
# alt.Chart(states).mark_geoshape().encode(
#     alt.Color(alt.repeat('row'), type='quantitative')
# ).transform_lookup(
#     lookup='id',
#     from_=alt.LookupData(everything, 'id', variables)
# ).properties(
#     width=500,
#     height=300
# ).project(
#     type='albersUsa'
# ).repeat(
#     row=variables
# ).resolve_scale(
#     color='independent'
# )

# Saving Data
If you want to save everything created for easy use later

These DataFrames take a while to create so that's why you might want to just save them once you've done it once, they do take over 3 gigabytes of memory.

In [None]:
# textByYear.to_csv(r'from-notebook/transcripts-by-year.csv')
# textByProg.to_csv(r'from-notebook/transcripts-by-program.csv')
# textByHost.to_csv(r'from-notebook/transcripts-by-host.csv')

The Counters also take a while to create so you can save them to a .txt file and read them in with the code at [Download More Ready To Go Data](#Download-More-Ready-To-Go-Data)

In [None]:
# for year in textByYear.episode_date:
#     yearCounters[year] = dict(yearCounters[year])

# for program in textByProg.program:
#     progCounters[program] = dict(progCounters[program])

# for host in top10Hosts:
#     hostCounters[host] = dict(hostCounters[host])

# with open('from-notebook/yearCounters.txt', 'w') as file:
#     json.dump(yearCounters, file)

# with open('from-notebook/progCounters.txt', 'w') as file:
#     json.dump(progCounters, file)

# with open('from-notebook/hostCounters.txt', 'w') as file:
#     json.dump(hostCounters, file)

These files are really big so you might just want to rerun the code (it doesn't take too long) rather than saving because they have the same data but rearranged.

In [None]:
# epScript.to_csv(r'from-notebook/transcripts-divided.csv')
# bySpeaker.to_csv(r'from-notebook/transcripts-by-speaker.csv')
# byEpisode.to_csv(r'from-notebook/transcripts-by-episode.csv')

Unnecessary to save to csv so you should ignore this unless you REALLY want to and have a lot of room on your computer.

In [None]:
# utter.to_csv(r'from-notebook/updated-utter.csv')
# utter2.to_csv(r'from-notebook/updated-utter2.csv')
# episodes.to_csv(r'from-notebook/updated-episodes.csv')
# host_ids.to_csv(r'from-notebook/updated-host_ids.csv')
# host_map.to_csv(r'from-notebook/updated-host_map.csv')

# Unused Code

In [None]:
# DIDN'T END UP USING THESE BECAUSE I DIDN'T ANALYZE FREQUENCY HOW I THOUGHT I WOULD

# def createTopX(x, counter):
#     """
#     Creates and returns a dictionary of the top x words in counter.
#     input:
#     x (int) - int of the most frequent words desired
#     counter (Counter) - Counter from which we get the words from
#     output:
#     (dict) - a dictionary that has x keys as the words and their
#     corresponding values are the number of times it has appeared
#     """ 
#     return dict(counter.most_common(x))

# # mapping the two lists to the word and their count
# def map_counts(words, counts, d):
#     """
#     Maps words and their counts from a dictionary, d, to two separate lists, 
#     words and counts, in which their indices correspond with their values.
#     input:
#     words (list) - empty list to fill
#     counts (list) - empty list to fill
#     d (dict) - dictionary of words and their counts
#     output:
#     nothing - this function updates the already existing lists passed
#     and does not return anything
#     """ 
#     for word in d:
#         words.append(word)
#         counts.append(d[word])

# # create dataframe for each x (could be year, program, host, etc.) that has the words and frequency
# def createFreqDF(x,topX,dfName):
#     """
#     Creates and returns a dictionary of the top x words in counter.
#     input:
#     x (object) - by what type are we recording word count by, i.e. 
#     are we creating a DataFrame that is number of words by each year,
#     each program, host, or speaker?
#     topX (dict) - dictionary from which words we want to include
#     dfName (string) - allows us to return a DataFrame with this as
#     the variable name, doesn't exist outside of this function
#     output:
#     (DataFrame) - a DataFrame that has 3 columns, word, count, x
#     """ 
#     dfName = pd.DataFrame(columns=['year','word','count'])
#     words = []
#     counts = []
#     map_counts(words,counts,topX[x])
#     dfName['word'] = words
#     dfName['count'] = counts
#     dfName[x] = [x]*len(words)
#     return dfName

# yearTop20 = {}
# for year in textByYear.episode_date:
#     yearTop20[year] = createTopX(20,yearCounters[year])
# yearFreqDF = {}
# for year in textByYear.episode_date:
#     name='freq'+str(year)
#     yearFreqDF[year] = createFreqDF(year,yearTop20,name)