# Data Cleaning

In [1]:
# Web scraping, pickle imports
import requests
from bs4 import BeautifulSoup
import pickle

# Scrapes transcript data from scrapsfromtheloft.com
def url_to_transcript(url):
    '''Returns transcript data specifically from scrapsfromtheloft.com.'''
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml")
    text = [p.text for p in soup.find(class_="ast-container").find_all('p')]
    print(url)
    return text

# URLs of transcripts in scope
urls = ['https://scrapsfromtheloft.com/comedy/mike-epps-under-rated-never-faded-x-rated-transcript/',
        'https://scrapsfromtheloft.com/comedy/dave-chappelle-monologue-snl-2022-transcript/',
        'https://scrapsfromtheloft.com/comedy/iliza-shlesinger-hot-forever-transcript/',
        'https://scrapsfromtheloft.com/comedy/gabriel-iglesias-stadium-fluffy-transcript/',
        'https://scrapsfromtheloft.com/comedy/fortune-feimster-good-fortune-transcript/',
        'https://scrapsfromtheloft.com/comedy/deon-cole-charleens-boy-transcript/',
        'https://scrapsfromtheloft.com/comedy/neal-brennan-blocks-transcript/',
        'https://scrapsfromtheloft.com/comedy/trevor-noah-i-wish-you-would-transcript/',
        'https://scrapsfromtheloft.com/comedy/whitney-cummings-jokes-transcript/',
        'https://scrapsfromtheloft.com/comedy/kate-berlant-cinnamon-in-the-wind-transcript/',
        'https://scrapsfromtheloft.com/comedy/patton-oswalt-we-all-scream-transcript/',
        'https://scrapsfromtheloft.com/comedy/bill-burr-live-at-red-rocks-transcript/']

# Comedian names
comedians = ['mike', 'dave', 'iliza', 'gabriel', 'fortune', 'deon', 'neal', 'trevor', 'whitney', 'kate', 'patton', 'bill']

In [2]:
# # Actually request transcripts (takes a few minutes to run)
transcripts = [url_to_transcript(u) for u in urls]

https://scrapsfromtheloft.com/comedy/mike-epps-under-rated-never-faded-x-rated-transcript/
https://scrapsfromtheloft.com/comedy/dave-chappelle-monologue-snl-2022-transcript/
https://scrapsfromtheloft.com/comedy/iliza-shlesinger-hot-forever-transcript/
https://scrapsfromtheloft.com/comedy/gabriel-iglesias-stadium-fluffy-transcript/
https://scrapsfromtheloft.com/comedy/fortune-feimster-good-fortune-transcript/
https://scrapsfromtheloft.com/comedy/deon-cole-charleens-boy-transcript/
https://scrapsfromtheloft.com/comedy/neal-brennan-blocks-transcript/
https://scrapsfromtheloft.com/comedy/trevor-noah-i-wish-you-would-transcript/
https://scrapsfromtheloft.com/comedy/whitney-cummings-jokes-transcript/
https://scrapsfromtheloft.com/comedy/kate-berlant-cinnamon-in-the-wind-transcript/
https://scrapsfromtheloft.com/comedy/patton-oswalt-we-all-scream-transcript/
https://scrapsfromtheloft.com/comedy/bill-burr-live-at-red-rocks-transcript/


In [3]:
# Pickle files for later use

# Make a new directory to hold the text files
!mkdir transcripts

for i, c in enumerate(comedians):
    with open("transcripts/" + c + ".txt", "wb") as file:
        pickle.dump(transcripts[i], file)

In [5]:
# Load pickled files
data = {}
for i, c in enumerate(comedians):
    with open("transcripts/" + c + ".txt", "rb") as file:
        data[c] = pickle.load(file)

In [6]:
# Let's take a look at our data again
next(iter(data.keys()))

'mike'

In [None]:
# Notice that our dictionary is currently in key: comedian, value: list of text format
next(iter(data.values()))

In [8]:
# We are going to change this to key: comedian, value: string format
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ' '.join(list_of_text)
    return combined_text

In [9]:
# Combine it!
data_combined = {key: [combine_text(value)] for (key, value) in data.items()}

In [None]:
# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
pd.set_option('max_colwidth',150)

data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['transcript']
data_df = data_df.sort_index()
data_df

In [None]:
# Let's take a look at the transcript for Ali Wong
data_df.transcript.loc['neal']

In [12]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [None]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(data_df.transcript.apply(round1))
data_clean

In [14]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…*]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [None]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(data_clean.transcript.apply(round2))
data_clean

In [None]:
# Let's take a look at our dataframe
data_df

In [None]:
# Let's add the comedians' full names as well
full_names = ['Mike Epps', 'Dave Chappelle', 'Iliza Shlesinger', 'Gabriel Iglesias', 'Fortune Feimster', 'Deon Cole',
              'Neal Brennan', 'Trevor Noah', 'Whitney Cummings', 'Kate Berlant', 'Patton Oswalt', 'Bill Burr']

data_df['full_name'] = full_names
data_df

In [18]:
# Let's pickle it for later use
data_df.to_pickle("corpus.pkl")

In [None]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index
data_dtm

In [20]:
# Let's pickle it for later use
data_dtm.to_pickle("dtm.pkl")

In [21]:
# Let's also pickle the cleaned data (before we put it in document-term matrix format) and the CountVectorizer object
data_clean.to_pickle('data_clean.pkl')
pickle.dump(cv, open("cv.pkl", "wb"))

##ASSIGNMENT -3


# Exploratory Data Analysis

## Introduction

After the data cleaning step where we put our data into a few standard formats, the next step is to take a look at the data and see if what we're looking at makes sense. Before applying any fancy algorithms, it's always important to explore the data first.

When working with numerical data, some of the exploratory data analysis (EDA) techniques we can use include finding the average of the data set, the distribution of the data, the most common values, etc. The idea is the same when working with text data. We are going to find some more obvious patterns with EDA before identifying the hidden patterns with machines learning (ML) techniques. We are going to look at the following for each comedian:

1. **Most common words** - find these and create word clouds
2. **Size of vocabulary** - look number of unique words and also how quickly someone speaks
3. **Amount of profanity** - most common terms

## Most Common Words

### Analysis

In [None]:
# Read in the document-term matrix
import pandas as pd

data = pd.read_pickle('dtm.pkl')
data = data.transpose()
data.head()


In [None]:
# Find the top 30 words said by each comedian
top_dict = {}
for c in data.columns:
    top = data[c].sort_values(ascending=False).head(30)
    top_dict[c]= list(zip(top.index, top.values))

top_dict

In [None]:
# Print the top 15 words said by each comedian
for comedian, top_words in top_dict.items():
    print(comedian)
    print(', '.join([word for word, count in top_words[0:14]]))
    print('---')

**NOTE:** At this point, we could go on and create word clouds. However, by looking at these top words, you can see that some of them have very little meaning and could be added to a stop words list, so let's do just that.



In [None]:
# Look at the most common top words --> add them to the stop word list
from collections import Counter

# Let's first pull out the top 30 words for each comedian
words = []
for comedian in data.columns:
    top = [word for (word, count) in top_dict[comedian]]
    for t in top:
        words.append(t)
        
words

In [None]:
# Let's aggregate this list and identify the most common words along with how many routines they occur in
Counter(words).most_common()

In [None]:
# If more than half of the comedians have it as a top word, exclude it from the list
add_stop_words = [word for word, count in Counter(words).most_common() if count > 6]
add_stop_words

In [28]:
# Let's update our document-term matrix with the new list of stop words
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

# Read in cleaned data
data_clean = pd.read_pickle('data_clean.pkl')

# Add new stop words
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate document-term matrix
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(data_clean.transcript)
data_stop = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_stop.index = data_clean.index

# Pickle it for later use
import pickle
pickle.dump(cv, open("cv_stop.pkl", "wb"))
data_stop.to_pickle("dtm_stop.pkl")



In [29]:
# Let's make some word clouds!
# Terminal / Anaconda Prompt: conda install -c conda-forge wordcloud
from wordcloud import WordCloud

wc = WordCloud(stopwords=stop_words, background_color="white", colormap="Dark2",
               max_font_size=150, random_state=42)

### Analysis

In [None]:
# Find the number of unique words that each comedian uses

# Identify the non-zero items in the document-term matrix, meaning that the word occurs at least once
unique_list = []
for comedian in data.columns:
    uniques = data[comedian].to_numpy().nonzero()[0].size
    unique_list.append(uniques)

# Create a new dataframe that contains this unique word count
data_words = pd.DataFrame(list(zip(full_names, unique_list)), columns=['comedian', 'unique_words'])
data_unique_sort = data_words.sort_values(by='unique_words')
data_unique_sort

In [None]:
# Calculate the words per minute of each comedian

# Find the total number of words that a comedian uses
total_list = []
for comedian in data.columns:
    totals = sum(data[comedian])
    total_list.append(totals)
    
# Comedy special run times from IMDB, in minutes
run_times = [60, 59, 80, 60, 67, 73, 77, 63, 62, 58, 76, 79]

# Let's add some columns to our dataframe
data_words['total_words'] = total_list
data_words['run_times'] = run_times
data_words['words_per_minute'] = data_words['total_words'] / data_words['run_times']

# Sort the dataframe by words per minute to see who talks the slowest and fastest
data_wpm_sort = data_words.sort_values(by='words_per_minute')
data_wpm_sort

In [None]:
# Let's plot our findings
import numpy as np

y_pos = np.arange(len(data_words))

plt.subplot(1, 2, 1)
plt.barh(y_pos, data_unique_sort.unique_words, align='center')
plt.yticks(y_pos, data_unique_sort.comedian)
plt.title('Number of Unique Words', fontsize=20)

plt.subplot(1, 2, 2)
plt.barh(y_pos, data_wpm_sort.words_per_minute, align='center')
plt.yticks(y_pos, data_wpm_sort.comedian)
plt.title('Number of Words Per Minute', fontsize=20)

plt.tight_layout()
plt.show()

In [None]:
# Earlier I said we'd revisit profanity. Let's take a look at the most common words again.
Counter(words).most_common()

In [None]:
# Let's isolate just these bad words
data_bad_words = data.transpose()[['fucking', 'fuck', 'shit']]
data_profanity = pd.concat([data_bad_words.fucking + data_bad_words.fuck, data_bad_words.shit], axis=1)
data_profanity.columns = ['f_word', 's_word']
data_profanity

##ASSIGNMENT -4

##Sentiment Analysis

## Introduction

So far, all of the analysis we've done has been pretty generic - looking at counts, creating scatter plots, etc. These techniques could be applied to numeric data as well.

When it comes to text data, there are a few popular techniques that we'll be going through in the next few notebooks, starting with sentiment analysis. A few key points to remember with sentiment analysis.

1. **TextBlob Module:** Linguistic researchers have labeled the sentiment of words based on their domain expertise. Sentiment of words can vary based on where it is in a sentence. The TextBlob module allows us to take advantage of these labels.
2. **Sentiment Labels:** Each word in a corpus is labeled in terms of polarity and subjectivity (there are more labels as well, but we're going to ignore them for now). A corpus' sentiment is the average of these.
   * **Polarity**: How positive or negative a word is. -1 is very negative. +1 is very positive.
   * **Subjectivity**: How subjective, or opinionated a word is. 0 is fact. +1 is very much an opinion.

For more info on how TextBlob coded up its [sentiment function](https://planspace.org/20150607-textblob_sentiment/).

Let's take a look at the sentiment of the various transcripts, both overall and throughout the comedy routine.

## Sentiment of Routine

In [None]:
# We'll start by reading in the corpus, which preserves word order
import pandas as pd

data = pd.read_pickle('corpus.pkl')
data

In [None]:
# Create quick lambda functions to find the polarity and subjectivity of each routine
# Terminal / Anaconda Navigator: conda install -c conda-forge textblob
from textblob import TextBlob

pol = lambda x: TextBlob(x).sentiment.polarity
sub = lambda x: TextBlob(x).sentiment.subjectivity

data['polarity'] = data['transcript'].apply(pol)
data['subjectivity'] = data['transcript'].apply(sub)
data

In [None]:
# Let's plot the results
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = [10, 8]

for index, comedian in enumerate(data.index):
    x = data.polarity.loc[comedian]
    y = data.subjectivity.loc[comedian]
    plt.scatter(x, y, color='blue')
    plt.text(x+.001, y+.001, data['full_name'][index], fontsize=10)
    plt.xlim(-.01, .12) 
    
plt.title('Sentiment Analysis', fontsize=20)
plt.xlabel('<-- Negative -------- Positive -->', fontsize=15)
plt.ylabel('<-- Facts -------- Opinions -->', fontsize=15)

plt.show()

## Sentiment of Routine Over Time

Instead of looking at the overall sentiment, let's see if there's anything interesting about the sentiment over time throughout each routine.

In [41]:
# Split each routine into 10 parts
import numpy as np
import math

def split_text(text, n=10):
    '''Takes in a string of text and splits into n equal parts, with a default of 10 equal parts.'''

    # Calculate length of text, the size of each chunk of text and the starting points of each chunk of text
    length = len(text)
    size = math.floor(length / n)
    start = np.arange(0, length, size)
    
    # Pull out equally sized pieces of text and put it into a list
    split_list = []
    for piece in range(n):
        split_list.append(text[start[piece]:start[piece]+size])
    return split_list

In [None]:
# Let's take a look at our data again
data

In [None]:
# Let's create a list to hold all of the pieces of text
list_pieces = []
for t in data.transcript:
    split = split_text(t)
    list_pieces.append(split)
    
list_pieces

In [44]:
# The list has 10 elements, one for each transcript
len(list_pieces)

12

In [None]:
# Calculate the polarity for each piece of text

polarity_transcript = []
for lp in list_pieces:
    polarity_piece = []
    for p in lp:
        polarity_piece.append(TextBlob(p).sentiment.polarity)
    polarity_transcript.append(polarity_piece)
    
polarity_transcript

##ASSIGNMENT - 5

# Topic Modeling

## Introduction

Another popular text analysis technique is called topic modeling. The ultimate goal of topic modeling is to find various topics that are present in your corpus. Each document in the corpus will be made up of at least one topic, if not multiple topics.

In this notebook, we will be covering the steps on how to do **Latent Dirichlet Allocation (LDA)**, which is one of many topic modeling techniques. It was specifically designed for text data.

To use a topic modeling technique, you need to provide (1) a document-term matrix and (2) the number of topics you would like the algorithm to pick up.

Once the topic modeling technique is applied, your job as a human is to interpret the results and see if the mix of words in each topic make sense. If they don't make sense, you can try changing up the number of topics, the terms in the document-term matrix, model parameters, or even try a different model.

## Topic Modeling - Attempt #1 (All Text)

In [37]:
# Let's read in our document-term matrix
import pandas as pd
import pickle

data = pd.read_pickle('dtm_stop.pkl')
data

Unnamed: 0,aaaaaaah,aaaaaah,aaah,aah,aarp,abandon,abandoned,abc,abercrombie,ability,...,zhoosh,zip,zippers,zoned,zones,zoom,zoomed,álvarez,ándale,ñañaras
bill,0,0,0,0,1,0,0,0,0,1,...,0,0,0,1,0,2,0,0,0,0
dave,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
deon,0,0,0,0,0,0,1,0,0,0,...,0,1,1,0,0,0,0,0,0,0
fortune,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
gabriel,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,1,4,0,3,1,1
iliza,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,2,0,0,0,0
kate,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mike,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
neal,0,0,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
patton,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,2,0,0,0,0


In [38]:
# Import the necessary modules for LDA with gensim
# Terminal / Anaconda Navigator: conda install -c conda-forge gensim
from gensim import matutils, models
import scipy.sparse

# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [39]:
# One of the required inputs is a term-document matrix
tdm = data.transpose()
tdm.head()

Unnamed: 0,bill,dave,deon,fortune,gabriel,iliza,kate,mike,neal,patton,trevor,whitney
aaaaaaah,0,0,0,0,0,0,0,1,0,0,0,0
aaaaaah,0,0,0,0,0,0,0,0,0,0,0,1
aaah,0,0,0,0,1,0,0,0,0,0,0,1
aah,0,1,0,0,0,0,1,1,3,0,0,0
aarp,1,0,0,0,0,0,0,0,0,0,0,0


In [40]:
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [41]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
cv = pickle.load(open("cv_stop.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

Now that we have the corpus (term-document matrix) and id2word (dictionary of location: term), we need to specify two other parameters - the number of topics and the number of passes. Let's start the number of topics at 2, see if the results make sense, and increase the number from there.

In [42]:
# Now that we have the corpus (term-document matrix) and id2word (dictionary of location: term),
# we need to specify two other parameters as well - the number of topics and the number of passes
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
lda.print_topics()

[(0,
  '0.005*"think" + 0.005*"said" + 0.005*"want" + 0.005*"day" + 0.005*"hes" + 0.005*"time" + 0.004*"did" + 0.004*"ill" + 0.004*"uh" + 0.004*"say"'),
 (1,
  '0.010*"shit" + 0.006*"fuckin" + 0.005*"look" + 0.005*"want" + 0.005*"man" + 0.005*"did" + 0.005*"cause" + 0.005*"women" + 0.005*"said" + 0.005*"time"')]

In [43]:
# LDA for num_topics = 3
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10)
lda.print_topics()

[(0,
  '0.015*"shit" + 0.010*"ngga" + 0.008*"man" + 0.007*"women" + 0.007*"aint" + 0.007*"want" + 0.007*"look" + 0.006*"come" + 0.006*"girl" + 0.005*"motherfucker"'),
 (1,
  '0.007*"hes" + 0.007*"said" + 0.006*"time" + 0.006*"think" + 0.006*"day" + 0.005*"did" + 0.005*"thing" + 0.005*"cause" + 0.005*"didnt" + 0.005*"want"'),
 (2,
  '0.007*"fuckin" + 0.006*"did" + 0.006*"think" + 0.005*"shit" + 0.005*"cause" + 0.005*"theyre" + 0.005*"say" + 0.005*"said" + 0.005*"good" + 0.005*"want"')]

In [44]:
# LDA for num_topics = 4
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10)
lda.print_topics()

[(0,
  '0.001*"women" + 0.000*"want" + 0.000*"good" + 0.000*"theres" + 0.000*"think" + 0.000*"did" + 0.000*"look" + 0.000*"shes" + 0.000*"say" + 0.000*"time"'),
 (1,
  '0.012*"fuckin" + 0.007*"did" + 0.006*"shit" + 0.006*"good" + 0.006*"want" + 0.005*"think" + 0.005*"cause" + 0.005*"say" + 0.005*"time" + 0.005*"said"'),
 (2,
  '0.017*"shit" + 0.010*"ngga" + 0.009*"man" + 0.007*"aint" + 0.005*"come" + 0.005*"women" + 0.005*"yall" + 0.005*"fuck" + 0.005*"look" + 0.005*"said"'),
 (3,
  '0.007*"hes" + 0.006*"want" + 0.005*"said" + 0.005*"think" + 0.005*"time" + 0.005*"did" + 0.005*"day" + 0.005*"look" + 0.005*"little" + 0.004*"didnt"')]

These topics aren't looking too great. We've tried modifying our parameters. Let's try modifying our terms list as well.

## Topic Modeling - Attempt #2 (Nouns Only)

One popular trick is to look only at terms that are from one part of speech (only nouns, only adjectives, etc.). Check out the UPenn tag set: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html.

In [45]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [46]:
# Read in the cleaned data, before the CountVectorizer step
data_clean = pd.read_pickle('data_clean.pkl')
data_clean

Unnamed: 0,transcript
bill,ladies and gentlemen bill burr all right thank you thank you very much thank you thank you thank you how are ya hows it goin all right you guys...
dave,original air date november ladies and gentlemen dave chappelle ♪♪ ♪♪ ♪♪ thank you thank you very much for being here before i start tonight ...
deon,oh this water is so good i dont know why i was so thirsty but anyway i feel comfortable now it feels real good in here yeah its so good to see y...
fortune,please welcome fortune feimster ♪ im a powerful woman ♪ ♪ always get what i want ♪ ♪ so dont you get in my way now thats not what i want ♪ ♪ ca...
gabriel,can you please state your name martin moreno but you might know me as martinnnnn ive been touring with gabriel iglesias for years martinnnnn and...
iliza,cleveland ohio thank you thank you so much this is so great this is so nice to be here with you in public were not stuck at home doing this for ...
kate,whoa okay yeah good okay dont embarrass yourself okay ohh the expectations crushing i would argue absolutely crushing debilitating in every way im...
mike,hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo if youre born and raised in the d make some noise and give it up from hollywoo...
neal,all right let me explain friend of mine former friend well call her is an artist right and the theme of our friendship is kind of feeling alon...
patton,hello denver oh my god hello thank you thank you thank you thank you oh my god yes thank you all so much for coming out tonight um hey i brok...


In [47]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [48]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns = pd.DataFrame(data_clean.transcript.apply(nouns))
data_nouns

Unnamed: 0,transcript
bill,ladies gentlemen bill thank hows sit see year half man state level level man everybody theyre goin face pajamas shit man i people bein i bunch lie...
dave,air date november ladies gentlemen ♪♪ ♪♪ ♪♪ thank tonight i statement i denounce antisemitism forms friends community kanye time years career i wo...
deon,water i i netflix cole seminar cole cole seminar people school gon peanut butter relationships sex ill feeling thank man appreciate yall man mothe...
fortune,fortune feimster ♪ woman ♪ ♪ i ♪ ♪ way i ♪ ♪ cause woman ♪ ♪ i ♪ ♪ way i ♪ ♪ cause woman ♪ ♪ i ♪ ♪ way i ♪ ♪ cause woman ♪ ♪ woman man stop chicag...
gabriel,state name martin moreno martinnnnn gabriel iglesias years hes name years dude thats marriages win journey garages clubs rooms theaters arenas wor...
iliza,cleveland home likes thats blight history one people key freedom i need book ill likes money tiktok youre smokin girls skill dudes job chloe futur...
kate,whoa okay okay dont embarrass expectations i way im gon cause ill kate kate right i know comedian uh stand cameras night camera camera huh moment ...
mike,hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo youre d noise hollywood hood brother — mike epps lets cabbage cabbage patch papa c...
neal,friend friend artist right theme friendship kind world right show feeling i script i hey backdrop i days alienation grade speech therapists office...
patton,hello denver god hey i foot yeah year i foot parts part curb i part part part i everythings fatal i i twenties propellers bactine i pine cone spin...


In [49]:
# Create a new document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns.transcript)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtmn.index = data_nouns.index
data_dtmn



Unnamed: 0,aaaaaah,aaah,aah,abc,ability,abortion,abuse,ac,academy,accent,...,youth,youve,zah,zaras,zeke,zhoosh,zippers,zones,zoom,álvarez
bill,0,0,0,0,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
dave,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
deon,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
fortune,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
gabriel,0,1,0,1,0,0,0,1,6,0,...,0,2,0,0,0,0,0,1,3,3
iliza,0,0,0,0,0,0,0,0,0,0,...,0,6,0,0,1,0,0,0,2,0
kate,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
mike,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
neal,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
patton,0,0,0,0,0,0,0,1,0,0,...,1,4,0,0,0,1,0,0,2,0


In [50]:
# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [51]:
# Let's start with 2 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.011*"shit" + 0.008*"man" + 0.008*"fuck" + 0.007*"gon" + 0.007*"thing" + 0.007*"way" + 0.007*"hes" + 0.006*"cause" + 0.006*"day" + 0.006*"women"'),
 (1,
  '0.010*"man" + 0.008*"day" + 0.008*"women" + 0.008*"cause" + 0.007*"hes" + 0.007*"way" + 0.006*"thing" + 0.006*"life" + 0.006*"gon" + 0.006*"okay"')]

In [52]:
# Let's try topics = 3
ldan = models.LdaModel(corpus=corpusn, num_topics=3, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.011*"women" + 0.008*"cause" + 0.007*"way" + 0.007*"man" + 0.007*"kids" + 0.007*"thing" + 0.007*"gon" + 0.007*"shit" + 0.007*"shes" + 0.007*"guy"'),
 (1,
  '0.013*"day" + 0.011*"hes" + 0.008*"life" + 0.008*"thing" + 0.008*"okay" + 0.007*"man" + 0.007*"way" + 0.006*"gon" + 0.006*"curry" + 0.005*"cause"'),
 (2,
  '0.012*"man" + 0.011*"shit" + 0.007*"hes" + 0.007*"way" + 0.007*"gon" + 0.006*"cause" + 0.006*"day" + 0.006*"ngga" + 0.006*"motherfucker" + 0.006*"thing"')]

In [53]:
# Let's try 4 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.017*"day" + 0.014*"hes" + 0.011*"thing" + 0.009*"curry" + 0.009*"man" + 0.008*"life" + 0.008*"order" + 0.006*"friends" + 0.006*"news" + 0.006*"food"'),
 (1,
  '0.013*"shit" + 0.013*"man" + 0.009*"cause" + 0.009*"women" + 0.008*"thing" + 0.008*"way" + 0.007*"gon" + 0.007*"fuck" + 0.007*"kids" + 0.006*"hes"'),
 (2,
  '0.009*"jax" + 0.006*"room" + 0.006*"cause" + 0.006*"car" + 0.006*"lot" + 0.005*"man" + 0.005*"picture" + 0.005*"guy" + 0.005*"shes" + 0.004*"home"'),
 (3,
  '0.010*"okay" + 0.009*"women" + 0.008*"gon" + 0.008*"way" + 0.007*"girl" + 0.007*"hes" + 0.006*"shes" + 0.006*"day" + 0.006*"life" + 0.006*"work"')]

## Topic Modeling - Attempt #3 (Nouns and Adjectives)

In [54]:
# Let's create a function to pull out nouns from a string of text
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [55]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns_adj = pd.DataFrame(data_clean.transcript.apply(nouns_adj))
data_nouns_adj

Unnamed: 0,transcript
bill,ladies gentlemen bill right thank ya hows sit see im funny ive year half nice denver man legal state level federal level man everybody theyre goin...
dave,original air date november ladies gentlemen ♪♪ ♪♪ ♪♪ thank i tonight i brief statement i i denounce antisemitism forms i friends jewish community ...
deon,water good i i thirsty i comfortable real good good netflix cole seminar cole cole seminar people school gon peanut butter relationships sex ill y...
fortune,welcome fortune feimster ♪ powerful woman ♪ ♪ i ♪ ♪ way i ♪ ♪ cause powerful woman ♪ ♪ i ♪ ♪ way i ♪ ♪ cause powerful woman ♪ ♪ i ♪ ♪ way i ♪ ♪ ca...
gabriel,state name martin moreno martinnnnn gabriel iglesias years yeah hes name years martinnnnn dude thats most marriages win incredible journey garages...
iliza,cleveland great nice public home ten likes thats real blight american history one people i key financial freedom i need book ill ten likes money t...
kate,whoa okay good okay dont embarrass expectations i way im gon cause thin ill kate kate right i know comedian uh mic stand cameras big night big cam...
mike,hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo hoo youre d noise hollywood hood brother — mike epps lets right cabbage cabbage patch ...
neal,right friend mine former friend artist right theme friendship kind world right i show feeling i script i hey backdrop i days alienation more secon...
patton,hello denver god god much tonight hey i foot yeah i year i foot parts second part i curb i wrong second part first important part i crucial part i...


In [64]:
# Create a new document-term matrix using only nouns and adjectives, also remove common words with max_df
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.transcript)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna



Unnamed: 0,aaaaaaah,aaaaaah,aaah,aah,aarp,abandoned,abc,abercrombie,ability,able,...,youve,zah,zaras,zeke,zenlike,zhoosh,zippers,zones,zoom,álvarez
bill,0,0,0,0,1,0,0,0,1,1,...,1,0,0,0,1,0,0,0,2,0
dave,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
deon,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
fortune,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,1,0
gabriel,0,0,1,0,0,0,1,0,0,0,...,2,0,0,0,0,0,0,1,3,3
iliza,0,0,0,0,0,0,0,2,0,2,...,7,0,0,1,0,0,0,0,2,0
kate,0,0,0,1,0,0,0,0,0,2,...,1,0,1,0,0,0,0,0,0,0
mike,1,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
neal,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
patton,0,0,0,0,0,0,0,0,0,2,...,4,0,0,0,0,1,0,0,2,0


In [65]:
# Create the gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# Create the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [66]:
# Let's start with 2 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.013*"fuckin" + 0.008*"fuck" + 0.004*"guys" + 0.003*"wrong" + 0.003*"wife" + 0.003*"girls" + 0.003*"bra" + 0.003*"weird" + 0.002*"dad" + 0.002*"pubes"'),
 (1,
  '0.007*"ngga" + 0.004*"motherfucker" + 0.004*"bitch" + 0.004*"ass" + 0.003*"indian" + 0.003*"mom" + 0.003*"guys" + 0.003*"fuck" + 0.003*"yall" + 0.003*"fact"')]

In [67]:
# Let's try 3 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.014*"fuckin" + 0.008*"fuck" + 0.004*"guys" + 0.003*"sex" + 0.003*"kid" + 0.003*"dude" + 0.003*"wife" + 0.003*"girls" + 0.003*"idea" + 0.003*"weird"'),
 (1,
  '0.010*"ngga" + 0.006*"motherfucker" + 0.005*"ass" + 0.005*"bitch" + 0.005*"fuck" + 0.004*"dog" + 0.004*"yall" + 0.004*"guys" + 0.004*"dogs" + 0.003*"ha"'),
 (2,
  '0.010*"indian" + 0.007*"curry" + 0.005*"order" + 0.004*"normal" + 0.004*"news" + 0.004*"trevor" + 0.003*"family" + 0.003*"food" + 0.003*"german" + 0.003*"moment"')]

In [60]:
# Let's try 4 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.014*"fuckin" + 0.007*"ngga" + 0.005*"fuck" + 0.003*"fact" + 0.003*"wife" + 0.003*"story" + 0.003*"guys" + 0.003*"dogs" + 0.003*"ass" + 0.003*"dog"'),
 (1,
  '0.006*"fuck" + 0.005*"pubes" + 0.004*"wrong" + 0.004*"liberal" + 0.003*"guys" + 0.003*"barn" + 0.003*"sex" + 0.003*"weird" + 0.003*"relationships" + 0.003*"privilege"'),
 (2,
  '0.009*"indian" + 0.006*"curry" + 0.004*"order" + 0.004*"normal" + 0.004*"news" + 0.004*"kid" + 0.004*"guys" + 0.003*"sex" + 0.003*"trevor" + 0.003*"mom"'),
 (3,
  '0.012*"motherfucker" + 0.010*"bitch" + 0.008*"fuck" + 0.007*"ngga" + 0.006*"bra" + 0.005*"girls" + 0.005*"ass" + 0.005*"bed" + 0.004*"older" + 0.004*"young"')]

## Identify Topics in Each Document

Out of the 9 topic models we looked at, the nouns and adjectives, 4 topic one made the most sense. So let's pull that down here and run it through some more iterations to get more fine-tuned topics.

In [68]:
# Our final LDA model (for now)
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=80)
ldana.print_topics()

[(0,
  '0.012*"indian" + 0.008*"curry" + 0.006*"order" + 0.005*"normal" + 0.005*"trevor" + 0.004*"news" + 0.004*"food" + 0.004*"family" + 0.004*"german" + 0.004*"moment"'),
 (1,
  '0.010*"ngga" + 0.007*"motherfucker" + 0.006*"bitch" + 0.005*"ass" + 0.005*"girls" + 0.004*"fuck" + 0.004*"yall" + 0.003*"fact" + 0.003*"dog" + 0.003*"aint"'),
 (2,
  '0.021*"fuckin" + 0.009*"fuck" + 0.005*"guys" + 0.005*"dude" + 0.005*"kid" + 0.004*"jax" + 0.003*"minutes" + 0.003*"lesbian" + 0.003*"wife" + 0.003*"somebody"'),
 (3,
  '0.006*"fuck" + 0.005*"pubes" + 0.004*"wrong" + 0.004*"liberal" + 0.003*"guys" + 0.003*"barn" + 0.003*"relationships" + 0.003*"weird" + 0.003*"sex" + 0.003*"privilege"')]

These four topics look pretty decent. Let's settle on these for now.
* Topic 0: mom, parents
* Topic 1: husband, wife
* Topic 2: guns
* Topic 3: profanity

In [69]:
# Let's take a look at which topics each transcript contains
corpus_transformed = ldana[corpusna]
list(zip([a for [(a,b)] in corpus_transformed], data_dtmna.index))

[(2, 'bill'),
 (1, 'dave'),
 (1, 'deon'),
 (2, 'fortune'),
 (1, 'gabriel'),
 (1, 'iliza'),
 (0, 'kate'),
 (1, 'mike'),
 (3, 'neal'),
 (3, 'patton'),
 (0, 'trevor'),
 (2, 'whitney')]

For a first pass of LDA, these kind of make sense to me, so we'll call it a day for now.
* Topic 0: mom, parents [Anthony, Hasan, Louis, Ricky]
* Topic 1: husband, wife [Ali, John, Mike]
* Topic 2: guns [Bill, Bo, Jim]
* Topic 3: profanity [Dave, Joe]

### Assignment:
1. Try further modifying the parameters of the topic models above and see if you can get better topics.
2. Create a new topic model that includes terms from a different [part of speech](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html) and see if you can get better topics.

Passes is the number of times you want to go through the entire corpus.  it will output the most probable words that appear in each topic. 

In [70]:
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=50)
ldana.print_topics()

[(0,
  '0.005*"kid" + 0.004*"stuff" + 0.004*"sex" + 0.004*"guys" + 0.004*"dude" + 0.004*"older" + 0.004*"younger" + 0.004*"boobs" + 0.003*"course" + 0.003*"trouble"'),
 (1,
  '0.007*"motherfucker" + 0.005*"bitch" + 0.005*"fuck" + 0.005*"dog" + 0.004*"ass" + 0.004*"dogs" + 0.004*"guys" + 0.004*"ngga" + 0.003*"fact" + 0.003*"ha"'),
 (2,
  '0.019*"fuckin" + 0.009*"fuck" + 0.007*"indian" + 0.005*"curry" + 0.004*"pubes" + 0.003*"order" + 0.003*"wife" + 0.003*"dad" + 0.003*"normal" + 0.003*"news"'),
 (3,
  '0.013*"ngga" + 0.009*"girls" + 0.006*"bra" + 0.005*"motherfcker" + 0.005*"btch" + 0.004*"bed" + 0.004*"hoo" + 0.004*"btches" + 0.004*"tiktok" + 0.004*"yall"')]

In [71]:
# Our final LDA model (for now)
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=100)
ldana.print_topics()

[(0,
  '0.026*"fuckin" + 0.016*"fuck" + 0.011*"motherfucker" + 0.008*"bitch" + 0.007*"ngga" + 0.005*"ass" + 0.005*"guys" + 0.004*"somebody" + 0.004*"dog" + 0.004*"goddamn"'),
 (1,
  '0.005*"guys" + 0.003*"dogs" + 0.003*"girls" + 0.003*"sex" + 0.003*"fuck" + 0.003*"bra" + 0.003*"fact" + 0.003*"story" + 0.003*"wrong" + 0.003*"mom"'),
 (2,
  '0.013*"ngga" + 0.010*"indian" + 0.007*"curry" + 0.005*"order" + 0.005*"motherfcker" + 0.005*"black" + 0.005*"btch" + 0.004*"normal" + 0.004*"news" + 0.004*"trevor"'),
 (3,
  '0.006*"jax" + 0.005*"pubes" + 0.004*"wife" + 0.004*"picture" + 0.003*"fuck" + 0.003*"barn" + 0.003*"ice" + 0.003*"weird" + 0.003*"sex" + 0.003*"line"')]

##INFERENCE

##ADVERB

In [72]:
# Let's create a function to pull out adverbs from a string of text
def adverb(text):
    '''Given a string of text, tokenize the text and pull out only the adverb.'''
    is_adverb = lambda pos: pos[:2] == 'RB'
    tokenized = word_tokenize(text)
    adverb = [word for (word, pos) in pos_tag(tokenized) if is_adverb(pos)] 
    return ' '.join(adverb)

In [73]:
# Apply the adverbs function to the transcripts to filter only on adverbs
data_adverb = pd.DataFrame(data_clean.transcript.apply(adverb))
data_adverb

Unnamed: 0,transcript
bill,very much all right inside just here not here inside so inside so just totally cover then nose then america america well i never never never so ta...
dave,very much here just probably now early never together never normally immediately first just just even vaguely im then just up all so not just arou...
deon,so so anyway now here so na na else out tonight much lately even too always instead na yeah ill id now youre too still here really then really jus...
fortune,always so dont now not always so dont now not always so dont now not always here right south just just yall not not not not very literally actuall...
gabriel,martinnnnn better now no longer so ever so else ♪♪ now ya never i didnt so so very here already so just finally right antonio now ever here only t...
iliza,so much so so here not right just never ugly only now oh never now here never never very just youd never just always well not side well so come no...
kate,yeah absolutely so so then youll so so really so everywhere inherently not actually inherently yes right yeah very even absolutely very just i act...
mike,right right here here i right here right here here right here right here always yall up right here just here nggas here again again totally brothe...
neal,well alone then less i so im smoothly just not away up technically more well start right so never right so back so maybe just never then i not nev...
patton,yes so most once i now dont then all right almost so i well probably all much once even then then not just even down down most still just actually...


In [74]:
# Create a new document-term matrix using only adverbs, also remove common words with max_df
cvn = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvn = cvn.fit_transform(data_adverb.transcript)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtmn.index = data_adverb.index
data_dtmn



Unnamed: 0,aah,absolutely,accidentally,actively,activism,actually,adequately,adulthood,agh,ago,...,yakyakyak,yall,yard,yay,yeeroh,yell,yes,youd,youll,youve
bill,0,3,1,0,0,7,0,0,0,0,...,1,0,0,0,0,1,3,3,0,1
dave,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
deon,0,0,0,0,0,0,0,0,0,0,...,0,5,0,0,0,0,2,0,3,0
fortune,0,3,0,0,0,6,0,0,1,0,...,0,2,1,0,0,0,1,1,1,0
gabriel,0,1,0,0,0,3,0,0,0,5,...,0,0,0,1,2,0,2,0,0,1
iliza,0,0,1,0,0,2,1,0,0,2,...,0,0,0,0,0,0,3,1,2,8
kate,0,7,0,1,1,33,0,1,0,1,...,0,0,0,0,0,0,1,0,1,0
mike,0,0,0,0,0,0,0,0,0,0,...,0,3,0,0,0,0,0,0,0,1
neal,1,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,2,1,1
patton,0,1,0,0,0,8,0,0,0,1,...,0,0,0,0,0,0,3,1,0,0


In [75]:
# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [76]:
# Let's start with 2 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.065*"actually" + 0.028*"na" + 0.026*"finally" + 0.023*"ta" + 0.022*"okay" + 0.017*"immediately" + 0.017*"long" + 0.016*"absolutely" + 0.014*"yes" + 0.013*"exactly"'),
 (1,
  '0.037*"pretty" + 0.028*"na" + 0.028*"ngga" + 0.026*"ta" + 0.021*"yall" + 0.016*"ill" + 0.016*"lonely" + 0.015*"totally" + 0.012*"long" + 0.011*"silly"')]

In [77]:
# Let's start with 3 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=3, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.073*"actually" + 0.028*"okay" + 0.024*"ta" + 0.023*"immediately" + 0.020*"finally" + 0.019*"absolutely" + 0.018*"longer" + 0.017*"youve" + 0.016*"literally" + 0.016*"forward"'),
 (1,
  '0.039*"pretty" + 0.031*"na" + 0.028*"ngga" + 0.026*"ta" + 0.026*"yall" + 0.020*"finally" + 0.016*"ill" + 0.016*"lonely" + 0.015*"actually" + 0.013*"youll"'),
 (2,
  '0.049*"na" + 0.034*"actually" + 0.031*"totally" + 0.022*"finally" + 0.022*"ta" + 0.020*"far" + 0.019*"youd" + 0.019*"long" + 0.017*"pretty" + 0.017*"exactly"')]

In [78]:
# Let's start with 4 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.085*"actually" + 0.038*"finally" + 0.023*"na" + 0.021*"absolutely" + 0.019*"long" + 0.018*"okay" + 0.017*"immediately" + 0.016*"totally" + 0.016*"basically" + 0.016*"forward"'),
 (1,
  '0.052*"ngga" + 0.038*"ta" + 0.038*"na" + 0.038*"yall" + 0.029*"lonely" + 0.024*"ill" + 0.020*"pretty" + 0.020*"silly" + 0.020*"alright" + 0.015*"youll"'),
 (2,
  '0.040*"pretty" + 0.039*"okay" + 0.036*"youve" + 0.020*"constantly" + 0.013*"actually" + 0.013*"youd" + 0.013*"ago" + 0.013*"eventually" + 0.013*"yes" + 0.013*"forward"'),
 (3,
  '0.043*"ta" + 0.043*"actually" + 0.041*"na" + 0.020*"youd" + 0.020*"immediately" + 0.019*"long" + 0.019*"literally" + 0.017*"pretty" + 0.017*"yes" + 0.017*"anymore"')]

In [79]:
#increase no of passes
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=50)
ldan.print_topics()

[(0,
  '0.066*"pretty" + 0.026*"ngga" + 0.016*"eventually" + 0.016*"ill" + 0.016*"yall" + 0.016*"basically" + 0.011*"completely" + 0.011*"nggas" + 0.011*"youd" + 0.011*"recently"'),
 (1,
  '0.054*"na" + 0.050*"ta" + 0.043*"actually" + 0.027*"finally" + 0.023*"long" + 0.019*"far" + 0.018*"totally" + 0.018*"ill" + 0.017*"hard" + 0.017*"yes"'),
 (2,
  '0.119*"actually" + 0.045*"okay" + 0.028*"youve" + 0.024*"absolutely" + 0.018*"constantly" + 0.018*"instead" + 0.018*"immediately" + 0.014*"recently" + 0.014*"yes" + 0.014*"finally"'),
 (3,
  '0.026*"basically" + 0.023*"forward" + 0.023*"longer" + 0.022*"finally" + 0.021*"ago" + 0.020*"immediately" + 0.018*"recently" + 0.017*"long" + 0.016*"actually" + 0.015*"theyd"')]

In [80]:
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=80)
ldan.print_topics()

[(0,
  '0.057*"ta" + 0.050*"na" + 0.027*"ngga" + 0.027*"lonely" + 0.023*"ill" + 0.023*"yall" + 0.019*"yes" + 0.018*"silly" + 0.018*"alright" + 0.015*"close"'),
 (1,
  '0.167*"actually" + 0.036*"absolutely" + 0.026*"immediately" + 0.021*"recently" + 0.021*"okay" + 0.016*"instead" + 0.016*"truly" + 0.011*"usually" + 0.011*"close" + 0.011*"ill"'),
 (2,
  '0.030*"finally" + 0.030*"actually" + 0.027*"youve" + 0.027*"okay" + 0.024*"na" + 0.019*"constantly" + 0.017*"youd" + 0.017*"mentally" + 0.014*"theyd" + 0.014*"totally"'),
 (3,
  '0.033*"actually" + 0.030*"na" + 0.030*"pretty" + 0.030*"ta" + 0.025*"finally" + 0.021*"long" + 0.018*"basically" + 0.018*"inside" + 0.018*"totally" + 0.018*"far"')]

In [123]:
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=100)
ldan.print_topics()

[(0,
  '0.044*"finally" + 0.038*"actually" + 0.026*"mentally" + 0.026*"hard" + 0.026*"na" + 0.020*"absolutely" + 0.014*"possibly" + 0.014*"exactly" + 0.014*"yall" + 0.014*"long"'),
 (1,
  '0.051*"okay" + 0.045*"youve" + 0.029*"constantly" + 0.018*"yes" + 0.018*"tight" + 0.012*"ago" + 0.012*"tonight" + 0.012*"youll" + 0.012*"barely" + 0.012*"instead"'),
 (2,
  '0.030*"na" + 0.027*"pretty" + 0.022*"finally" + 0.021*"long" + 0.021*"ta" + 0.019*"totally" + 0.019*"basically" + 0.018*"ngga" + 0.016*"actually" + 0.015*"forward"'),
 (3,
  '0.109*"actually" + 0.046*"ta" + 0.032*"na" + 0.025*"absolutely" + 0.025*"immediately" + 0.019*"literally" + 0.019*"ill" + 0.019*"okay" + 0.016*"yes" + 0.014*"close"')]

In [124]:
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=200)
ldan.print_topics()

[(0,
  '0.032*"finally" + 0.029*"ta" + 0.028*"actually" + 0.027*"longer" + 0.022*"long" + 0.021*"na" + 0.020*"inside" + 0.020*"literally" + 0.020*"immediately" + 0.018*"forward"'),
 (1,
  '0.109*"actually" + 0.042*"ta" + 0.038*"na" + 0.023*"absolutely" + 0.020*"ill" + 0.018*"yall" + 0.016*"yes" + 0.016*"immediately" + 0.015*"close" + 0.015*"lonely"'),
 (2,
  '0.036*"okay" + 0.033*"pretty" + 0.027*"totally" + 0.027*"youve" + 0.027*"na" + 0.021*"finally" + 0.018*"actually" + 0.015*"constantly" + 0.015*"barely" + 0.015*"exactly"'),
 (3,
  '0.024*"pretty" + 0.024*"ngga" + 0.024*"na" + 0.020*"youd" + 0.020*"recently" + 0.020*"totally" + 0.020*"long" + 0.015*"theyd" + 0.015*"anymore" + 0.015*"jelly"')]

##VERB

In [82]:
# Let's create a function to pull out verbs from a string of text
def verb(text):
    '''Given a string of text, tokenize the text and pull out only the verb.'''
    is_verb = lambda pos: pos[:2] == 'VB'
    tokenized = word_tokenize(text)
    verb = [word for (word, pos) in pos_tag(tokenized) if is_verb(pos)] 
    return ' '.join(verb)

In [83]:
# Apply the verbs function to the transcripts to filter only on verbs
data_verb = pd.DataFrame(data_clean.transcript.apply(verb))
data_verb

Unnamed: 0,transcript
bill,burr thank thank thank are goin guys standin sit been right be dressing hiking guys have guys have guys been has been enjoyed been staying are bei...
dave,dave chappelle thank being start wanted read prepared stand is buy got tell guys been doing learned are say are ive heard do said gotten pull was ...
deon,is dont know was feel feels yeah see babe are ive been thinking are doing is gon be blooded did is gon be blooded taught know taught blooded taugh...
fortune,please im get want get thats want im get want get thats want im get want get thats want im get i ♪ oh stop whats going being has transpired has de...
gabriel,please know ive been touring been screaming hurry thats has been weve gone living say is selling thats subjective expect los has seen do make stic...
iliza,thank thank is is be were stuck doing ah danced know did was wants talk danced danced do enough dont read do i hope works were gon make dancing wa...
kate,okay crushing argue crushing debilitating move im disappear be is is is dont watch is is yeah theres theres know see see right filming filming are...
mike,born raised make give go c fcking stinky stinky stinky doug whats e look tonight damn get slapped nggas are smell gunpowder had come detroit do is...
neal,let explain call is is feeling wrote is sent was make shes got sends feels am supposed arrange going be talking guys going be preoccupied trying s...
patton,hello thank thank thank oh coming broke thats started broke was took happened was slipped landed was was turned is turn get was walk put was falls...


In [84]:
# Create a new document-term matrix using only adverbs, also remove common words with max_df
cvn2 = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvn2 = cvn2.fit_transform(data_verb.transcript)
data_dtmn2 = pd.DataFrame(data_cvn2.toarray(), columns=cvn2.get_feature_names())
data_dtmn2.index = data_verb.index
data_dtmn2



Unnamed: 0,aah,abandon,absorb,accent,accept,accepted,accepts,accomplished,according,accounted,...,youll,youve,zara,zero,zip,zoned,zoom,zoomed,ándale,ñañaras
bill,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
dave,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
deon,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
fortune,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
gabriel,0,0,0,0,1,1,0,0,1,0,...,0,1,0,0,0,0,1,0,1,1
iliza,0,0,0,0,1,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
kate,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
mike,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
neal,2,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
patton,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0


In [85]:
# Create the gensim corpus
corpusn2 = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn2.transpose()))

# Create the vocabulary dictionary
id2wordn2 = dict((v, k) for k, v in cvn2.vocabulary_.items())

In [86]:
# Let's start with 2 topics
ldan2 = models.LdaModel(corpus=corpusn2, num_topics=2, id2word=id2wordn2, passes=10)
ldan2.print_topics()

[(0,
  '0.009*"fucking" + 0.009*"guys" + 0.009*"fuckin" + 0.007*"comes" + 0.006*"knew" + 0.006*"shit" + 0.005*"used" + 0.005*"looked" + 0.005*"walking" + 0.005*"dating"'),
 (1,
  '0.013*"shit" + 0.009*"aint" + 0.006*"guys" + 0.006*"understand" + 0.006*"fucking" + 0.005*"thank" + 0.005*"looks" + 0.005*"fuck" + 0.005*"comes" + 0.004*"wan"')]

In [87]:
# Let's start with 3 topics
ldan2 = models.LdaModel(corpus=corpusn2, num_topics=3, id2word=id2wordn2, passes=10)
ldan2.print_topics()

[(0,
  '0.020*"shit" + 0.012*"aint" + 0.012*"fucking" + 0.008*"fuck" + 0.006*"used" + 0.006*"guys" + 0.005*"understand" + 0.005*"comes" + 0.005*"shes" + 0.004*"thank"'),
 (1,
  '0.010*"guys" + 0.008*"comes" + 0.007*"understand" + 0.006*"knew" + 0.006*"happen" + 0.005*"fuckin" + 0.005*"wan" + 0.005*"walking" + 0.005*"looks" + 0.005*"heres"'),
 (2,
  '0.007*"believe" + 0.006*"crying" + 0.005*"seeing" + 0.005*"imagine" + 0.005*"born" + 0.003*"ohh" + 0.003*"fear" + 0.003*"hollywood" + 0.003*"admit" + 0.003*"tonight"')]

In [88]:
# Let's start with 4 topics
ldan2 = models.LdaModel(corpus=corpusn2, num_topics=4, id2word=id2wordn2, passes=10)
ldan2.print_topics()

[(0,
  '0.026*"shit" + 0.015*"fucking" + 0.014*"aint" + 0.012*"fuck" + 0.011*"used" + 0.007*"guys" + 0.006*"wan" + 0.006*"thank" + 0.005*"dating" + 0.005*"broke"'),
 (1,
  '0.009*"understand" + 0.007*"guys" + 0.007*"comes" + 0.007*"wants" + 0.006*"gone" + 0.006*"looks" + 0.006*"hed" + 0.005*"heres" + 0.005*"knows" + 0.005*"happen"'),
 (2,
  '0.012*"fuckin" + 0.009*"walking" + 0.006*"comes" + 0.006*"knew" + 0.006*"believe" + 0.006*"looked" + 0.006*"fucked" + 0.006*"fucking" + 0.005*"shes" + 0.005*"sit"'),
 (3,
  '0.010*"aint" + 0.009*"shit" + 0.009*"guys" + 0.008*"married" + 0.006*"motherfcking" + 0.006*"fcking" + 0.005*"pull" + 0.005*"fucking" + 0.005*"comes" + 0.004*"taking"')]

In [89]:
#increase no of passes
ldan2 = models.LdaModel(corpus=corpusn2, num_topics=4, id2word=id2wordn2, passes=50)
ldan2.print_topics()

[(0,
  '0.012*"guys" + 0.007*"heard" + 0.007*"shit" + 0.006*"comes" + 0.006*"fucking" + 0.006*"understand" + 0.005*"heres" + 0.005*"knows" + 0.005*"thank" + 0.005*"knew"'),
 (1,
  '0.032*"shit" + 0.017*"aint" + 0.013*"used" + 0.011*"fucking" + 0.010*"fuck" + 0.007*"dating" + 0.007*"understand" + 0.007*"guys" + 0.006*"knew" + 0.006*"catch"'),
 (2,
  '0.009*"aint" + 0.008*"hed" + 0.007*"shit" + 0.006*"believe" + 0.006*"motherfcking" + 0.006*"fcking" + 0.006*"understand" + 0.006*"tried" + 0.005*"crying" + 0.005*"open"'),
 (3,
  '0.009*"fucking" + 0.008*"comes" + 0.007*"guys" + 0.007*"thank" + 0.007*"shes" + 0.007*"fuckin" + 0.007*"walking" + 0.006*"wan" + 0.006*"looked" + 0.006*"having"')]

In [90]:
ldan2 = models.LdaModel(corpus=corpusn2, num_topics=4, id2word=id2wordn2, passes=80)
ldan2.print_topics()

[(0,
  '0.012*"fucking" + 0.011*"used" + 0.008*"fuck" + 0.008*"guys" + 0.007*"broke" + 0.007*"dating" + 0.006*"sell" + 0.005*"wait" + 0.005*"play" + 0.005*"thank"'),
 (1,
  '0.010*"fuckin" + 0.009*"fucking" + 0.008*"believe" + 0.008*"guys" + 0.007*"walking" + 0.007*"comes" + 0.006*"looked" + 0.006*"knew" + 0.006*"shit" + 0.006*"fucked"'),
 (2,
  '0.023*"shit" + 0.016*"aint" + 0.009*"understand" + 0.007*"hed" + 0.006*"fuck" + 0.006*"fucking" + 0.006*"lost" + 0.005*"eat" + 0.005*"having" + 0.005*"wants"'),
 (3,
  '0.012*"guys" + 0.007*"comes" + 0.007*"thank" + 0.006*"heard" + 0.005*"knows" + 0.005*"looks" + 0.005*"heres" + 0.005*"understand" + 0.005*"knew" + 0.005*"gone"')]

In [127]:
ldan2 = models.LdaModel(corpus=corpusn2, num_topics=3, id2word=id2wordn2, passes=300)
ldan2.print_topics()

[(0,
  '0.011*"fucking" + 0.008*"guys" + 0.007*"fuckin" + 0.006*"knew" + 0.006*"used" + 0.006*"comes" + 0.006*"shit" + 0.005*"believe" + 0.005*"looked" + 0.005*"fuck"'),
 (1,
  '0.021*"shit" + 0.013*"aint" + 0.008*"understand" + 0.007*"guys" + 0.006*"hed" + 0.005*"lost" + 0.005*"happen" + 0.005*"gave" + 0.005*"gone" + 0.005*"heard"'),
 (2,
  '0.008*"wants" + 0.007*"comes" + 0.006*"having" + 0.006*"looks" + 0.006*"guys" + 0.005*"stand" + 0.005*"shes" + 0.005*"boys" + 0.005*"married" + 0.005*"thank"')]

##Topics
0 - Profanity 

1 - Understanding or Believes

2 - Come or Go 




##INFERENCE

##ASSIGNMENT - 6

# Text Generation

Markov chains can be used for very basic text generation. Think about every word in a corpus as a state. We can make a simple assumption that the next word is only dependent on the previous word - which is the basic assumption of a Markov chain.

Markov chains don't generate text as well as deep learning, but it's a good (and fun!) start.

## Select Text to Imitate

In this notebook, we're specifically going to generate text in the style of Ali Wong, so as a first step, let's extract the text from her comedy routine.

In [92]:
# Read in the corpus, including punctuation!
import pandas as pd

data = pd.read_pickle('corpus.pkl')
data

Unnamed: 0,transcript,full_name
bill,"[audience cheering, whistling] [emcee] Ladies and gentlemen, Bill Burr! [cheering and applause] All right, thank you! Thank you very much. Thank y...",Mike Epps
dave,"Original air date: November 12, 2022 * * * Ladies and gentlemen, Dave Chappelle. [Cheers and applause] ♪♪ ♪♪ ♪♪ Thank you. Thank you very much for...",Dave Chappelle
deon,"[indistinct chattering] [woman] Oh, this water is so good. I don’t know why I was so thirsty. But anyway, I feel comfortable now. It feels real go...",Iliza Shlesinger
fortune,[upbeat music plays] [audience cheering] [announcer] Please welcome Fortune Feimster! ♪ I’m a powerful woman ♪ ♪ Always get what I want ♪ ♪ So don...,Gabriel Iglesias
gabriel,[man] Can you please state your name? Martin Moreno. But you might know me as… Martinnnnn! I’ve been touring with Gabriel Iglesias for 20-plus yea...,Fortune Feimster
iliza,"[upbeat music playing] [crowd cheering] Cleveland, Ohio! Thank you! Thank you so much. This is so great. This is so nice to be here with you in pu...",Deon Cole
kate,"Whoa! Okay, yeah. Good. Okay, don’t embarrass yourself, okay. Ohh, the expectations. Crushing, I would argue. Absolutely crushing, debilitating in...",Neal Brennan
mike,[Cheers and applause] Hoo! Hoo! Hoo! Hoo! Hoo! Hoo! Hoo! [Cheers and applause intensify] Hoo! Hoo! Hoo! Hoo! Hoo! Hoo! Hoo! Hoo! Hoo! [Beatboxing]...,Trevor Noah
neal,"[gentle music playing] [audience applauding] [audience cheering] All right, let me explain. Friend of mine… “Former friend,” we’ll call her. [audi...",Whitney Cummings
patton,[“Hard to Kill” by Bleached plays] [audience cheering and applauding] Hello! Denver! [cheering and applause continues] [Patton laughs] Oh my God! ...,Kate Berlant


In [93]:
# Extract only Ali Wong's text
ali_text = data.transcript.loc['kate']
ali_text[:200]

'Whoa! Okay, yeah. Good. Okay, don’t embarrass yourself, okay. Ohh, the expectations. Crushing, I would argue. Absolutely crushing, debilitating in every way. I’m gonna move this, ’cause I’m so thin I’'

## Build a Markov Chain Function

We are going to build a simple Markov chain function that creates a dictionary:
* The keys should be all of the words in the corpus
* The values should be a list of the words that follow the keys

In [94]:
from collections import defaultdict

def markov_chain(text):
    '''The input is a string of text and the output will be a dictionary with each word as
       a key and each value as the list of words that come after the key in the text.'''
    
    # Tokenize the text by word, though including punctuation
    words = text.split(' ')
    
    # Initialize a default dictionary to hold all of the words and next words
    m_dict = defaultdict(list)
    
    # Create a zipped list of all of the word pairs and put them in word: list of next words format
    for current_word, next_word in zip(words[0:-1], words[1:]):
        m_dict[current_word].append(next_word)

    # Convert the default dict back into a dictionary
    m_dict = dict(m_dict)
    return m_dict

In [95]:
# Create the dictionary for Ali's routine, take a look at it
ali_dict = markov_chain(ali_text)
ali_dict

{'Whoa!': ['Okay,', 'And'],
 'Okay,': ['yeah.',
  'don’t',
  'it’s',
  'ten',
  'and…',
  'so',
  'so',
  'I’m',
  'great.',
  'did',
  'um…',
  'okay!',
  'wow.',
  'we’ll,',
  'okay,',
  'sorry,',
  'got',
  'whoa,',
  'should'],
 'yeah.': ['Good.',
  'Very',
  'So',
  'But…',
  'I',
  'Hope',
  'That’s',
  'Okay.',
  'Okay,',
  'I’m',
  'Were',
  'I’ve',
  '40',
  'That’s',
  'Oh,',
  'Okay.'],
 'Good.': ['Okay,'],
 'don’t': ['embarrass',
  'know',
  'know',
  'know,',
  'know',
  'even',
  'wanna',
  'explore.',
  'know.',
  'know',
  'know.',
  'know.”',
  'know.',
  'know.',
  'have',
  'see',
  'see',
  'know',
  'want',
  'show',
  'let',
  'you',
  'know,',
  'have',
  'know.',
  'know.',
  'have',
  'worry.',
  'forget',
  'want',
  'know…'],
 'embarrass': ['yourself,'],
 'yourself,': ['okay.'],
 'okay.': ['Ohh,',
  'I’m',
  'I’m',
  'Back',
  'You’re',
  'When',
  'I',
  '“Okay,',
  'Ohh!'],
 'Ohh,': ['the', 'I', 'how'],
 'the': ['expectations.',
  'comedian,',
  'mic',
  'c

## Create a Text Generator

We're going to create a function that generates sentences. It will take two things as inputs:
* The dictionary you just created
* The number of words you want generated

Here are some examples of generated sentences:

>'Shape right turn– I also takes so that she’s got women all know that snail-trail.'

>'Optimum level of early retirement, and be sure all the following Tuesday… because it’s too.'

In [96]:
import random

def generate_sentence(chain, count=15):
    '''Input a dictionary in the format of key = current word, value = list of next words
       along with the number of words you would like to see in your generated sentence.'''

    # Capitalize the first word
    word1 = random.choice(list(chain.keys()))
    sentence = word1.capitalize()

    # Generate the second word from the value list. Set the new word as the first word. Repeat.
    for i in range(count-1):
        word2 = random.choice(chain[word1])
        word1 = word2
        sentence += ' ' + word2
        if(word2[-1]=='?' or word2[-1]=='.' or word2[-1]=='!'):
          break

    # End it with a period
    sentence += '.'
    return(sentence)

In [97]:
generate_sentence(ali_dict)

'Do all been incredible. I’ve felt in the mic stand?” So… Uh, so, in San.'

### Assignment:
1. Generate sentence for other comedians also.
2. Try making the generate_sentence function better. Maybe allow it to end with a random punctuation mark or end whenever it gets to a word that already ends with a punctuation mark.

##KATE

In [98]:
# Extract only Kate's text
kate_text = data.transcript.loc['kate']
kate_text[:200]

'Whoa! Okay, yeah. Good. Okay, don’t embarrass yourself, okay. Ohh, the expectations. Crushing, I would argue. Absolutely crushing, debilitating in every way. I’m gonna move this, ’cause I’m so thin I’'

In [99]:
from collections import defaultdict

def markov_chain(text):
    '''The input is a string of text and the output will be a dictionary with each word as
       a key and each value as the list of words that come after the key in the text.'''
    
    # Tokenize the text by word, though including punctuation
    words = text.split(' ')
    
    # Initialize a default dictionary to hold all of the words and next words
    m_dict = defaultdict(list)
    
    # Create a zipped list of all of the word pairs and put them in word: list of next words format
    for current_word, next_word in zip(words[0:-1], words[1:]):
        m_dict[current_word].append(next_word)

    # Convert the default dict back into a dictionary
    m_dict = dict(m_dict)
    return m_dict

In [100]:
# Create the dictionary for kate's routine, take a look at it
kate_dict = markov_chain(kate_text)
kate_dict

{'Whoa!': ['Okay,', 'And'],
 'Okay,': ['yeah.',
  'don’t',
  'it’s',
  'ten',
  'and…',
  'so',
  'so',
  'I’m',
  'great.',
  'did',
  'um…',
  'okay!',
  'wow.',
  'we’ll,',
  'okay,',
  'sorry,',
  'got',
  'whoa,',
  'should'],
 'yeah.': ['Good.',
  'Very',
  'So',
  'But…',
  'I',
  'Hope',
  'That’s',
  'Okay.',
  'Okay,',
  'I’m',
  'Were',
  'I’ve',
  '40',
  'That’s',
  'Oh,',
  'Okay.'],
 'Good.': ['Okay,'],
 'don’t': ['embarrass',
  'know',
  'know',
  'know,',
  'know',
  'even',
  'wanna',
  'explore.',
  'know.',
  'know',
  'know.',
  'know.”',
  'know.',
  'know.',
  'have',
  'see',
  'see',
  'know',
  'want',
  'show',
  'let',
  'you',
  'know,',
  'have',
  'know.',
  'know.',
  'have',
  'worry.',
  'forget',
  'want',
  'know…'],
 'embarrass': ['yourself,'],
 'yourself,': ['okay.'],
 'okay.': ['Ohh,',
  'I’m',
  'I’m',
  'Back',
  'You’re',
  'When',
  'I',
  '“Okay,',
  'Ohh!'],
 'Ohh,': ['the', 'I', 'how'],
 'the': ['expectations.',
  'comedian,',
  'mic',
  'c

In [101]:
generate_sentence(kate_dict)

'Filming, so you’re saying yes. “Okay, say or is me help you. Okay, ten years.'

In [102]:
import random

def generate_sentence(chain, count=15):
    '''Input a dictionary in the format of key = current word, value = list of next words
       along with the number of words you would like to see in your generated sentence.'''

    # Capitalize the first word
    word1 = random.choice(list(chain.keys()))
    sentence = word1.capitalize()

    # Generate the second word from the value list. Set the new word as the first word. Repeat.
    for i in range(count-1):
        word2 = random.choice(chain[word1])
        word1 = word2
        sentence += ' ' + word2
        if(word2[-1]=='?' or word2[-1]=='.' or word2[-1]=='!'):
          break


    # End it with a period
    sentence += random.choice(['!',';',':','.',',','^','*','(',')','=','+','?'])
    return(sentence)

In [103]:
generate_sentence(kate_dict)

'Emotion? I kind of ankle. Unprofession… Oh, mys… Yeah, all siblings, yeah. Good. Okay, should('

##BILL

In [104]:
# Extract only Bill's text
bill_text = data.transcript.loc['bill']
bill_text[:200]

'[audience cheering, whistling] [emcee] Ladies and gentlemen, Bill Burr! [cheering and applause] All right, thank you! Thank you very much. Thank you, thank you, thank you. How are ya? How’s it goin’? '

In [105]:
from collections import defaultdict

def markov_chain(text):
    '''The input is a string of text and the output will be a dictionary with each word as
       a key and each value as the list of words that come after the key in the text.'''
    
    # Tokenize the text by word, though including punctuation
    words = text.split(' ')
    
    # Initialize a default dictionary to hold all of the words and next words
    m_dict = defaultdict(list)
    
    # Create a zipped list of all of the word pairs and put them in word: list of next words format
    for current_word, next_word in zip(words[0:-1], words[1:]):
        m_dict[current_word].append(next_word)

    # Convert the default dict back into a dictionary
    m_dict = dict(m_dict)
    return m_dict

In [106]:
# Create the dictionary for bill's routine, take a look at it
bill_dict = markov_chain(bill_text)
bill_dict

{'[audience': ['cheering,', 'cheering,', 'cheering]', 'cheering]'],
 'cheering,': ['whistling]', 'you’re', 'whistling]'],
 'whistling]': ['[emcee]', 'This', 'Thanks'],
 '[emcee]': ['Ladies', 'Bill'],
 'Ladies': ['and'],
 'and': ['gentlemen,',
  'applause]',
  'a',
  'all',
  'you',
  'hold',
  'be',
  'all',
  'all',
  'the',
  'try',
  'they’re',
  'you',
  'I',
  'then,',
  'then',
  'I',
  'I',
  'shakes',
  'rubs',
  'they',
  'it',
  'then',
  'there',
  'cancel',
  'they',
  'then',
  'a',
  'one',
  'if',
  'ride',
  'then',
  '’50s?',
  'shit',
  'they',
  'this',
  'she',
  'I',
  'takes',
  'women',
  'she',
  'gone.',
  'they',
  'make',
  'say',
  'she',
  'act',
  'you’re',
  'night',
  'purses',
  'shoes',
  'Botox.',
  'try',
  'you',
  'I',
  'says',
  'looked',
  'thought',
  'is',
  'they',
  'there’s',
  'two',
  'that',
  'then',
  'shit.',
  'then',
  'that',
  'you’re',
  'you',
  'you',
  'fall',
  'I’m',
  'I’m',
  'then',
  'over',
  'gettin’',
  'gettin’',
  '

In [107]:
generate_sentence(bill_dict)

'Buy your back seat like, first hairy-leg white people, I’m just get out and I,'

In [108]:
import random

def generate_sentence(chain, count=15):
    '''Input a dictionary in the format of key = current word, value = list of next words
       along with the number of words you would like to see in your generated sentence.'''

    # Capitalize the first word
    word1 = random.choice(list(chain.keys()))
    sentence = word1.capitalize()

    # Generate the second word from the value list. Set the new word as the first word. Repeat.
    for i in range(count-1):
        word2 = random.choice(chain[word1])
        word1 = word2
        sentence += ' ' + word2
        if(word2[-1]=='?' or word2[-1]=='.' or word2[-1]=='!'):
          break


    # End it with a period
    sentence += random.choice(['!',';',':','.',',','^','*','(',')','=','+','?'])
    return(sentence)

In [109]:
generate_sentence(bill_dict)

'Yanking down the right, so stupid fireman getup, and we can come into your life?^'

#DAVE

In [110]:
# Extract only Dave's text
dave_text = data.transcript.loc['dave']
dave_text[:200]

'Original air date: November 12, 2022 * * * Ladies and gentlemen, Dave Chappelle. [Cheers and applause] ♪♪ ♪♪ ♪♪ Thank you. Thank you very much for being here. Before I start tonight, I just wanted to '

In [111]:
from collections import defaultdict

def markov_chain(text):
    '''The input is a string of text and the output will be a dictionary with each word as
       a key and each value as the list of words that come after the key in the text.'''
    
    # Tokenize the text by word, though including punctuation
    words = text.split(' ')
    
    # Initialize a default dictionary to hold all of the words and next words
    m_dict = defaultdict(list)
    
    # Create a zipped list of all of the word pairs and put them in word: list of next words format
    for current_word, next_word in zip(words[0:-1], words[1:]):
        m_dict[current_word].append(next_word)

    # Convert the default dict back into a dictionary
    m_dict = dict(m_dict)
    return m_dict

In [112]:
# Create the dictionary for dav's routine, take a look at it
dave_dict = markov_chain(dave_text)
dave_dict

{'Original': ['air'],
 'air': ['date:'],
 'date:': ['November'],
 'November': ['12,'],
 '12,': ['2022'],
 '2022': ['*'],
 '*': ['*', '*', 'Ladies', 'Name*'],
 'Ladies': ['and'],
 'and': ['gentlemen,',
  'applause]',
  'applause]',
  'those',
  '“Jews.”',
  'it',
  'them',
  'they',
  'stuff',
  'I',
  'funny.',
  'Adidas',
  'they',
  'you',
  'he',
  'longer.',
  'y’all',
  'you',
  'it’s',
  'applause]',
  'Barack',
  'chopped',
  'did',
  'your',
  'tell',
  'started',
  'I’m',
  'I',
  'applause]',
  'applause]',
  'touching',
  'everybody’s',
  'applause]',
  'a',
  'a',
  'I',
  'applause]',
  'I',
  'applause]',
  'applause]',
  'why',
  'where',
  'the',
  'much'],
 'gentlemen,': ['Dave'],
 'Dave': ['Chappelle.', 'Chappelle'],
 'Chappelle.': ['[Cheers'],
 '[Cheers': ['and', 'and', 'and', 'and', 'and', 'and', 'and', 'and'],
 'applause]': ['♪♪',
  'I',
  'A',
  'Even',
  'No',
  'War',
  'They',
  'Nobody',
  'It',
  'Brooklyn’s',
  '♪♪'],
 '♪♪': ['♪♪', '♪♪', 'Thank', '\xa0'],
 '

In [113]:
generate_sentence(dave_dict)

'War. Even the office. Staplers, computer mouses, all them were like, “God damn!” [Laughter] How!'

In [114]:
import random

def generate_sentence(chain, count=15):
    '''Input a dictionary in the format of key = current word, value = list of next words
       along with the number of words you would like to see in your generated sentence.'''

    # Capitalize the first word
    word1 = random.choice(list(chain.keys()))
    sentence = word1.capitalize()

    # Generate the second word from the value list. Set the new word as the first word. Repeat.
    for i in range(count-1):
        word2 = random.choice(chain[word1])
        word1 = word2
        sentence += ' ' + word2
        if(word2[-1]=='?' or word2[-1]=='.' or word2[-1]=='!'):
          break


    # End it with a period
    sentence += random.choice(['!',';',':','.',',','^','*','(',')','=','+','?'])
    return(sentence)

In [115]:
generate_sentence(dave_dict)

'There looking at his rise to say together in their money on their money on*'

#DEON

In [116]:
# Extract only Deon's text
deon_text = data.transcript.loc['deon']
deon_text[:200]

'[indistinct chattering] [woman] Oh, this water is so good. I don’t know why I was so thirsty. But anyway, I feel comfortable now. It feels real good in here. Yeah. It’s so good to see you, babe. What '

In [117]:
from collections import defaultdict

def markov_chain(text):
    '''The input is a string of text and the output will be a dictionary with each word as
       a key and each value as the list of words that come after the key in the text.'''
    
    # Tokenize the text by word, though including punctuation
    words = text.split(' ')
    
    # Initialize a default dictionary to hold all of the words and next words
    m_dict = defaultdict(list)
    
    # Create a zipped list of all of the word pairs and put them in word: list of next words format
    for current_word, next_word in zip(words[0:-1], words[1:]):
        m_dict[current_word].append(next_word)

    # Convert the default dict back into a dictionary
    m_dict = dict(m_dict)
    return m_dict

In [118]:
# Create the dictionary for deon's routine, take a look at it
deon_dict = markov_chain(deon_text)
deon_dict

{'[indistinct': ['chattering]', 'chattering', 'chattering', 'chattering'],
 'chattering]': ['[woman]'],
 '[woman]': ['Oh,'],
 'Oh,': ['this', 'here’s', 'okay', 'man.'],
 'this': ['water',
  'special',
  'much,',
  'technology,',
  'thang',
  'is.',
  'far.',
  'motherfucker',
  'is',
  'U-shaped',
  'sucking',
  'motherfucker.',
  'for',
  'little',
  'old',
  'old',
  'motherfucker’s',
  'house,',
  'shit…',
  'house',
  'shit.',
  'shit.',
  'shit',
  'shit',
  'motherfucker,',
  'call,',
  'slab',
  'in',
  'is',
  'white',
  'is',
  'protein',
  'bitch',
  'wonderful',
  'girl',
  'is',
  'motherfucker',
  'bitch',
  'shit.',
  'so',
  'corner',
  'corner',
  'part',
  'here,',
  'for',
  'shit',
  'past',
  'day.',
  'day.',
  'fucking',
  'to'],
 'water': ['is', 'descend', 'we', 'on', 'be'],
 'is': ['so',
  'still',
  'right',
  'serious,',
  '50',
  'like',
  'just',
  'what',
  'gay.”',
  'gay,”',
  'you',
  'it',
  'in',
  'it',
  'a',
  'how',
  'still',
  'the',
  'that?”',


In [119]:
generate_sentence(deon_dict)

'Fondued out. Any toilet we go ahead. Get your arms and turn into a picture,'

In [120]:
import random

def generate_sentence(chain, count=15):
    '''Input a dictionary in the format of key = current word, value = list of next words
       along with the number of words you would like to see in your generated sentence.'''

    # Capitalize the first word
    word1 = random.choice(list(chain.keys()))
    sentence = word1.capitalize()

    # Generate the second word from the value list. Set the new word as the first word. Repeat.
    for i in range(count-1):
        word2 = random.choice(chain[word1])
        word1 = word2
        sentence += ' ' + word2
       # if(word2[-1]=='?' or word2[-1]=='.' or word2[-1]=='!'):
       #  break


    # End it with a period
    sentence += random.choice(['!',';',':','.',',','^','*','(',')','=','+','?'])
    return(sentence)

In [121]:
generate_sentence(deon_dict)

'Towels hanging up… [crowd laughing] It feels real quick. [crowd laughing] All her book down?'