## Loading Relevant Packages

In [1]:
import pandas as pd

## Loading the dataset

In [2]:
# Loading the dataset
# NOTE(review): pd.read_pickle unpickles the file, and unpickling can execute
# arbitrary code - only load 'corpus.pkl' if it comes from a trusted source.
data = pd.read_pickle('corpus.pkl')
# Show the first rows as a quick sanity check of the loaded frame
data.head()

Unnamed: 0,transcript,full_name
ali,"Ladies and gentlemen, please welcome to the st...",Ali Wong
anthony,"Thank you. Thank you. Thank you, San Francisco...",Anthony Jeselnik
bill,"[cheers and applause] All right, thank you! Th...",Bill Burr
bo,Bo What? Old MacDonald had a farm E I E I O An...,Bo Burnham
dave,This is Dave. He tells dirty jokes for a livin...,Dave Chappelle


In [3]:
# Getting the number of comedians, counted as distinct labels in the frame's index
print(f"The number of comedians is {len(data.index.unique())}")

The number of comdeians is 12


In [4]:
# Pull the transcript stored under the 'bill' index label and peek at its
# first 300 characters
bill_burr = data.loc['bill', 'transcript']
bill_burr[:300]

'[cheers and applause] All right, thank you! Thank you very much! Thank you. Thank you. Thank you. How are you? What’s going on? Thank you. It’s a pleasure to be here in the greater Atlanta, Georgia, area, this oasis. It’s nice to be here. I don’t know why I came here in June. It’s nice to be here. W'

## Data cleaning and preprocessing


In [5]:
import re # This is also a powerful text cleaning library


def text_cleaner(text):
    """Normalise a raw transcript string for word-level processing.

    The text is lowercased, then three kinds of content are deleted in order:

    1. anything enclosed in square brackets, brackets included
       (e.g. stage directions such as ``[cheers and applause]``);
    2. every character that is neither a word character nor whitespace
       (punctuation; underscores count as word characters and are kept);
    3. runs of the ASCII digits 0-9.

    Whitespace is never touched, so removing a token can leave a leading
    space or two adjacent spaces behind.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Order matters: bracketed spans must go before the punctuation pass,
    # otherwise the brackets would be stripped and their contents kept.
    removal_patterns = (
        r"\[[^]]*\]",
        r'[^\w\s]',
        r'[0-9]+',
    )
    cleaned = text.lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned


In [6]:
# Run the cleaner over the transcript held in `bill_burr` and preview the
# first 800 characters of the cleaned text
bill_burr_cleaned = text_cleaner(bill_burr)
bill_burr_cleaned[:800]

' all right thank you thank you very much thank you thank you thank you how are you whats going on thank you its a pleasure to be here in the greater atlanta georgia area this oasis its nice to be here i dont know why i came here in june its nice to be here wasnt thinking fucking ridiculously hot out there just miserable horrible that kind of heat you understand the racism down here ya know i get it how would you get along with anybody look at em just over there drinkin a cold drink lemonade was made for the white man so what the hell have i been doing with my life trying to get in shape man but i hate going to the gym so i decided id go veggie twice a week its brutal i can only make it till about  five oclock thats what i realized about myself you know that something has to die every day i'

## Building the Markov function

In [7]:
from collections import defaultdict

In [8]:
def markov(text):
    """Build a first-order word transition table from ``text``.

    The text is split on whitespace. Each word is mapped to the list of
    words that immediately follow it, in order of occurrence; repeated
    followers are kept as duplicates, so frequent transitions appear more
    often in the list.

    A word that occurs only as the very last token has no follower and
    therefore does not appear as a key.

    Parameters
    ----------
    text : str
        Whitespace-separated text.

    Returns
    -------
    dict[str, list[str]]
        Plain dict of ``word -> list of following words``. Empty when the
        text has fewer than two words.
    """
    tokens = text.split()
    transitions = {}
    # Walk every adjacent pair (token i, token i + 1)
    for position in range(len(tokens) - 1):
        follower = tokens[position + 1]
        transitions.setdefault(tokens[position], []).append(follower)
    return transitions

In [9]:
# Build the word -> following-words table from the cleaned transcript
bills = markov(bill_burr_cleaned)
# NOTE(review): displaying the whole dict produces a very long output;
# consider previewing only a few keys instead.
bills

{'all': ['right',
  'that',
  'this',
  'these',
  'right',
  'of',
  'kinds',
  'of',
  'of',
  'right',
  'right',
  'you',
  'pissed',
  'you',
  'over',
  'through',
  'right',
  'right',
  'of',
  'comes',
  'right',
  'those',
  'of',
  'empty',
  'right',
  'dead',
  'right',
  'the',
  'makes',
  'of',
  'right',
  'the',
  'of',
  'night',
  'right',
  'my',
  'these',
  'of',
  'right',
  'of',
  'of',
  'of',
  'car',
  'right',
  'the',
  'six',
  'right',
  'great',
  'youre',
  'of',
  'my',
  'right',
  'he',
  'the',
  'right',
  'got',
  'had',
  'of',
  'my',
  'right',
  'right',
  'their',
  'i',
  'right',
  'day',
  'right',
  'night',
  'right',
  'right',
  'right',
  'right',
  'right',
  'over',
  'right',
  'right',
  'you',
  'right',
  'of',
  'over',
  'went',
  'right',
  'right',
  'right',
  'right',
  'i',
  'i',
  'you',
  'you',
  'you',
  'right',
  'right',
  'i',
  'right',
  'right',
  'you',
  'the',
  'of',
  'of',
  'right',
  'right',
  'righ

## Text Generator

In [10]:
import random

def generate_sentence(chain, count=15):
    """Generate a pseudo-random sentence by walking a word transition table.

    The walk starts at a randomly chosen key of ``chain`` and repeatedly
    picks a random follower of the current word. If the current word has no
    recorded followers (it is not a key, or its list is empty - e.g. a word
    that only appeared at the very end of the source text), the walk
    restarts from a random key instead of raising ``KeyError``.

    Parameters
    ----------
    chain : dict[str, list[str]]
        Mapping of ``word -> list of words that may follow it``.
    count : int, default 15
        Number of words in the sentence. At least one word is always
        produced, even when ``count`` is below 1.

    Returns
    -------
    str
        The words joined by single spaces, first word capitalised,
        terminated by a period.

    Raises
    ------
    IndexError
        If ``chain`` is empty, since there is no word to start from.
    """
    if not chain:
        raise IndexError("Cannot generate a sentence from an empty chain")

    start_words = list(chain)
    # Choosing a random key to start with
    current = random.choice(start_words)
    # Capitalizing the first word; collect words and join once at the end
    words = [current.capitalize()]

    for _ in range(count - 1):
        # .get avoids both KeyError and inserting keys into a defaultdict
        followers = chain.get(current)
        if followers:
            current = random.choice(followers)
        else:
            # Dead end: jump to a fresh random starting word
            current = random.choice(start_words)
        words.append(current)

    return ' '.join(words) + '.'

In [15]:
# Sample a 50-word sentence from the `bills` chain; no random seed is set,
# so the text differs on every run.
generate_sentence(bills, 50)

'Flop in the bed grab a fuck out of how i could defend my wifes always just had he wants you fuck you could hear that guy is going to people my headphones dont know i had a sudden people okay so one of couple of the kitchen table i.'

In [16]:
# Another 50-word sample from the `bills` chain - each call is an
# independent random walk, so results are not reproducible without a seed.
generate_sentence(bills, 50)

'Factory like your mind im like it they wont get along with an original wedding theres candles that shit am immediately the hell she goes whack mine just accidentally did you know when you this dude beating the discovery channel about facebook i figure uh i dont understand that little.'

In [17]:
# One more 50-word sample from the `bills` chain (unseeded, varies per run)
generate_sentence(bills, 50)

'Whores you gonna do you son is fixed bankers are cool drink lemonade was like psychotic with my story down they still some waitress wrote on the female ant pussy for me i shot the planes all over miles of it okay but i was falling out there you got.'