## Loading the data

In [7]:
import requests
from bs4 import BeautifulSoup
import pickle


In [2]:
def url_to_transcript(url, timeout=30):
    '''Scrape a comedian's routine from a transcript page.

    Downloads ``url``, parses the HTML with BeautifulSoup and collects the
    text of every ``<p>`` found inside the element whose class is
    ``post-content``.

    Args:
        url: address of the page to download.
        timeout: seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook forever.

    Returns:
        A list of strings, one per paragraph, in page order.

    Raises:
        requests.HTTPError: the server answered with an error status.
        ValueError: the page contains no ``post-content`` element.
    '''
    response = requests.get(url, timeout=timeout)
    # fail loudly on 4xx/5xx instead of trying to parse an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    content = soup.find(class_="post-content")
    if content is None:
        # soup.find returns None when nothing matches; report it clearly
        raise ValueError("no 'post-content' element found at " + url)
    return [p.text for p in content.find_all('p')]

In [3]:
# each comedian's short name next to the page holding their transcript;
# the pairs are then split into the two parallel lists `comedians` and `urls`
comedians, urls = (list(column) for column in zip(
    ('louis', 'http://scrapsfromtheloft.com/2017/05/06/louis-ck-oh-my-god-full-transcript/'),
    ('dave', 'http://scrapsfromtheloft.com/2017/04/11/dave-chappelle-age-spin-2017-full-transcript/'),
    ('ricky', 'http://scrapsfromtheloft.com/2018/03/15/ricky-gervais-humanity-transcript/'),
    ('bo', 'http://scrapsfromtheloft.com/2017/08/07/bo-burnham-2013-full-transcript/'),
    ('bill', 'http://scrapsfromtheloft.com/2017/05/24/bill-burr-im-sorry-feel-way-2014-full-transcript/'),
    ('jim', 'http://scrapsfromtheloft.com/2017/04/21/jim-jefferies-bare-2014-full-transcript/'),
    ('john', 'http://scrapsfromtheloft.com/2017/08/02/john-mulaney-comeback-kid-2015-full-transcript/'),
    ('hasan', 'http://scrapsfromtheloft.com/2017/10/21/hasan-minhaj-homecoming-king-2017-full-transcript/'),
    ('ali', 'http://scrapsfromtheloft.com/2017/09/19/ali-wong-baby-cobra-2016-full-transcript/'),
    ('anthony', 'http://scrapsfromtheloft.com/2017/08/03/anthony-jeselnik-thoughts-prayers-2015-full-transcript/'),
    ('mike', 'http://scrapsfromtheloft.com/2018/03/03/mike-birbiglia-my-girlfriends-boyfriend-2013-full-transcript/'),
    ('joe', 'http://scrapsfromtheloft.com/2017/08/19/joe-rogan-triggered-2016-full-transcript/'),
))

In [4]:
# make sure the folder that will hold the scraped scripts is there
import os
if not os.path.isdir("transcripts"):
    os.makedirs('transcripts')
else:
    print("directory already exist")

directory already exist


In [8]:
# download every routine; the result is ordered like `urls`
transcripts = list(map(url_to_transcript, urls))

In [10]:
# write each comedian's transcript (pickled) to transcripts/<name>.txt
for position, name in enumerate(comedians):
    target_path = "transcripts/" + name + ".txt"
    with open(target_path, "wb") as handle:
        pickle.dump(transcripts[position], handle)

In [8]:
# read the pickled transcripts back into a dict keyed by comedian
comedians = ['louis', 'dave', 'ricky', 'bo', 'bill', 'jim', 'john', 'hasan', 'ali', 'anthony', 'mike', 'joe']
data = {}
for name in comedians:
    source_path = "transcripts/" + name + ".txt"
    with open(source_path, "rb") as handle:
        data[name] = pickle.load(handle)

## Cleaning the data

In [11]:
# collapse each comedian's list of paragraphs into one space-separated text,
# kept inside a one-element list
data_combined = {name: [' '.join(paragraphs)] for name, paragraphs in data.items()}

In [12]:
# build a DataFrame with one row per comedian and a single 'transcript' column
import pandas as pd

pd.set_option('max_colwidth',150)
transposed = pd.DataFrame.from_dict(data_combined).T  # comedians become the rows
transposed.columns = ['transcript']
data_df = transposed.sort_index()
data_df.head()

Unnamed: 0,transcript
ali,"Ladies and gentlemen, please welcome to the stage: Ali Wong! Hi. Hello! Welcome! Thank you! Thank you for coming. Hello! Hello. We are gonna have ..."
anthony,"Thank you. Thank you. Thank you, San Francisco. Thank you so much. So good to be here. People were surprised when I told ’em I was gonna tape my s..."
bill,"[cheers and applause] All right, thank you! Thank you very much! Thank you. Thank you. Thank you. How are you? What’s going on? Thank you. It’s a ..."
bo,Bo What? Old MacDonald had a farm E I E I O And on that farm he had a pig E I E I O Here a snort There a Old MacDonald had a farm E I E I O [Appla...
dave,"This is Dave. He tells dirty jokes for a living. That stare is where most of his hard work happens. It signifies a profound train of thought, the ..."


In [18]:
#check the dataframe
#data_df['transcript'].loc['dave']

In [19]:
import re
import string

def clean_data_phase_1(data):
    '''First cleaning pass over one transcript.

    Applies, in this order:
      1. lowercase the text;
      2. drop anything inside square brackets, e.g. "[applause]";
      3. drop ASCII punctuation;
      4. drop every word that contains a digit;
      5. drop musical notes (♪) together with the space before them, if any;
      6. drop curly quotes and the ellipsis character;
      7. turn newlines into spaces so the words around them stay separate.

    Args:
        data: the transcript as a single string.

    Returns:
        The cleaned string.
    '''
    data = data.lower()
    # raw strings: '\[', '\w' and '\d' are not valid escapes in a normal literal
    data = re.sub(r'\[.*?\]', '', data)
    data = re.sub('[%s]' % re.escape(string.punctuation), '', data)
    data = re.sub(r'\w*\d\w*', '', data)
    # the space is optional so a note at the very start of the text goes too
    data = re.sub(' ?♪', '', data)
    data = re.sub('[‘’“”…]', '', data)
    # replace rather than delete, otherwise "one\ntwo" would become "onetwo"
    data = re.sub(r'\n', ' ', data)
    return data

# name used when applying the first cleaning pass to a DataFrame column
round1 = clean_data_phase_1

In [20]:
import pandas as pd
# corpus: the first cleaning pass applied to every transcript
cleaned_transcripts = data_df['transcript'].apply(round1)
data_clean = cleaned_transcripts.to_frame()
# to inspect one cleaned routine:
#data_clean['transcript'].loc['dave']

In [21]:
#add the full name of the comedians to clean data
# The names are looked up by the short name used as the DataFrame index, so
# each row gets the right full name whatever order the index is in; an index
# entry without a known full name raises KeyError instead of being mislabelled.
full_name_by_key = {
    'ali': 'Ali Wong',
    'anthony': 'Anthony Jeselnik',
    'bill': 'Bill Burr',
    'bo': 'Bo Burnham',
    'dave': 'Dave Chappelle',
    'hasan': 'Hasan Minhaj',
    'jim': 'Jim Jefferies',
    'joe': 'Joe Rogan',
    'john': 'John Mulaney',
    'louis': 'Louis C.K.',
    'mike': 'Mike Birbiglia',
    'ricky': 'Ricky Gervais',
}
full_names = [full_name_by_key[key] for key in data_clean.index]

data_clean['full_name'] = full_names
data_clean.head()

Unnamed: 0,transcript,full_name
ali,ladies and gentlemen please welcome to the stage ali wong hi hello welcome thank you thank you for coming hello hello we are gonna have to get thi...,Ali Wong
anthony,thank you thank you thank you san francisco thank you so much so good to be here people were surprised when i told em i was gonna tape my special ...,Anthony Jeselnik
bill,all right thank you thank you very much thank you thank you thank you how are you whats going on thank you its a pleasure to be here in the great...,Bill Burr
bo,bo what old macdonald had a farm e i e i o and on that farm he had a pig e i e i o here a snort there a old macdonald had a farm e i e i o this i...,Bo Burnham
dave,this is dave he tells dirty jokes for a living that stare is where most of his hard work happens it signifies a profound train of thought the alch...,Dave Chappelle


In [22]:
import os
# make sure the folder that will hold the pickled objects is there
if not os.path.isdir("pickles"):
    os.makedirs('pickles')
else:
    print("directory already exist")

In [24]:
# save data_df (the transcripts before cleaning) to pickles/corpus.pkl for later use
data_df.to_pickle("pickles/corpus.pkl")

In [27]:
# create a document-term matrix using CountVectorizer and exclude English stop words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older installations
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
# one row per comedian (same index as data_clean), one column per term
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,aaaaah,aaaaahhhhhhh,aaaaauuugghhhhhh,aaaahhhhh,aaah,aah,abc,abcs,ability,abject,...,zee,zen,zeppelin,zero,zillion,zombie,zombies,zoning,zoo,éclair
ali,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
anthony,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
bill,1,0,0,0,0,0,0,1,0,0,...,0,0,0,1,1,1,1,1,0,0
bo,0,1,1,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
dave,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hasan,0,0,0,0,0,0,0,0,0,0,...,2,1,0,1,0,0,0,0,0,0
jim,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
joe,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
john,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
louis,0,0,0,0,0,3,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0


In [28]:
# save the document-term matrix (data_dtm) to pickles/dtm.pkl
data_dtm.to_pickle("pickles/dtm.pkl")

In [29]:
# pickle data_clean
data_clean.to_pickle("pickles/data_clean.pkl")
# pickle the fitted CountVectorizer as well; the with-block closes the file
# so the data is flushed to disk and the handle is not leaked
with open("pickles/cv.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)