In [None]:
import pandas as pd
import requests
import json
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk import download

# Fetch the NLTK resources this script relies on; 'stopwords' backs the
# stopwords.words('english') lookup used when filtering tokens.
download('punkt')
download('stopwords')

url = "https://raw.githubusercontent.com/marobinette/pocs/main/project/comedy_data.txt"

# Use a timeout so a stalled connection cannot hang the script, and fail fast
# on HTTP errors instead of handing an error page to the JSON parser below.
response = requests.get(url, timeout=30)
response.raise_for_status()

# The payload is parsed as one JSON document per line; blank lines are skipped
# because json.loads('') would raise.
data = response.text.strip().splitlines()
comedy_data = [json.loads(line) for line in data if line.strip()]

# Keep only the two fields used downstream.
df = pd.DataFrame(comedy_data, columns=['title', 'transcript'])

# Show full cell contents instead of truncating long strings when printing.
pd.set_option('display.max_colwidth', None)
print(df['title'])

def export_comedian_word_data(word_data, comedian):
    """
    Writes a comedian's word statistics to "<comedian>.csv" in the current directory.

    The 'word' column is written under the header 'types', and the CSV columns
    are emitted in the order: types, counts, total_unique, probs. The DataFrame
    passed in is not modified.

    Parameters:
    - word_data (DataFrame): Must contain the columns 'word', 'counts', 'total_unique' and 'probs'.
    - comedian (str): Name used as the CSV file stem.
    """
    column_order = ['types', 'counts', 'total_unique', 'probs']
    filename = f"{comedian}.csv"

    # rename() returns a new frame, so the caller's DataFrame keeps its 'word' column.
    export_frame = word_data.rename(columns={'word': 'types'})[column_order]
    export_frame.to_csv(filename, index=False)

    print(f"Data exported to {filename} successfully.")

def get_word_data(words):
    """
    Builds a frequency table for a list of words.

    Parameters:
    - words (list): Words (tokens) to count.

    Returns:
    - DataFrame: One row per distinct word, ordered by descending count, with columns
      'word', 'counts', 'probs' (count divided by the total number of words) and
      'total_unique' (the number of distinct words, repeated on every row).
    """
    frequencies = pd.Series(words).value_counts()
    table = frequencies.reset_index()
    table.columns = ['word', 'counts']

    # Relative frequency of each word among all tokens.
    total_tokens = table['counts'].sum()
    table['probs'] = table['counts'] / total_tokens

    # Each row is one distinct word, so the row count is the vocabulary size.
    table['total_unique'] = len(table)

    ranked = table.sort_values(by='counts', ascending=False)
    return ranked.reset_index(drop=True)

def tokenize_words(text, remove_stop_wprds=False):
    """
    Tokenizes the text into words, with an option to remove stopwords.

    Tokens are maximal runs of word characters (regex ``\\w+``), so punctuation
    is dropped and contractions such as "don't" are split into "don" and "t".

    Parameters:
    - text (str, list or tuple): Text to tokenize. A list/tuple of strings is joined
      with single spaces first.
    - remove_stop_wprds (bool): Whether to remove English stopwords from the tokens.
      Matching is case-insensitive; the returned tokens keep their original casing.

    Returns:
    - list: List of tokenized words in order of appearance.
    """
    if isinstance(text, (list, tuple)):
        text = ' '.join(text)

    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)

    if remove_stop_wprds:
        stop_words = set(stopwords.words('english'))
        # The NLTK stopword list is all lowercase, so compare the lowercased
        # token; otherwise "I", "The", "And", ... would never be filtered out.
        words = [word for word in words if word.lower() not in stop_words]

    return words

# Eddie Murphy: the transcript stored under row label 1 of the dataset.
eddie = df.loc[1, 'transcript']
eddie_words = tokenize_words(eddie, remove_stop_wprds=True)
eddie_word_data = get_word_data(eddie_words)
export_comedian_word_data(eddie_word_data, 'eddie_murphy')

# Dave Chappelle: the transcript stored under row label 2 of the dataset.
dave_chapelle = df.loc[2, 'transcript']
dave_chapelle_words = tokenize_words(dave_chapelle, remove_stop_wprds=True)
dave_chapelle_word_data = get_word_data(dave_chapelle_words)
export_comedian_word_data(dave_chapelle_word_data, 'dave_chapelle')
