In [1]:
import pandas as pd
import requests
import json
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk import download

# Download the NLTK resources used for tokenizing and stop-word filtering
download('punkt')
download('stopwords')

url = "https://raw.githubusercontent.com/marobinette/pocs/main/project/comedy_data.txt"

# Load data. A timeout keeps the cell from hanging on a stalled connection,
# and raise_for_status() surfaces HTTP errors (404, 500, ...) right here
# instead of as a confusing JSON decode error a few lines further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text.strip().splitlines()

# Parse each line as one JSON record; blank lines inside the payload are
# skipped because json.loads('') would raise.
comedy_data = [json.loads(line) for line in data if line.strip()]

# Convert list of dictionaries to DataFrame with only title and transcript
df = pd.DataFrame(comedy_data, columns=['title', 'transcript'])

# Show full (untruncated) cell contents, then list the available titles
pd.set_option('display.max_colwidth', None)
print(df['title'])

def export_comedian_word_data(word_data, comedian):
    """Write a word-frequency table to ``<comedian>.csv`` in the working directory.

    Args:
        word_data: DataFrame with the columns ``word``, ``counts``,
            ``total_unique`` and ``probs``. It is not modified.
        comedian: Name used as the stem of the output file name.

    The ``word`` column is written out under the header ``types``, the
    columns are emitted in the order ``types, counts, total_unique, probs``,
    and the DataFrame index is not written. A confirmation line is printed
    once the file has been saved.
    """
    column_order = ['types', 'counts', 'total_unique', 'probs']
    filename = f"{comedian}.csv"

    # rename() returns a new frame, so the caller's table keeps its 'word' column
    export_frame = word_data.rename(columns={'word': 'types'})
    export_frame[column_order].to_csv(filename, index=False)

    print(f"Data exported to {filename} successfully.")

def get_word_data(words):
    """Build a frequency table from a sequence of word tokens.

    Args:
        words: Iterable of tokens to count.

    Returns:
        A DataFrame with one row per distinct token, ordered by descending
        count with a fresh 0..n-1 index, and the columns:
        ``word`` (the token), ``counts`` (occurrences), ``probs`` (count
        divided by the total number of tokens) and ``total_unique`` (the
        number of distinct tokens, repeated on every row).
    """
    frequencies = pd.Series(words).value_counts()

    table = frequencies.reset_index()
    table.columns = ['word', 'counts']

    # Relative frequency of each token, plus the vocabulary size on every row
    token_total = table['counts'].sum()
    table = table.assign(
        probs=table['counts'] / token_total,
        total_unique=len(table),
    )

    ordered = table.sort_values(by='counts', ascending=False)
    return ordered.reset_index(drop=True)

def tokenize_words(text, remove_stop_wprds=False):
    """Split text into word tokens, optionally dropping English stop words.

    Args:
        text: A string, or a list of strings which is joined with single
            spaces before tokenizing.
        remove_stop_wprds: When true, tokens found in NLTK's English
            stop-word list are removed. (The parameter name is misspelled;
            it is kept as-is so existing keyword callers keep working.)

    Returns:
        A list of tokens matching ``\\w+`` in their original casing, so
        punctuation is discarded and contractions such as "don't" are split
        into "don" and "t".
    """
    # Check if text is a list; if so, join it into a single string
    if isinstance(text, list):
        text = ' '.join(text)

    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)

    if remove_stop_wprds:
        stop_words = set(stopwords.words('english'))
        # NLTK's stop-word list is all lowercase, so compare on the lowercased
        # token; otherwise capitalised stop words ("The", "I", "And") slip
        # through the filter. The token itself is returned unchanged.
        words = [word for word in words if word.lower() not in stop_words]

    return words

# Row label 1: tokenize without stop words, build the frequency table,
# and export it as 'eddie_murphy.csv'
eddie = df.loc[1, 'transcript']
eddie_words = tokenize_words(eddie, remove_stop_wprds=True)
eddie_word_data = get_word_data(eddie_words)
export_comedian_word_data(word_data=eddie_word_data, comedian='eddie_murphy')

# Row label 2: same pipeline, exported as 'dave_chapelle.csv'
dave_chapelle = df.loc[2, 'transcript']
dave_chapelle_words = tokenize_words(dave_chapelle, remove_stop_wprds=True)
dave_chapelle_word_data = get_word_data(dave_chapelle_words)
export_comedian_word_data(word_data=dave_chapelle_word_data, comedian='dave_chapelle')


ModuleNotFoundError: No module named 'nltk'