In [12]:
# Import necessary libraries
import pickle
import random
from collections import defaultdict

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob

%matplotlib inline

nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/simonbeckmann/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simonbeckmann/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Load the Corpus of Comedy Scripts

In [13]:
# Read the pickled corpus of comedy scripts from the working directory.
# NOTE: pickle.load can execute arbitrary code - only open files you trust.
with open('comedy_scripts.pkl', 'rb') as pkl_file:
    scripts = pickle.load(pkl_file)

# Wrap the loaded records in a two-column DataFrame and preview the first rows
df = pd.DataFrame(data=scripts, columns=['comedian', 'script'])
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'comedy_scripts.pkl'

### Sentiment Analysis Using TextBlob

In [None]:
def get_sentiment(text):
    """Return the TextBlob sentiment of ``text`` as a ``(polarity, subjectivity)`` tuple."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity

# Score every script, then store the two sentiment measures as new columns
sentiment_scores = df['script'].apply(lambda script: pd.Series(get_sentiment(script)))
df[['polarity', 'subjectivity']] = sentiment_scores
df.head()

In [None]:
# Scatter each script's polarity against its subjectivity
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['polarity'], df['subjectivity'], alpha=0.5)
ax.set_title('Polarity vs Subjectivity of Comedians')
ax.set_xlabel('Polarity')
ax.set_ylabel('Subjectivity')
ax.grid(True)
plt.show()

### Analyze Sentiment Over Time

In [None]:
def split_text(text, n_parts):
    """Cut ``text`` into ``n_parts`` consecutive chunks of roughly equal word count.

    The text is split on whitespace; chunk ``k`` holds the words from index
    ``int(k * total / n_parts)`` up to (not including)
    ``int((k + 1) * total / n_parts)``, re-joined with single spaces.
    Chunks are empty strings when there are fewer words than parts.

    Returns a list of ``n_parts`` strings (an empty list if ``n_parts`` is
    not positive).
    """
    tokens = text.split()
    total = len(tokens)
    return [
        ' '.join(tokens[int(k * total / n_parts):int((k + 1) * total / n_parts)])
        for k in range(n_parts)
    ]

n_parts = 12
sentiment_over_time = defaultdict(list)

# For each (comedian, script) pair, cut the script into n_parts segments and
# record the polarity of every segment, in order, under that comedian's name.
for comedian, script in zip(df['comedian'], df['script']):
    polarities = [get_sentiment(segment)[0] for segment in split_text(script, n_parts)]
    sentiment_over_time[comedian].extend(polarities)

In [None]:
# Draw one polarity curve per comedian across the script segments
fig, ax = plt.subplots(figsize=(12, 8))
segment_positions = range(n_parts)
for comedian, sentiments in sentiment_over_time.items():
    ax.plot(segment_positions, sentiments, label=comedian)
ax.set_title('Sentiment Over Time for Each Comedian')
ax.set_xlabel('Script Part')
ax.set_ylabel('Polarity')
ax.legend()
ax.grid(True)
plt.show()

### Topic Modeling Using Gensim LDA

In [None]:
# English stop words, held in a set for fast membership checks
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Lower-case and tokenise ``text``, keeping only alphabetic non-stop-word tokens.

    Returns the surviving tokens as a list, in their original order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Drop punctuation/numbers and anything on the stop-word list
        if not token.isalpha() or token in stop_words:
            continue
        kept.append(token)
    return kept

# Tokenise every script and keep the cleaned tokens next to the raw text
token_lists = df['script'].apply(preprocess)
df['tokens'] = token_lists
df.head()

In [None]:
# Create dictionary and corpus for LDA.
# The dictionary is built from the token lists; each script is then encoded
# with doc2bow so the model receives a bag-of-words representation.
dictionary = corpora.Dictionary(df['tokens'])
corpus = [dictionary.doc2bow(text) for text in df['tokens']]

# Build LDA model.
# random_state pins the model's random initialisation so the discovered
# topics are the same on every Restart & Run All instead of changing per run.
lda_model = models.LdaModel(
    corpus,
    num_topics=4,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

In [None]:
# Show the five highest-weighted words of every topic, one topic per line
topics = lda_model.print_topics(num_words=5)
for topic_summary in topics:
    print(topic_summary)

In [None]:
# Look up the topic distribution of each script's bag-of-words and store it
# in a new 'topics' column (one entry per script, in corpus order).
doc_topics = []
for bow in corpus:
    doc_topics.append(lda_model.get_document_topics(bow))
df['topics'] = doc_topics
df.head()

### Text Generation Using Markov Chains

In [None]:
# Pick whose material the generator should imitate
comedian_name = 'Ali Wong'  # Swap in any comedian present in the corpus
# Select that comedian's rows and join their scripts into one space-separated string
is_selected = df['comedian'] == comedian_name
comedian_scripts = df.loc[is_selected, 'script'].str.cat(sep=' ')

def build_markov_chain(text):
    """Build a first-order word-transition table from ``text``.

    The text is split on whitespace. The result is a plain dict mapping each
    word to the list of words that directly follow it, in order of occurrence
    and with repeats kept. A word that only ever appears last gets no key.
    """
    tokens = text.split()
    transitions = {}
    for position in range(len(tokens) - 1):
        follower = tokens[position + 1]
        transitions.setdefault(tokens[position], []).append(follower)
    return transitions

# Learn the word-transition table from the selected comedian's combined text
markov_chain = build_markov_chain(text=comedian_scripts)

In [None]:
def generate_text(chain, count=50):
    """Generate a pseudo-sentence by randomly walking a word-transition table.

    Args:
        chain: dict mapping a word to the list of words that may follow it.
        count: number of words to append after the (capitalised) start word.

    Returns:
        The ``count + 1`` words joined by single spaces, with a trailing '.'.

    Raises:
        IndexError: if ``chain`` is empty (there is no word to start from).
    """
    # random.choice needs an indexable sequence; a dict_keys view is not one,
    # so materialise the keys once and reuse the list for every fallback.
    known_words = list(chain)
    word1 = random.choice(known_words)
    sentence = [word1.capitalize()]
    for _ in range(count):
        # Dead end (word has no recorded followers): restart from a random
        # known word instead of failing.
        word1 = random.choice(chain.get(word1) or known_words)
        sentence.append(word1)
    return ' '.join(sentence) + '.'

# Produce a sample with the default length and show it under a heading
generated_text = generate_text(markov_chain)
print("\nGenerated Text:", generated_text, sep="\n")

### Conclusion

In this notebook, we've performed sentiment analysis, topic modeling, and text generation on a corpus of comedy scripts. We utilized libraries like TextBlob for sentiment analysis, Gensim for topic modeling, and implemented a basic Markov Chain for text generation.