In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re


def download_nltk_resources():
    try:
        nltk.download("stopwords", quiet=True)
        nltk.download("wordnet", quiet=True)
    except Exception as e:
        print(f"Error downloading NLTK resources: {e}")


download_nltk_resources()

In [2]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def normalize_transcript(transcript: str) -> str:
    transcript = re.sub(r"[^\w\s]|(?<!\d)\d(?!\d)", "", transcript)
    transcript = re.sub(r"\s+", " ", transcript).lower()
    transcript = " ".join(
        lemmatizer.lemmatize(word)
        for word in transcript.split()
        if word not in stop_words
    )
    transcript = re.sub(r"\s+", " ", transcript).strip()
    return transcript

In [3]:
df = pd.read_csv("data/transcripts_new.csv", header=0)
df.dropna(subset=["transcript"], inplace=True)
df["transcript"] = df["transcript"].apply(normalize_transcript)
df.dropna(subset=["transcript"], inplace=True)
df.head()

Unnamed: 0,video_id,transcript
0,AAHiZ-c88ec,man im getting frustrated water heater deliver...
1,AEsRr-ZnzNc,start give shoutout rokukun vyonder harry stac...
2,AJpzk-aFZPU,test number alright okay oh go ground oh god c...
3,AONXX-h9SdI,gonna play something brand new album came octo...
4,AOZIY-AwsjM,music


In [4]:
df.to_csv("data/transcripts_normalized.csv", index=False)

In [5]:
df["word_count"] = df["transcript"].apply(lambda x: len(x.split()))
df["word_count"].describe()

count     2654.000000
mean       504.842502
std       1340.897095
min          0.000000
25%         23.000000
50%         99.000000
75%        403.750000
max      21641.000000
Name: word_count, dtype: float64