In [3]:
pip install pandas matplotlib nltk webvtt-py

Collecting webvtt-py
  Downloading webvtt_py-0.5.1-py3-none-any.whl.metadata (3.4 kB)
Downloading webvtt_py-0.5.1-py3-none-any.whl (19 kB)
Installing collected packages: webvtt-py
Successfully installed webvtt-py-0.5.1


In [5]:
pip install scikit-learn



In [6]:
import nltk

In [13]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [15]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

A. Parsing Comments into a DataFrame

In [25]:
import pandas as pd
import re

def structure_comments_from_txt(filepath='/content/drive/MyDrive/CSE477/comment.txt'):
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    comments_data = []
    time_regex = re.compile(r'.*(second|minute|hour|day|week|month|year)s? ago.*', re.IGNORECASE)
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        if i + 1 < len(lines) and time_regex.match(lines[i+1].strip()):
            username = line
            timestamp = lines[i+1].strip()
            comment_text = []
            i += 2
            while i < len(lines) and not (i+1 < len(lines) and time_regex.match(lines[i+1].strip())):
                comment_line = lines[i].strip()
                if comment_line and not comment_line.lower() in ['reply', '...more']:
                    comment_text.append(comment_line)
                i += 1
            if comment_text:
                comments_data.append({
                    'username': username,
                    'timestamp_text': timestamp,
                    'comment_text': ' '.join(comment_text)
                })
        else:
            i += 1
    return pd.DataFrame(comments_data)

# Run the function and check your results!
comments_df = structure_comments_from_txt()
print(comments_df.head())


                 username        timestamp_text  \
0    @risebyliftingothers  2 years ago (edited)   
1  @auliamardhatillah2240           2 years ago   
2             @limwei2634           2 years ago   
3         @nancykataria08          2 months ago   
4                @jpbaugh           2 years ago   

                                        comment_text  
0  ₹100.00 Thanks for an amazingly simplified app...  
1  Yesterday I click on a video called 'learning ...  
2  I've been trying to learn ML for quite awhile ...  
3  No fancy words, just simple English and the ri...  
4  For anyone getting an error related to convert...  


In [31]:
print(comments_df)

                   username        timestamp_text  \
0      @risebyliftingothers  2 years ago (edited)   
1    @auliamardhatillah2240           2 years ago   
2               @limwei2634           2 years ago   
3           @nancykataria08          2 months ago   
4                  @jpbaugh           2 years ago   
..                      ...                   ...   
195          @rickgomez7480           2 years ago   
196                  @wqwng           2 years ago   
197                @tvdeath           2 years ago   
198          @GrandZangoule            1 year ago   
199      @nqobilelerato5751            1 year ago   

                                          comment_text  
0    ₹100.00 Thanks for an amazingly simplified app...  
1    Yesterday I click on a video called 'learning ...  
2    I've been trying to learn ML for quite awhile ...  
3    No fancy words, just simple English and the ri...  
4    For anyone getting an error related to convert...  
..                   

B. Parsing Captions into a DataFrame

In [34]:
import webvtt
import re
import pandas as pd

def structure_captions_from_vtt(filepath):
    try:
        full_text = ' '.join([caption.text.strip() for caption in webvtt.read(filepath)])
    except Exception as e:
        print(f"Error reading VTT file: {e}. Falling back to manual line reading.")
        captions = []
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if '-->' not in line and line and not line.isdigit() and 'WEBVTT' not in line:
                    captions.append(line)
        full_text = ' '.join(captions)
    sentences = re.split(r'(?<=[.!?]) +', full_text)
    return pd.DataFrame(sentences, columns=['caption_sentence'])

captions_df = structure_captions_from_vtt('/content/drive/MyDrive/CSE477/Caption.vtt')
print(captions_df.head())


                                    caption_sentence
0  kylie ying has worked at many kylie ying has w...
1             so let's actually just accuracy is 81.
2             so let's actually just accuracy is 81.
3  so let's actually just\nmake this five make th...
4                  of people that have covid is 531.


In [36]:
print(captions_df)

                                     caption_sentence
0   kylie ying has worked at many kylie ying has w...
1              so let's actually just accuracy is 81.
2              so let's actually just accuracy is 81.
3   so let's actually just\nmake this five make th...
4                   of people that have covid is 531.
5   of people that have covid is 531.\nso i'm goin...
6                                                  1.
7   1.\nso here let's rewrite this so here let's r...
8                                this over this is 1.
9   this over this is 1.\nso now my probability is...
10                             equal to let's say 32.
11  equal to let's say 32.\nall right um all right...
12             so now it'd be a row of 64 this to 64.
13             so now it'd be a row of 64 this to 64.
14  so now it'd be a row of 64\nnodes and then 32 ...
15            let's say let's keep the epochs at 100.
16  let's say let's keep the epochs at 100.\nand n...
17            actually let's

C. The Cleaning Pipeline

In [38]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [39]:
def normalize_text(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = text.strip()
    return text

def tokenize_text(text):
    return word_tokenize(text)

stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words and len(word) > 2]

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

def stem_tokens(tokens):
    return [stemmer.stem(word) for word in tokens]

def clean_text_pipeline(text, use_lemmatization=True):
    normalized = normalize_text(text)
    tokens = tokenize_text(normalized)
    filtered = remove_stopwords(tokens)
    if use_lemmatization:
        final_tokens = lemmatize_tokens(filtered)
    else:
        final_tokens = stem_tokens(filtered)
    return final_tokens

# Example application:
comments_df['cleaned_tokens'] = comments_df['comment_text'].apply(lambda x: clean_text_pipeline(x, use_lemmatization=True))
captions_df['cleaned_tokens'] = captions_df['caption_sentence'].apply(lambda x: clean_text_pipeline(x, use_lemmatization=True))


In [40]:
comments_df.to_csv('/content/drive/MyDrive/CSE477/comments_output.csv', index=False)

In [41]:
captions_df.to_csv('/content/drive/MyDrive/CSE477/captions_output.csv', index=False)