In [6]:
# Importing libraries
import re
import sys
import json
import praw
import math
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.lda_model
from itertools import chain
from textblob import TextBlob
from wordcloud import WordCloud
from collections import Counter
from praw.models import MoreComments

# Importing Scikit-learn libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Importing NLTK libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer




# Download the NLTK data packages in one pass: stopword lists, the VADER
# lexicon, and the punkt tokenizer data ('punkt' and 'punkt_tab').
for nltk_resource in ('stopwords', 'vader_lexicon', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Suppress all warnings from here on
import warnings
warnings.filterwarnings("ignore")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mayan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mayan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mayan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\mayan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [7]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use("seaborn-v0_8")

# Setting aesthetic parameters for plots
sns.set_theme(style="whitegrid")
plt.rcParams['font.size'] = 12
plt.rcParams['axes.titlesize'] = 18
plt.rcParams['axes.labelsize'] = 16
plt.rcParams['figure.figsize'] = (16, 10)
plt.rcParams['axes.titleweight'] = 'bold'
plt.rcParams['axes.labelweight'] = 'bold'

In [8]:

# Path to the merged YouTube JSON file (videos with their nested comments)
file_path = '../Data/youtube_merged_all_videos_with_comments.json'

# Load the JSON data; the loop below treats it as an iterable of per-video dicts
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten into a single list of row dicts: for every video, one metadata row
# (text = title + description) followed by one row per comment.
# Key order is kept stable because it determines the DataFrame column order.
records = []
for video in data:
    vid = video.get('videoId', '')
    title = video.get('title', '')
    desc = video.get('description', '')

    # Read the comment list once. `or []` also covers an explicit null/None
    # value, which .get('comments', []) would hand straight to len() and the
    # loop below, raising TypeError.
    comments = video.get('comments') or []

    # First row: video metadata plus the combined title/description text.
    # NOTE(review): this row carries no 'type' key, so its 'type' ends up NaN
    # in the DataFrame while comment rows hold 'comment' - confirm intended.
    records.append({
        'videoId': vid,
        'title': title,
        'description': desc,
        'channelTitle': video.get('channelTitle', ''),
        'videoPublishedAt': video.get('videoPublishedAt', ''),
        'tags': video.get('tags', []),
        'viewCount': video.get('viewCount', 0),
        'videoLikeCount': video.get('videoLikeCount', 0),
        'videoCommentCount': video.get('videoCommentCount', 0),
        'text': f"{title}\n\n{desc}",
        # Number of comments present in this file for the video; stored
        # separately from the 'videoCommentCount' field read above.
        'comment_count': len(comments),
    })

    # Subsequent rows: one per comment, linked to the video through 'videoId'
    for comment in comments:
        records.append({
            'videoId': vid,
            'text': comment.get('text', ''),
            'author': comment.get('author', ''),
            'publishedAt': comment.get('publishedAt', ''),
            'likeCount': comment.get('likeCount', 0),
            'type': 'comment'
        })

# Create DataFrame: keys missing from a row become NaN in that row
df_vid_comments = pd.DataFrame(records)


In [9]:
# Preview the first rows of the combined frame (head() defaults to 5 rows)
df_vid_comments.head()

Unnamed: 0,videoId,title,description,channelTitle,videoPublishedAt,tags,viewCount,videoLikeCount,videoCommentCount,text,comment_count,author,publishedAt,likeCount,type
0,2JsmSs3oPjc,Russia in 'far worse position now' than during...,"Russia is in a ""far worse position now"" compar...",Sky News,2025-05-16T14:37:44Z,ukraine;russia;war;conflict;analysis;professor...,392688.0,4944.0,1558.0,Russia in 'far worse position now' than during...,954.0,,,,
1,2JsmSs3oPjc,,,,,,,,,I only clicked to see how the bias from sky wa...,,@NeilCubbage,2025-05-22T10:44:58Z,0.0,comment
2,2JsmSs3oPjc,,,,,,,,,The Russians have moved 30 miles through heavy...,,@haydnjones4816,2025-05-22T09:33:14Z,0.0,comment
3,2JsmSs3oPjc,,,,,,,,,I'm tired of this guy,,@incognito_one876,2025-05-22T09:10:04Z,0.0,comment
4,2JsmSs3oPjc,,,,,,,,,Clarke is a Govt propaganda stooge from Chatha...,,@saviour5091,2025-05-22T08:43:16Z,0.0,comment
