In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Replace 'url_here' with the URL of the page containing the table
url = 'https://mattw.io/youtube-metadata/bulk?url=https%3A%2F%2Fwww.youtube.com%2Fplaylist%3Flist%3DPL3HhMTCWB-e_FA-VkkE_gn-JbOVWfklql&submit=true'

# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML content of the page
soup = BeautifulSoup(response.content, 'html.parser')

# Find all tables on the page
tables = soup.find('#videosTable')

tables

In [3]:
soup

<!DOCTYPE html>

<html class="dark" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="Quickly gather all the metadata about a video, playlist, or channel from the YouTube API. Reverse image search thumbnails, geolocate in google maps, and translate ISO country and language codes." name="description"/>
<meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
<link href="./img/icon.png" rel="icon"/>
<title>MW Metadata Bulk</title>
<link crossorigin="" href="//cdn.jsdelivr.net" rel="preconnect"/>
<!-- Dependencies -->
<link href="//cdn.datatables.net/1.11.3/css/dataTables.bootstrap5.min.css" rel="stylesheet"/>
<link href="//cdn.jsdelivr.net/gh/highlightjs/cdn-release@11.3.1/build/styles/stackoverflow-dark.min.css" id="highlightjs-theme" rel="stylesheet"/>
<link href="//cdn.jsdelivr.net/npm/bootstrap-dark-5@1.1.3/dist/css/bootstrap-nightshade.min.css" rel="stylesheet"/>
<link href="//cdn.jsdelivr.net/npm/bootstrap-icons@1.5.0/font/bootstrap-icons.css" 

In [5]:
data = pd.read_csv('The Right Stand - CNN-News18 - Sheet1.csv')
data

Unnamed: 0,Title,Author,Length,Published,Views,Likes,Comments,Privacy Status,Region Restriction Count,Tag Count
0,Outsider No More?Why Bharat Ratna for Narasimh...,CNN-News18,21m 33s,2024-02-09T16:19:09Z,10068,88,42,public,,14
1,Parliament Debates On White Paper Tabled By Ni...,CNN-News18,32m 2s,2024-02-09T16:14:29Z,1286,4,1,public,,26
2,Ballot In Pakistan: Gun Raj In Pakistan Occupi...,CNN-News18,18m 26s,2024-02-08T17:21:08Z,688,3,0,public,,21
3,White Vs Black Paper: Economy Debate Settled? ...,CNN-News18,33m 56s,2024-02-08T17:07:28Z,5193,47,8,public,,27
4,UCC Showdown: Uttarakhand's Template For Bhara...,CNN-News18,26m 4s,2024-02-07T15:37:35Z,3898,23,7,public,,23
...,...,...,...,...,...,...,...,...,...,...
3154,Centre Likely To Move Triple Talaq Bill In Raj...,CNN-News18,1m 42s,2019-07-29T17:30:59Z,719,11,1,public,1 (allowed),24
3155,Is Your Data Safety Being Compromised For U.S ...,CNN-News18,33m 31s,2019-07-29T17:26:45Z,260,4,0,public,1 (allowed),29
3156,Didi Unveils And Garlands New Vidyasagar Bust ...,CNN-News18,29m 12s,2019-06-12T02:21:28Z,284,7,1,public,1 (allowed),28
3157,"Is Congress Losing 'Sabka Vishwas', With Owais...",CNN-News18,44m 21s,2019-06-11T02:02:24Z,1161,17,5,public,1 (allowed),23


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import nltk

nltk.download('punkt')
nltk.download('stopwords')

# Preprocess the text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Rejoin tokens into a string
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

data['Processed_Title'] = data['Title'].apply(preprocess_text)

# Vectorize the text
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['Processed_Title'])

# Perform KMeans clustering
# The number of clusters (n_clusters) is set arbitrarily - you might need to adjust this
kmeans = KMeans(n_clusters=7, random_state=42)
kmeans.fit(X)

# Assign the cluster labels to the original data
data['Cluster'] = kmeans.labels_

# Check the size of each cluster
cluster_sizes = data['Cluster'].value_counts()

# Inspect a few titles from each cluster to determine the category
for cluster_num in range(7):
    print(f"Cluster {cluster_num} titles:")
    print(data[data['Cluster'] == cluster_num]['Title'].sample(5))
    print()

# You will need to manually inspect the output and determine what category each cluster represents.


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/meghnadbose/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/meghnadbose/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Cluster 0 titles:
1685    Politics Over Cooch Behar Incident | The Right...
2942    First Hyderabad, Now Unnao: Another 'Beti' Bet...
1400    Choksi Case | Sujay Kantawala Shares His Views...
1867    Is Chinnamma's Sacrifice An Advantage For AIAD...
3128    Is India's Aura Under PM Modi Brighter Than Ev...
Name: Title, dtype: object

Cluster 1 titles:
1610    India Fights COVID | Time To Call Out Local Le...
1996    BJP MP Aparajita Sarangi Says We Need Politica...
2727    Time To Recalibrate Military Mindset? | The Ri...
3125    Has The Time Arrived For The Global Blacklisti...
861     Karnataka News | Praveen Killing Outrage | Tim...
Name: Title, dtype: object

Cluster 2 titles:
1405    Mehul Choksi Scam | Upper hand for India? | La...
736     Dr. Vikram Sampath In An Exclusive Chat On His...
478     California News | California Gurudwara Shootin...
760     Jammu Kashmir News Latest | Naya Kashmir Lates...
582     Delhi News | Arrested AAP Ministers Manish Sis...
Name: Title, dtype: 

In [7]:
# Inspect a few titles from each cluster to determine the category
for cluster_num in range(7):
    print(f"Cluster {cluster_num} titles:")
    titles_sample = data[data['Cluster'] == cluster_num]['Title'].head(10)  # Adjust the number sampled as needed
    for title in titles_sample:
        print(title)
    print("\n")

# You will need to manually inspect the output and determine what category each cluster represents.
# After inspection, create a mapping from cluster number to category name.
cluster_to_category = {
    0: 'Category Name 1',  # Replace with actual category name after inspection
    1: 'Category Name 2',  # Replace with actual category name after inspection
    # ... Continue for each cluster
    6: 'Category Name 7'   # Replace with actual category name after inspection
}

# Apply the mapping to the cluster labels to create a 'Category' column
data['Category'] = data['Cluster'].map(cluster_to_category)

# Check the distribution of the new categorical labels
category_distribution = data['Category'].value_counts()
print(category_distribution)


Cluster 0 titles:
Do We Need To Learn Lessons On 'Sustainability'? | The Right Stand With Anand Naramsihan | News18
Kashmir Files Unreported | Vivek Agnihotri's Exclusive Interview On News18 With Anand Narasimhan
28% GST On Online Gaming | Ashneer Grover Exclusive Interview With Anand Narasimhan On News18
Piyush Goyal Speaks To News18's Anand Narasimhan On UCC, Opposition Unity And 2024 Elections
The Right Stand With Anand Narasimhan, Because India Deserves Better! | The Right Stand On News18
Punjab News: Who Let Amritpal Singh Escape? | The Right Stand With Anand Narasimhan | News18
Armaan Malik Interview | Armaan Malik Exclusive With Anand Narasimhan | Times Square | News18
Punjab News Today | Khalsa Call, Khalistan Intent? | The Right Stand With Anand Narasimhan | News18
Armaan Malik Interview | Armaan Malik Exclusive | The Right Stand With Anand Narasimhan | News18
CCP's Duplicity | Wang Yi's "Bhai Bhai" Sentiment For India | The Right Stand With Anand Narasimhan


Cluster 1 titles

In [8]:
# Define keywords for each category
category_keywords = {
    'Attacking the Opposition': ['opposition', 'congress', 'aap', 'left', 'critic', 'attack'],
    'Communalism': ['hindu', 'muslim', 'communal', 'hindutva', 'anti-muslim', 'sectarian'],
    'Sports and/or Entertainment': ['cricket', 'hockey', 'football', 'movie', 'entertainment', 'bollywood', 'sports'],
    'Economy': ['economy', 'gdp', 'budget', 'finance', 'economic', 'growth', 'recession'],
    'Foreign Relations': ['china', 'us', 'russia', 'international', 'relations', 'foreign', 'diplomacy', 'trade', 'nepal', 'bangladesh', 'sri lanka'],
    'Pakistan': ['pakistan'],
    'Praising Modi': ['modi', 'narendra', 'bjp', 'prime minister', 'leadership', 'praise'],
    # 'Others' will not have keywords as it is a default category
}

# Function to categorize titles based on keywords
def categorize_title(title):
    title_lower = title.lower()
    for category, keywords in category_keywords.items():
        if any(keyword in title_lower for keyword in keywords):
            return category
    return 'Others'

# Apply the categorization function to each title
data['Category'] = data['Title'].apply(categorize_title)

# Count the number of videos in each category
category_counts = data['Category'].value_counts()

# Print the category counts
print(category_counts)


Category
Others                         1671
Foreign Relations               697
Attacking the Opposition        253
Praising Modi                   245
Communalism                     133
Pakistan                         80
Economy                          41
Sports and/or Entertainment      39
Name: count, dtype: int64
