### Loading the cleaned CommentText File

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

In [None]:
# Mount Google Drive so files under /content/drive are readable (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the comments spreadsheet from the mounted Drive into a DataFrame.
df = pd.read_excel('/content/drive/MyDrive/youtube_sentiment_analysis/youtube_comments_english.xlsx')

In [None]:
# Quick size check: (rows, columns) of the loaded frame.
df.shape

In [None]:
# Remove the column-width limit so long comments are displayed in full.
pd.set_option('display.max_colwidth',None)

In [None]:
# Eyeball three random comments (no random_state is passed, so the rows differ on every run).
df['CommentText'].sample(3)

### BERT Model Training

In [None]:
# List the available columns; 'Sentiment' and 'CommentText' are the ones used below.
df.columns

In [None]:
# Remove the column-width limit so full comments are displayed.
# `None` is the documented "no limit" value for this option; a boolean is not.
pd.set_option('display.max_colwidth', None)

In [None]:
# Spot-check four random (label, text) pairs; unseeded, so the rows change between runs.
df[['Sentiment', 'CommentText']].sample(4)

In [None]:
# Row count before dropping incomplete rows (compare with the shape shown after dropna).
df.shape

In [None]:
# Drop rows where either 'CommentText' or 'Sentiment' is NaN
# (both are required downstream: the text is tokenized and the sentiment is label-encoded).
# NOTE: this rebinds `df`, so re-running earlier display cells shows the filtered frame.
df = df.dropna(subset=['CommentText', 'Sentiment'])

In [None]:
# Row count after dropping incomplete rows.
df.shape

In [None]:
# # Sample a subset (for speed during development)
# df = df.sample(50000, random_state=42).reset_index(drop=True)

In [None]:
pip install transformers pandas torch scikit-learn tqdm

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from transformers import DataCollatorWithPadding
from torch.cuda.amp import GradScaler, autocast
import os
# Set seed for reproducibility
# (this seeds PyTorch's RNG; the train/validation split below passes its own random_state).
torch.manual_seed(42)


In [None]:
# ====================== 2. Mount Google Drive ======================
from google.colab import drive
drive.mount('/content/drive')

# Folder to save/load model
model_folder = "/content/drive/MyDrive/youtube_sentiment_analysis"
# Checkpoint to resume from; the load cell further down only reads it when the file exists.
model_path = os.path.join(model_folder, "bert-base-uncased_epoch7_20250520_203221.pt")
# Create the folder up front so it is guaranteed to exist (no-op when already present).
os.makedirs(model_folder, exist_ok=True)


In [None]:
# Encode sentiment labels
label_encoder = LabelEncoder()
# NOTE(review): the integer ids depend on the distinct values actually present in
# 'Sentiment'; the mapping in the trailing comment (and the one hard-coded at inference
# time) should be confirmed against `label_encoder.classes_`.
df['label'] = label_encoder.fit_transform(df['Sentiment'])  # e.g., Negative=0, Neutral=1, Positive=2

# Split the dataset
# 90% train / 10% validation, stratified on the label, with a fixed random_state.
train_df, val_df = train_test_split(df[['CommentText', 'label']], test_size=0.1, stratify=df['label'], random_state=42)

In [None]:
# ====================== 4. Tokenizer and Dataset ======================

# Tokenizer for the same 'bert-base-uncased' checkpoint that the model is initialised from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class YouTubeCommentsDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Padding is deliberately left to the batch collator, so every item keeps its
    own (truncated) length.

    Args:
        texts: Indexable sequence of comments. Items are converted with ``str()``
            before tokenization, because spreadsheet cells can come back as
            numbers and the tokenizer only accepts strings.
        labels: Indexable sequence of integer class ids aligned with ``texts``.
        tokenizer: Callable that accepts a string plus ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` keyword arguments and returns a
            mapping with ``input_ids`` and ``attention_mask`` tensors of shape
            ``(1, seq_len)``.
        max_length: Maximum number of tokens kept per comment (default 128).

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            str(self.texts[idx]),   # tolerate non-string cells (e.g. purely numeric comments)
            truncation=True,
            padding=False,          # No fixed padding here; the collator pads per batch
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            # squeeze(0) removes only the leading batch axis; a bare squeeze() would also
            # collapse the sequence axis if it ever had length 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }


In [None]:
# Wrap the two splits; plain Python lists are passed so items are indexed by position.
train_dataset = YouTubeCommentsDataset(train_df['CommentText'].tolist(), train_df['label'].tolist(), tokenizer)
val_dataset = YouTubeCommentsDataset(val_df['CommentText'].tolist(), val_df['label'].tolist(), tokenizer)

# The dataset returns unpadded items, so padding is applied per batch by this collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Only the training loader shuffles; validation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4, pin_memory=True, collate_fn=data_collator)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4, pin_memory=True, collate_fn=data_collator)


In [None]:
# ====================== 5. GPU Info ======================
# Prints whether CUDA is available and the name of device 0, or "CPU" when it is not.
print(torch.cuda.is_available(), torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")


In [None]:
# ====================== 6. Load or Initialize Model ======================
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
from torch import autocast,GradScaler

In [None]:
# 6. Load Pretrained BERT Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Classification head with three outputs.
# NOTE(review): assumes exactly three sentiment classes in the data — confirm against the label encoder.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
# Gradient scaler used together with autocast in the training loops below.
scaler = GradScaler()


In [None]:
# Load model if checkpoint exists
# Default to a fresh run: without this assignment the training loops raise NameError
# on `start_epoch` whenever no checkpoint file is present.
start_epoch = 0

if os.path.exists(model_path):
    print("🔄 Loading checkpoint from Drive...")
    checkpoint = torch.load(model_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # The training loops store the 0-based index of the epoch just finished under
    # 'epoch', so resuming continues with the next one.
    start_epoch = checkpoint['epoch'] + 1
    print(f"✅ Resumed from epoch {start_epoch}")
else:
    print("No checkpoint found; starting from epoch 0")

model.to(device)


In [None]:
# Display the module tree of the model that was just built (and possibly restored).
model

In [None]:
import os
from datetime import datetime

# Set number of total epochs (adjust as needed)
TOTAL_EPOCHS = 10
bert_model_name = "bert-base-uncased"  # Change if needed
save_dir = "/content/drive/MyDrive/youtube_models_tracking_updating"  # Change to your save directory
os.makedirs(save_dir, exist_ok=True)

# Path of the checkpoint written by the previous iteration of this loop (None on the
# first iteration, so the checkpoint that training resumed from is never deleted).
previous_model_path = None

for epoch in range(start_epoch, TOTAL_EPOCHS):
    # ---------- TRAINING ----------
    model.train()
    total_loss = 0
    train_bar = tqdm(train_loader, desc=f"🔁 Training Epoch {epoch + 1}")

    for batch in train_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # Mixed-precision forward pass; the loss is computed by the model from `labels`.
        with autocast("cuda"):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Scale the loss for the backward pass, step through the scaler, then update its scale.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        train_bar.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(train_loader)
    print(f"📘 Epoch {epoch + 1} Training Loss: {avg_loss:.4f}")

    # ---------- SAVING MODEL ----------
    # Create new save path (file name carries the 1-based epoch and a timestamp)
    now = datetime.now().strftime("%Y%m%d_%H%M%S")
    clean_name = bert_model_name.replace("/", "_")
    model_filename = f"{clean_name}_epoch{epoch+1}_{now}.pt"
    model_path = os.path.join(save_dir, model_filename)

    # Save model state ('epoch' is the 0-based index of the epoch just finished)
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, model_path)
    print(f"💾 Model saved: {model_path}")

    # Remove the previous checkpoint only AFTER the new one has been written, so a
    # failed save (full Drive, dropped mount, ...) never leaves the run without one.
    if (
        previous_model_path
        and previous_model_path != model_path
        and os.path.exists(previous_model_path)
    ):
        os.remove(previous_model_path)
    previous_model_path = model_path  # Save current for deletion next time

    # ---------- VALIDATION ----------
    model.eval()
    correct, total = 0, 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="🔍 Validating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with autocast("cuda"):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    val_acc = 100 * correct / total
    print(f"🎯 Validation Accuracy after epoch {epoch + 1}: {val_acc:.2f}%\n")


#### Earlier epochs achieved better validation accuracy

In [None]:
# Set number of total epochs (adjust as needed)
TOTAL_EPOCHS = 5

# Shorter variant of the training loop above: same train/validate steps, but the
# checkpoint is written to a single path that is overwritten after every epoch.
# NOTE(review): `start_epoch` is not reset here; if it is already >= TOTAL_EPOCHS
# (e.g. after resuming from a late checkpoint) the loop body never runs.
for epoch in range(start_epoch, TOTAL_EPOCHS):
    # ---------- TRAINING ----------
    model.train()
    total_loss = 0
    train_bar = tqdm(train_loader, desc=f"🔁 Training Epoch {epoch + 1}")

    for batch in train_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # Mixed-precision forward pass; the loss is computed by the model from `labels`.
        with autocast("cuda"):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Scaled backward pass, optimizer step through the scaler, then scale update.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        train_bar.set_postfix(loss=loss.item())

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"📘 Epoch {epoch + 1} Training Loss: {avg_loss:.4f}")

    # ---------- SAVING MODEL ----------
    # `model_path` is whatever was assigned last: the resume path from the setup cell,
    # or the last file written by the previous training cell if that cell ran.
    # The file name therefore may not match the epoch stored inside it.
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, model_path)
    print(f"💾 Model saved to Drive after epoch {epoch + 1}")

    # ---------- VALIDATION ----------
    model.eval()
    correct, total = 0, 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="🔍 Validating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with autocast("cuda"):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Predicted class = index of the largest logit.
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    # Accuracy as a percentage of validation examples.
    val_acc = 100 * correct / total
    print(f"🎯 Validation Accuracy after epoch {epoch + 1}: {val_acc:.2f}%\n")


### Model Deployment: Gradio Interface

In [None]:
pip install gradio

In [None]:
import gradio as gr
from googleapiclient.discovery import build
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import matplotlib.pyplot as plt
from datetime import datetime
from collections import defaultdict
import numpy as np

# Setup device and model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load checkpoint (fine-tuned weights replace the freshly initialised classification head)
model_path = "/content/drive/MyDrive/youtube_models_tracking_updating/bert-base-uncased_epoch10_20250522_174142.pt"
checkpoint = torch.load(model_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)
model.eval()  # inference mode: disables dropout

# YouTube API client.
# The key is a credential and must not be stored in the notebook: it is read from the
# YOUTUBE_API_KEY environment variable, falling back to a hidden prompt.
# Any key that was previously committed in this cell should be treated as leaked and rotated.
import os
from getpass import getpass

YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY") or getpass("YouTube Data API key: ")
youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)


def extract_video_id(url):
    """Return the video id contained in a YouTube URL, or ``None``.

    Recognised forms:
      * any URL carrying a ``v`` query parameter (``.../watch?v=<id>&t=10s``)
      * short links (``youtu.be/<id>``)
      * ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>`` and ``/v/<id>`` paths

    The URL is parsed rather than split on the substring ``"v="``, so parameters
    that merely end in "v" (``rev=``, ``iv=``) and ``#fragments`` no longer
    corrupt the result.

    Args:
        url: URL text; surrounding whitespace and a missing scheme are tolerated.

    Returns:
        The id as a string, or ``None`` for empty, non-string or unrecognised input.
    """
    from urllib.parse import parse_qs, urlparse

    if not isinstance(url, str):
        return None
    url = url.strip()
    if not url:
        return None

    try:
        # urlparse only fills in the host when a scheme is present.
        parsed = urlparse(url if "://" in url else "https://" + url)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # Malformed input such as an unbalanced IPv6 bracket.
        return None

    # parse_qs drops blank values, so a hit here is always a non-empty id.
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        return query_ids[0]

    segments = [part for part in parsed.path.split("/") if part]
    if host in ("youtu.be", "www.youtu.be"):
        return segments[0] if segments else None
    if len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
        return segments[1]
    return None


def fetch_comments(video_url, max_comments=3000):
    """Fetch up to ``max_comments`` top-level comments of a video as plain text.

    Pages through the ``commentThreads`` endpoint 100 items at a time until the
    limit is reached or no further page token is returned.

    Args:
        video_url: YouTube URL; its id is obtained via ``extract_video_id``.
        max_comments: Upper bound on the number of comments returned.

    Returns:
        A list of comment strings. The list is empty when the URL has no
        recognisable id, and holds whatever was collected so far when an API
        request fails (comments disabled, unknown video, quota exhausted, ...).
    """
    # googleapiclient is already a dependency of this notebook (see `build`).
    from googleapiclient.errors import HttpError

    video_id = extract_video_id(video_url)
    if not video_id:
        return []

    comments = []
    next_page_token = None

    while len(comments) < max_comments:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=100,
            textFormat="plainText",
            pageToken=next_page_token
        )
        try:
            response = request.execute()
        except HttpError as err:
            # Keep the pages already downloaded instead of crashing the whole analysis;
            # the caller reports an empty result as "no comments found".
            print(f"YouTube API request failed ({err}); returning {len(comments)} comments fetched so far.")
            break

        for item in response.get("items", []):
            comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            comments.append(comment)
            if len(comments) >= max_comments:
                break

        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break

    return comments[:max_comments]


def fetch_video_stats(video_url):
    """Fetch like/dislike counts and the publish date of a video.

    Args:
        video_url: YouTube URL; its id is obtained via ``extract_video_id``.

    Returns:
        ``None`` when the URL has no recognisable id or the API returns no item;
        otherwise a dict with keys ``"likes"`` (int), ``"dislikes"`` (int, 0 when
        the field is absent) and ``"published_date"`` (naive ``datetime`` or
        ``None`` when the timestamp is missing or in an unexpected format).
    """
    video_id = extract_video_id(video_url)
    if not video_id:
        return None

    request = youtube.videos().list(
        part="statistics,snippet",
        id=video_id
    )
    response = request.execute()

    # `.get` tolerates responses that carry no "items" key at all.
    items = response.get("items")
    if not items:
        return None

    item = items[0]
    stats = item.get("statistics", {})
    snippet = item.get("snippet", {})

    # Likes and dislikes (dislikes may be disabled)
    like_count = int(stats.get("likeCount", 0))
    dislike_count = int(stats.get("dislikeCount", 0))  # Note: YouTube API no longer provides dislikes publicly

    # Published date: accept the timestamp with or without fractional seconds
    # instead of raising on the variant that was not anticipated.
    published_at = snippet.get("publishedAt", None)
    published_date = None
    if published_at:
        for timestamp_format in ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%fZ"):
            try:
                published_date = datetime.strptime(published_at, timestamp_format)
                break
            except ValueError:
                continue

    return {
        "likes": like_count,
        "dislikes": dislike_count,
        "published_date": published_date
    }


def predict_sentiment(texts, batch_size=32):
    """Classify each text as "Negative", "Neutral" or "Positive".

    Texts are tokenized and run through the model in batches rather than one
    forward pass per comment, and are truncated to 128 tokens — the same limit
    the training dataset uses.

    Args:
        texts: Iterable of strings.
        batch_size: Number of texts per forward pass (default 32).

    Returns:
        A list of sentiment names, one per input text, in input order.
    """
    # Class ids follow the encoding used by this notebook: 0, 1, 2.
    # NOTE(review): confirm against the label encoder fitted for training.
    id_to_sentiment = ("Negative", "Neutral", "Positive")

    texts = list(texts)
    sentiments = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            truncation=True,
            padding=True,       # pad to the longest text in this batch
            max_length=128,
        ).to(device)
        with torch.no_grad():
            logits = model(**inputs).logits
        # argmax over logits equals argmax over softmax probabilities.
        for pred in torch.argmax(logits, dim=-1).tolist():
            sentiments.append(id_to_sentiment[pred] if pred < 2 else "Positive")
    return sentiments


def group_by_weeks(published_date, sentiments):
    """
    Bucket sentiments into 12 pseudo-weeks to draw an approximate trend.

    Comment timestamps are not fetched, so comments are assigned to weeks purely
    by their position in the list: equal-sized consecutive chunks, with any
    overflow folded into the last week. ``published_date`` only decides whether
    a trend is produced at all.

    Returns an empty dict when ``published_date`` is falsy; otherwise a mapping
    ``week index -> {"Positive": n, "Neutral": n, "Negative": n}``.
    """
    if not published_date:
        return {}

    week_count = 12
    final_week = week_count - 1
    # At least one comment per bucket, so short lists fill weeks 0, 1, 2, ...
    bucket_size = max(1, len(sentiments) // week_count)

    tallies = defaultdict(lambda: {"Positive": 0, "Neutral": 0, "Negative": 0})
    for position, label in enumerate(sentiments):
        bucket = min(position // bucket_size, final_week)
        tallies[bucket][label] += 1

    return tallies


def plot_sentiment_trend(weeks_data):
    """Draw one line per sentiment over the week buckets and save it as sentiment_trend.png.

    ``weeks_data`` maps a week index to a dict of counts keyed by "Positive",
    "Neutral" and "Negative"; weeks are plotted in ascending order.
    """
    ordered_weeks = sorted(weeks_data.keys())

    # (sentiment key, line colour), in legend order.
    line_specs = (("Positive", 'green'), ("Neutral", 'gray'), ("Negative", 'red'))
    series = [
        (name, colour, [weeks_data[week][name] for week in ordered_weeks])
        for name, colour in line_specs
    ]

    plt.figure(figsize=(10,5))
    for name, colour, weekly_counts in series:
        plt.plot(ordered_weeks, weekly_counts, label=name, color=colour, marker='o')
    plt.xlabel("Weeks since publish")
    plt.ylabel("Number of comments")
    plt.title("Sentiment Trend Over Time (Approximate)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("sentiment_trend.png")
    plt.close()


def plot_likes_dislikes_over_time(stats, weeks_data):
    """
    Plot simulated likes/dislikes curves and save them as likes_dislikes_trend.png.

    Only a single snapshot of the counters exists, so each curve is a straight
    ramp from 0 up to the current total across the weeks present in
    ``weeks_data``; the dislikes curve is flat zero when no dislikes are reported.
    """
    ordered_weeks = sorted(weeks_data.keys())
    n_points = len(ordered_weeks)

    likes_curve = np.linspace(0, stats["likes"], n_points)
    if stats["dislikes"] > 0:
        dislikes_curve = np.linspace(0, stats["dislikes"], n_points)
    else:
        dislikes_curve = np.zeros(n_points)

    plt.figure(figsize=(10,5))
    for curve, name, colour in (
        (likes_curve, "Likes", 'blue'),
        (dislikes_curve, "Dislikes", 'orange'),
    ):
        plt.plot(ordered_weeks, curve, label=name, color=colour, marker='o')
    plt.xlabel("Weeks since publish")
    plt.ylabel("Count")
    plt.title("Likes & Dislikes Over Time (Simulated)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("likes_dislikes_trend.png")
    plt.close()


def analyze_comments(video_url):
    """Run the full analysis for one video and write the three chart images.

    Steps: fetch comments, classify them, draw the sentiment pie chart, fetch
    video statistics, then draw the (approximate) weekly trend and the
    (simulated) likes/dislikes chart. Charts are saved in the working directory
    under fixed file names.

    Args:
        video_url: YouTube URL entered by the user.

    Returns:
        Always a 6-tuple
        ``(summary, pie_path, trend_path, likes_path, comments_by_sentiment, counts)``.
        When no comments could be fetched, the three paths are ``None`` and the
        two dicts are empty, so callers can unpack both outcomes the same way.
    """
    comments = fetch_comments(video_url, max_comments=3000)
    if not comments:
        # Same arity as the success path (the original returned only five values here,
        # which made the caller's 6-way unpacking raise ValueError).
        return "Invalid or no comments found for the video URL.", None, None, None, {}, {}

    sentiments = predict_sentiment(comments)

    counts = {"Positive": 0, "Neutral": 0, "Negative": 0}
    for s in sentiments:
        counts[s] += 1

    # Plot sentiment pie chart
    labels = list(counts.keys())
    sizes = list(counts.values())
    colors = ['green', 'gray', 'red']  # same order as the keys of `counts`
    plt.figure(figsize=(5,5))
    plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
    plt.axis('equal')
    plt.title('Sentiment Distribution')
    plt.tight_layout()
    plt.savefig("sentiment_distribution.png")
    plt.close()

    detailed_comments = {
        "Positive": [c for c, s in zip(comments, sentiments) if s == "Positive"],
        "Neutral": [c for c, s in zip(comments, sentiments) if s == "Neutral"],
        "Negative": [c for c, s in zip(comments, sentiments) if s == "Negative"],
    }

    # Fetch video stats (likes, dislikes, publish date)
    stats = fetch_video_stats(video_url)

    # Group sentiments by week (approximate)
    weeks_data = group_by_weeks(stats["published_date"] if stats else None, sentiments)

    # Plot trends
    plot_sentiment_trend(weeks_data)
    if stats:
        plot_likes_dislikes_over_time(stats, weeks_data)
    else:
        # Create blank plot if no stats
        plt.figure(figsize=(10,5))
        plt.text(0.5, 0.5, 'No likes/dislikes data available', horizontalalignment='center', verticalalignment='center')
        plt.savefig("likes_dislikes_trend.png")
        plt.close()

    return (
        f"Total comments analyzed: {len(comments)}",
        "sentiment_distribution.png",
        "sentiment_trend.png",
        "likes_dislikes_trend.png",
        detailed_comments,
        counts
    )


# Gradio front end: one URL box and a button; results are shown as a summary line,
# three chart images, three text boxes with the comments per sentiment, and a JSON count.
with gr.Blocks() as demo:
    gr.Markdown("# YouTube Comments Real-Time Sentiment Analysis with Trend Tracking")
    video_url_input = gr.Textbox(label="Enter YouTube Video URL", placeholder="https://www.youtube.com/watch?v=...")
    analyze_btn = gr.Button("Analyze Sentiments")

    total_comments = gr.Textbox(label="Summary", interactive=False)
    sentiment_chart = gr.Image(label="Sentiment Distribution")
    sentiment_trend_chart = gr.Image(label="Sentiment Trend Over Time")
    likes_dislikes_chart = gr.Image(label="Likes & Dislikes Over Time (Simulated)")

    positive_comments = gr.Textbox(label="Positive Comments", interactive=False)
    neutral_comments = gr.Textbox(label="Neutral Comments", interactive=False)
    negative_comments = gr.Textbox(label="Negative Comments", interactive=False)

    counts_display = gr.JSON(label="Sentiment Counts")

    def update_ui(video_url):
        """Run the analysis and return exactly one value per output component (8)."""
        result = analyze_comments(video_url)
        # Index instead of unpacking so the failure result is handled whatever its length.
        summary, pie_chart = result[0], result[1]
        if pie_chart is None:
            # Failure: show the message, clear every other component.
            # (The original returned nine values here for eight outputs.)
            return summary, None, None, None, "", "", "", {}

        trend_chart, likes_chart, comments_dict, counts = result[2:6]
        separator = "\n\n---\n\n"
        return (
            summary,
            pie_chart,
            trend_chart,
            likes_chart,
            separator.join(comments_dict["Positive"]),
            separator.join(comments_dict["Neutral"]),
            separator.join(comments_dict["Negative"]),
            counts,
        )

    analyze_btn.click(
        fn=update_ui,
        inputs=video_url_input,
        outputs=[total_comments, sentiment_chart, sentiment_trend_chart, likes_dislikes_chart,
                 positive_comments, neutral_comments, negative_comments, counts_display]
    )

demo.launch()


### Uploading my model to the Hugging Face Hub

In [None]:
!pip install -q huggingface_hub


In [None]:
import os

from huggingface_hub import login

# Never paste the access token into the notebook. It is read from the HF_TOKEN
# environment variable; when that is unset, `token=None` makes `login` prompt for it.
login(token=os.environ.get("HF_TOKEN"))


In [None]:
from googleapiclient.discovery import build
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import matplotlib.pyplot as plt
from datetime import datetime
from collections import defaultdict
import numpy as np

# Setup device and model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the same architecture as in training, then overwrite its weights from the checkpoint.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load checkpoint
# NOTE(review): this is a different file from the checkpoint loaded in the Gradio cell —
# confirm that this is the model intended for publication.
model_path = "/content/drive/MyDrive/youtube_sentiment_analysis/checkpoint112_epoch4.pt"
checkpoint = torch.load(model_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)
model.eval()

In [None]:
# Prepare Your Model for Upload
# Write model and tokenizer into one local folder, which the next cell uploads as a unit.

model.save_pretrained("youtube_sentiment_bert")
tokenizer.save_pretrained("youtube_sentiment_bert")


In [None]:
# Push to Hugging Face Hub
from huggingface_hub import create_repo, upload_folder

# Create the private repo if it does not exist yet. `exist_ok=True` keeps this cell
# re-runnable: without it the call raises once the repo has been created.
create_repo(repo_id="nitish-11/youtube_sentiment_analysis_bert", private=True, exist_ok=True)

# Upload local folder to Hugging Face Hub
upload_folder(
    repo_id="nitish-11/youtube_sentiment_analysis_bert",
    folder_path="youtube_sentiment_bert",
    commit_message="Initial model upload"
)
