In [1]:
from googleapiclient.discovery import build
import re
import numpy as np
import pandas as pd
import joblib
import seaborn
import matplotlib.pyplot as plt
import datetime
import time
from datetime import date, datetime
from sentence_transformers import SentenceTransformer
import isodate

# URL of the YouTube video to analyse, read interactively from stdin.
video_link = input()

# Prefer a key supplied through the YOUTUBE_API_KEY environment variable so the
# credential does not have to live in source control.
# NOTE(security): the literal fallback below is kept only so existing runs keep
# working; a key committed to a notebook should be treated as leaked, revoked,
# and then removed from this file.
import os  # imported here so this setup block stays self-contained

API_KEY = os.environ.get("YOUTUBE_API_KEY") or "AIzaSyABnAkGOwwfFp0zSYuxh5qBclovq8dIG48"

# YouTube Data API v3 client shared by the helper functions below.
youtube = build("youtube", "v3", developerKey=API_KEY)

def get_video_id(youtube_url):
    """Extract the 11-character video ID from a YouTube URL.

    The ID is the first token that directly follows ``v=`` or ``/`` and is
    exactly 11 characters drawn from ``[0-9A-Za-z_-]``.

    Args:
        youtube_url: URL string to search.

    Returns:
        The 11-character video ID.

    Raises:
        ValueError: If no video ID can be found in ``youtube_url``.
    """
    # The negative lookahead rejects tokens longer than 11 characters, so the
    # prefix of an unrelated path segment (e.g. "/feed/subscriptions" or a
    # 24-character channel ID) is no longer mistaken for a video ID.
    match = re.search(
        r"(?:v=|/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])",
        youtube_url,
    )
    if match is None:
        raise ValueError("Không tìm thấy video ID trong URL!")
    return match.group(1)

def _parse_published_at(published_at):
    """Parse a ``publishedAt`` timestamp into a timezone-aware UTC datetime.

    Accepts ``YYYY-MM-DDTHH:MM:SSZ`` as well as the same form with fractional
    seconds (``YYYY-MM-DDTHH:MM:SS.fffZ``).

    Raises:
        ValueError: If ``published_at`` matches neither format.
    """
    # Local import: the file only imports ``date`` and ``datetime`` by name.
    from datetime import timezone

    try:
        parsed = datetime.strptime(published_at, "%Y-%m-%dT%H:%M:%SZ")
    except ValueError:
        parsed = datetime.strptime(published_at, "%Y-%m-%dT%H:%M:%S.%fZ")
    return parsed.replace(tzinfo=timezone.utc)


def get_video_info(youtube_url):
    """Fetch metadata for a YouTube video through the ``youtube`` API client.

    Args:
        youtube_url: URL of the video; its ID is extracted with
            ``get_video_id``.

    Returns:
        A dict with the keys ``title``, ``description``, ``tags``,
        ``category_id`` (int), ``category`` (category title, or
        "Không xác định" when the category lookup returns nothing),
        ``publish_date`` (the raw ``publishedAt`` string),
        ``days_since_publish`` (whole days between ``publishedAt`` and now,
        both in UTC) and ``video_durations`` (duration in seconds, 0 when the
        duration cannot be parsed).

    Raises:
        ValueError: If the URL contains no video ID, the API returns no item
            for the ID, ``publishedAt`` cannot be parsed, or ``categoryId``
            is not an integer string.
    """
    from datetime import timezone

    video_id = get_video_id(youtube_url)

    response = youtube.videos().list(
        part="snippet,statistics,contentDetails",
        id=video_id,
    ).execute()

    items = response.get("items")
    if not items:
        raise ValueError("Không tìm thấy video với ID này!")

    video_info = items[0]["snippet"]
    content_details = items[0].get("contentDetails", {})

    title = video_info.get("title", "")
    description = video_info.get("description", "")
    tags = video_info.get("tags", [])
    category_id = video_info.get("categoryId", "")

    # Resolve the numeric category ID to its human-readable title.
    category_response = youtube.videoCategories().list(
        part="snippet",
        id=category_id,
    ).execute()
    category_items = category_response.get("items")
    if category_items:
        category = category_items[0]["snippet"]["title"]
    else:
        category = "Không xác định"

    publish_date = video_info.get("publishedAt", "")

    # Both operands are timezone-aware UTC values, so the subtraction is
    # well-defined and avoids the deprecated naive ``datetime.utcnow()``.
    publish_datetime = _parse_published_at(publish_date)
    current_datetime = datetime.now(timezone.utc)
    days_since_publish = (current_datetime - publish_datetime).days

    duration_iso = content_details.get("duration", "PT0S")
    try:
        duration_seconds = isodate.parse_duration(duration_iso).total_seconds()
    except Exception:
        # Deliberate best-effort: an unparseable duration is reported as 0.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate.
        duration_seconds = 0

    return {
        "title": title,
        "description": description,
        "tags": tags,
        "category_id": int(category_id),
        "category": category,
        "publish_date": publish_date,
        "days_since_publish": days_since_publish,
        "video_durations": duration_seconds,
    }

# Fetch the metadata for the requested video and echo it; on any failure the
# error is printed and the run is aborted.
try:
    video_info = get_video_info(video_link)
    days = video_info["days_since_publish"]
    duration = video_info["video_durations"]
    for caption, key in (
        ("Title:", "title"),
        ("Tags:", "tags"),
        ("Category:", "category"),
        ("Category ID:", "category_id"),
        ("Publish Date:", "publish_date"),
    ):
        print(caption, video_info[key])
    print("Days since publish:", days)
    print("Video duration (seconds):", duration)
except Exception as error:
    print("Lỗi:", str(error))
    exit()

def clean_description(description):
    """Return ``description`` stripped of links, symbols and extra whitespace.

    Three passes are applied in order: URLs (``http://``, ``https://`` or
    ``www.`` prefixed) are removed, every character other than ASCII letters,
    digits, whitespace and ``. , ! ?`` is dropped, and runs of whitespace are
    collapsed to single spaces with the ends trimmed.
    """
    without_links = re.sub(r'http[s]?://\S+|www\.\S+', '', description)
    allowed_chars_only = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', without_links)
    words = allowed_chars_only.split()
    return ' '.join(words)

# Replace the stored description with its cleaned form, then show the result.
video_info.update(description=clean_description(video_info["description"]))
print("\nDescription (after cleaning):", video_info["description"])

ModuleNotFoundError: No module named 'isodate'

In [None]:
def combine_text(video_info):
    """Join title, description and tags into one " [SEP] "-delimited string.

    Args:
        video_info: Mapping with ``title``, ``description`` and ``tags`` keys.
            ``tags`` is space-joined; an empty or falsy ``tags`` value
            contributes an empty string.

    Returns:
        ``"<title> [SEP] <description> [SEP] <tags>"``.
    """
    tags = video_info["tags"]
    tags_str = " ".join(tags) if tags else ""
    return "{} [SEP] {} [SEP] {}".format(
        video_info["title"],
        video_info["description"],
        tags_str,
    )

# Build the "[SEP]"-joined title/description/tags text for the fetched video
# and print it for inspection.
# NOTE(review): text_combined is rebound further down in this file (to title +
# " " + description) before the embedding is computed, so the value built here
# is only printed — confirm which variant is intended.
text_combined = combine_text(video_info)
print(text_combined)

In [None]:
# Load the 'all-MiniLM-L6-v2' sentence-embedding model.
model_bert = SentenceTransformer('all-MiniLM-L6-v2')
# NOTE(review): this rebinds text_combined to title + " " + description only;
# the tags and "[SEP]" separators produced by combine_text() are not part of
# the embedded text — confirm this matches how the training text was built.
text_combined = video_info["title"] + " " + video_info["description"]
# encode() is given a one-element list; [0] takes the embedding of that item.
text_embedding = model_bert.encode([text_combined])[0]  # 384-dimensional vector

# Raw (unscaled) metadata features taken from the fetched video info.
category_id = video_info["category_id"]
# NOTE(review): the feature named days_to_trending is filled with
# days_since_publish here — confirm that this is the quantity the scaler and
# model were fitted on.
days_to_trending = video_info["days_since_publish"]
video_durations = video_info["video_durations"]

# Load the persisted scalers, one per feature group. The paths are relative,
# so the .pkl files are resolved against the current working directory.
# presumably these were fitted during model training — verify.
scaler_text = joblib.load('scaler_text_xgb.pkl')
scaler_category = joblib.load('scaler_category_xgb.pkl')
scaler_days = joblib.load('scaler_days_xgb.pkl')
scaler_duration = joblib.load('scaler_duration_xgb.pkl')

# Each transform() receives a single-row 2-D input; the embedding row is taken
# back out with [0], and each scalar feature with [0][0]. The original names
# are rebound to the scaled values.
text_embedding = scaler_text.transform([text_embedding])[0]
category_id = scaler_category.transform([[category_id]])[0][0]
days_to_trending = scaler_days.transform([[days_to_trending]])[0][0]
video_durations = scaler_duration.transform([[video_durations]])[0][0]

# Concatenate into one flat vector in the order: category, days, duration,
# text embedding; then reshape to a single-row 2-D array.
# NOTE(review): this column order presumably has to match the order used when
# the model was trained — confirm.
X_combined = np.hstack([
    np.array([category_id]),
    np.array([days_to_trending]),
    np.array([video_durations]),
    text_embedding
]).reshape(1, -1)

# Load the persisted classifier (path is relative to the working directory).
model = joblib.load('xgboost_model.pkl')

# Probability for each class, for the single input row.
prediction = model.predict_proba(X_combined)
probabilities = prediction[0]
predicted_class = np.argmax(prediction, axis=1)[0]

# Human-readable name for each class index.
labels = {0: "Not popular", 1: "Controversy", 2: "Decent", 3: "Overwhelming positive"}

# Class 3 is the strongest recommendation, classes 1 and 2 a plain
# recommendation, and anything else is not recommended.
recommendation = {
    3: "absolutely recommend",
    1: "recommend",
    2: "recommend",
}.get(predicted_class, "not recommend")

print(f"\nPredicted label: {labels[predicted_class]}")
print(f"Recommendation: {recommendation}")
print("\nPhần trăm dự đoán cho từng nhãn:")
for i, prob in enumerate(probabilities):
    percentage = prob * 100
    print("{}: {:.2f}%".format(labels[i], percentage))