In [None]:
from googleapiclient.discovery import build
import re
import numpy as np
import pandas as pd
import joblib
import seaborn
import matplotlib.pyplot as plt
import datetime
import time
from datetime import date, datetime
from sentence_transformers import SentenceTransformer
from tensorflow.keras.models import load_model
import isodate

import os

# Ask the user for the video to analyse; the raw string is parsed later by
# get_video_id().
video_link = input("Nhập URL video YouTube: ")

# SECURITY: prefer a key supplied through the YOUTUBE_API_KEY environment
# variable so credentials do not have to live in source control.
# NOTE(review): the literal below is kept only so existing runs keep working;
# it has been committed in plain text and should be rotated and then removed.
API_KEY = os.environ.get("YOUTUBE_API_KEY") or "AIzaSyABnAkGOwwfFp0zSYuxh5qBclovq8dIG48"

# Shared YouTube Data API v3 client used by get_video_info().
youtube = build("youtube", "v3", developerKey=API_KEY)

def get_video_id(youtube_url):
    """Extract the 11-character video ID from a YouTube URL.

    The ID is the first run of exactly 11 ID characters (ASCII letters,
    digits, ``_`` or ``-``) that directly follows ``v=`` or ``/``. The run
    must not be followed by another ID character, so longer path segments
    such as ``/feed/subscriptions`` are rejected instead of being truncated
    into a bogus ID.

    Args:
        youtube_url: URL text, e.g. a ``watch?v=``, ``youtu.be/``,
            ``/embed/`` or ``/shorts/`` link.

    Returns:
        The 11-character video ID.

    Raises:
        ValueError: if no video ID can be found in ``youtube_url``.
    """
    match = re.search(
        r"(?:v=|/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])",
        youtube_url,
    )
    if match is None:
        raise ValueError("Không tìm thấy video ID trong URL!")
    return match.group(1)

def _parse_published_at(published_at):
    """Parse a ``publishedAt`` timestamp into a naive datetime (UTC wall time).

    Accepts both ``2017-01-30T10:57:50Z`` and the fractional-second form
    ``2017-01-30T10:57:50.000Z``.

    Raises:
        ValueError: if the text matches neither layout (including ``""``).
    """
    try:
        return datetime.strptime(published_at, "%Y-%m-%dT%H:%M:%SZ")
    except ValueError:
        return datetime.strptime(published_at, "%Y-%m-%dT%H:%M:%S.%fZ")


def get_video_info(youtube_url):
    """Fetch metadata for one YouTube video through the Data API client.

    Uses the module-level ``youtube`` client: one ``videos().list`` call for
    the video itself and one ``videoCategories().list`` call to resolve the
    category title.

    Args:
        youtube_url: URL of the video; parsed with ``get_video_id``.

    Returns:
        A dict with the keys ``title``, ``description``, ``tags``,
        ``category_id`` (int), ``category``, ``publish_date`` (the raw
        ``publishedAt`` string), ``days_since_publish`` (whole days between
        publication and now, both in UTC) and ``video_durations`` (seconds;
        0 when the duration cannot be parsed).

    Raises:
        ValueError: if the URL holds no video ID, the API returns no item for
            the ID, the publish timestamp cannot be parsed, or the category
            id is not an integer string.
    """
    # Local import: the file-level imports only bring in `date` and `datetime`.
    from datetime import timezone

    video_id = get_video_id(youtube_url)

    # "statistics" is requested as in the original call although only
    # snippet and contentDetails are read below.
    response = youtube.videos().list(
        part="snippet,statistics,contentDetails",
        id=video_id
    ).execute()

    items = response.get("items")
    if not items:
        raise ValueError("Không tìm thấy video với ID này!")

    video_info = items[0]["snippet"]
    content_details = items[0]["contentDetails"]

    title = video_info.get("title", "")
    description = video_info.get("description", "")
    tags = video_info.get("tags", [])
    category_id = video_info.get("categoryId", "")

    # Resolve the numeric category id to its title; fall back to a
    # placeholder when the API returns no matching category.
    category_response = youtube.videoCategories().list(
        part="snippet",
        id=category_id
    ).execute()
    category_items = category_response["items"]
    if category_items:
        category = category_items[0]["snippet"]["title"]
    else:
        category = "Không xác định"

    publish_date = video_info.get("publishedAt", "")
    publish_datetime = _parse_published_at(publish_date)

    # datetime.utcnow() is deprecated; take an aware "now" and drop the
    # tzinfo so it can be subtracted from the naive publish timestamp.
    current_datetime = datetime.now(timezone.utc).replace(tzinfo=None)
    days_since_publish = (current_datetime - publish_datetime).days

    duration_iso = content_details.get("duration", "PT0S")
    try:
        duration_seconds = isodate.parse_duration(duration_iso).total_seconds()
    except (ValueError, TypeError):
        # Best effort: an unparseable or non-string duration counts as 0 s.
        # ValueError also covers isodate's ISO8601Error subclass.
        duration_seconds = 0

    return {
        "title": title,
        "description": description,
        "tags": tags,
        "category_id": int(category_id),
        "category": category,
        "publish_date": publish_date,
        "days_since_publish": days_since_publish,
        "video_durations": duration_seconds
    }

def clean_description(description):
    """Return ``description`` reduced to plain ASCII text on a single line.

    Steps, in order:
      1. drop ``http(s)://...`` and ``www....`` links,
      2. drop every character that is not an ASCII letter, a digit,
         whitespace or one of ``. , ! ?``,
      3. collapse whitespace runs to single spaces and trim both ends.
    """
    without_links = re.sub(r'http[s]?://\S+|www\.\S+', '', description)
    allowed_only = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', without_links)
    words = allowed_only.split()
    return ' '.join(words)

# Fetch the metadata for the entered URL, echo it, and store the cleaned
# description back into video_info. Any failure is reported and ends the run.
try:
    video_info = get_video_info(video_link)
    days = video_info["days_since_publish"]
    duration = video_info["video_durations"]

    # (caption, video_info key) pairs, printed in this order.
    for caption, key in (
        ("Title:", "title"),
        ("Tags:", "tags"),
        ("Category:", "category"),
        ("Category ID:", "category_id"),
        ("Publish Date:", "publish_date"),
        ("Days since publish:", "days_since_publish"),
        ("Video duration (seconds):", "video_durations"),
    ):
        print(caption, video_info[key])

    cleaned_description = clean_description(video_info["description"])
    video_info["description"] = cleaned_description
    print("\nDescription (after cleaning):", video_info["description"])
except Exception as err:
    print("Lỗi:", err)
    exit()




Nhập URL video YouTube:  https://www.youtube.com/watch?v=JGwWNGJdvx8


Title: Ed Sheeran - Shape of You (Official Music Video)
Tags: ['Ed Sheeran', 'Shape Of You', 'Ed Sheeran Shape Of You', 'Shape Of You Song', 'Ed Sheeran Official Video', 'Shape Of You Official Video', 'Ed Sheeran music', 'Ed Sheeran Lyrics', 'Shape Of You Lyrics', 'Ed Sheeran 2017', 'divide', 'Ed Sheeran divide', 'Ed Sheeran third album', 'Ed Sheeran 3rd album', 'Ed Sheeran songs', 'Pop', 'Pop Music', 'Ed', 'Sheeran', 'Ed Sheerin', 'EdSheeran', 'Ed Sheran', 'English Songs', 'English Song', "i'm in love with the shape of you", "i'm in love with your body", 'एड शीरन', 'तुम्हारी सूरत']
Category: Music
Category ID: 10
Publish Date: 2017-01-30T10:57:50Z
Days since publish: 3022
Video duration (seconds): 264.0

Description (after cleaning): The official music video for Ed Sheeran Shape Of You Taken from the studio album divide released in 2017, which featured the hit singles Castle on the Hill, Shape of You, Galway Girl, Perfect Happier. Subscribe to the Ed Sheeran channel for all the best a

In [None]:
# Build the model inputs from video_info, scale them with the saved scalers,
# and run the saved Keras model once on this single video.

# Embed title + description as one text with the sentence-transformer model.
model_bert = SentenceTransformer('all-MiniLM-L6-v2')
text_combined = video_info["title"] + " " + video_info["description"]
text_embedding = model_bert.encode([text_combined])[0]  # 384-dimensional vector

# Scalar features taken straight from the fetched metadata.
# NOTE(review): the feature is named days_to_trending but is filled with
# days_since_publish — confirm this matches the feature the model was trained on.
category_id = video_info["category_id"]
days_to_trending = video_info["days_since_publish"]
video_durations = video_info["video_durations"]


# One saved scaler per input, loaded from the current working directory.
# joblib.load unpickles its input, so these files must come from a trusted source.
scaler_text = joblib.load('scaler_text.pkl')
scaler_category = joblib.load('scaler_category.pkl')
scaler_days = joblib.load('scaler_days.pkl')
scaler_duration = joblib.load('scaler_duration.pkl')

# transform() is given a single-row 2-D input; the row (or its only value)
# is unwrapped again right away.
text_embedding = scaler_text.transform([text_embedding])[0]  # Shape: (384,)
category_id = scaler_category.transform([[category_id]])[0][0]  # Scalar
days_to_trending = scaler_days.transform([[days_to_trending]])[0][0]  # Scalar
video_durations = scaler_duration.transform([[video_durations]])[0][0]  # Scalar

# Re-wrap every feature with a leading batch dimension of 1.
X_text = np.array([text_embedding])  # Shape: (1, 384)
X_category = np.array([[category_id]])  # Shape: (1, 1)
X_days = np.array([[days_to_trending]])  # Shape: (1, 1)
X_duration = np.array([[video_durations]])  # Shape: (1, 1)

model = load_model('duration_neural_network.h5')

# The four inputs are passed as a list in the order text, category, days, duration.
prediction = model.predict([X_text, X_category, X_days, X_duration])  # Shape: (1, 4)
# Index of the highest score in the single output row, and that row itself.
predicted_class = np.argmax(prediction, axis=1)[0]
probabilities = prediction[0]

# Display name for each class index produced by the model.
labels = {0: "Not popular", 1: "Controversy", 2: "Decent", 3: "Overwhelming positive"}

# Wording per predicted class: 3 is the strongest, 1 and 2 a plain
# recommendation, and every other class falls back to "not recommend".
recommendation_by_class = {
    1: "recommend",
    2: "recommend",
    3: "absolutely recommend",
}
recommendation = recommendation_by_class.get(predicted_class, "not recommend")

print(f"\nPredicted label: {labels[predicted_class]}")
print(f"Recommendation: {recommendation}")
print("\nPhần trăm dự đoán cho từng nhãn:")
# One line per class: its label and its score as a percentage with two decimals.
for class_index, probability in enumerate(probabilities):
    print(f"{labels[class_index]}: {probability * 100:.2f}%")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step

Predicted label: Decent
Recommendation: recommend

Phần trăm dự đoán cho từng nhãn:
Not popular: 1.79%
Controversy: 0.00%
Decent: 98.21%
Overwhelming positive: 0.00%
