In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity

In [1]:
!pip install --upgrade gdown

Collecting gdown
  Downloading gdown-5.1.0-py3-none-any.whl (17 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.7.3
    Uninstalling gdown-4.7.3:
      Successfully uninstalled gdown-4.7.3
Successfully installed gdown-5.1.0


In [9]:
!gdown 11V7fT7uCQdxVSmg9iWIpknDGK04TfPYRLal1IHkcu04

Downloading...
From (original): https://drive.google.com/uc?id=11V7fT7uCQdxVSmg9iWIpknDGK04TfPYRLal1IHkcu04
From (redirected): https://docs.google.com/spreadsheets/d/11V7fT7uCQdxVSmg9iWIpknDGK04TfPYRLal1IHkcu04/export?format=xlsx
To: /content/Recommend videos sample.xlsx
0.00B [00:00, ?B/s]173kB [00:00, 2.83MB/s]


In [11]:
rcm_path = r'Recommend videos sample.xlsx'
recommend_data = pd.read_excel(rcm_path)
recommend_data.tail()

Unnamed: 0,Title_Description,View Count,Like Count,Topic,Title,tokens,clean_title
995,Bach Study Music Playlist Instrumental Classic...,4352440,33238,Music,Bach Study Music Playlist 🎻 Instrumental Class...,"['bach', 'study', 'music', 'playlist', 'instru...",bach study music playlist instrumental classic...
996,Working of institutions class full chapter Cla...,2641564,84615,NonProfit & Activism,Working of institutions class 9 full chapter (...,"['working', 'institution', 'class', 'full', 'c...",working institution class full chapter class c...
997,Saudi Arabia Introduces First Female Sara SAUD...,126202,1852,Science & Technology,Saudi Arabia Introduces First Al-Powered Femal...,"['saudi', 'arabia', 'introduces', 'first', 'fe...",saudi arabia introduces first female sara saud...
998,Bouncing Seals Ultimate The seals are Blowout,6428417,123227,Animals,Bouncing Seals Ultimate 2,"['bouncing', 'seal', 'ultimate', 'seal', 'blow...",bouncing seal ultimate seal blowout
999,Success Story Augmented Reality wayfinding wit...,2883,31,Science & Technology,Success Story Augmented Reality wayfinding wit...,"['success', 'story', 'augmented', 'reality', '...",success story augmented reality wayfinding vie...


In [12]:
video = recommend_data['clean_title']
topic = recommend_data['Topic']
tokens = recommend_data['tokens']
re_title = recommend_data['Title']
dataset_videos = video.tolist()

In [13]:
recommend_df = recommend_data[['clean_title', 'Topic']].copy()
recommend_df.dropna(inplace=True)
dataset_videos = recommend_df['clean_title'].tolist()
topic = recommend_df['Topic']

In [14]:
# Tạo và cấu hình vectorizer
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
vectorizer.fit(dataset_videos)

classifier = LogisticRegression()
classifier.fit(vectorizer.transform(dataset_videos), topic)

In [15]:
def classify_video_title(title, classifier, vectorizer):
    # Transform the title into a feature vector
    title_vector = vectorizer.transform([title])
    # Predict the label using the classifier model
    predicted_label = classifier.predict(title_vector)[0]
    return predicted_label

In [16]:
def compute_similarity(title, videos, labels, predicted_label, vectorizer, recommended_title):
    same_label_indices = [i for i, label in enumerate(labels) if label == predicted_label]
    same_label_videos = videos.iloc[same_label_indices].dropna().tolist()
    same_label_parameters = recommended_title.iloc[same_label_indices]

    title_vector = vectorizer.transform([title])
    same_label_vectors = vectorizer.transform(same_label_videos)

    similarities = cosine_similarity(title_vector, same_label_vectors)

    top_indices = np.argsort(similarities.flatten())[-5:]
    top_videos = same_label_parameters.iloc[top_indices]
    return top_videos

In [17]:
# Người dùng nhập tựa đề video
user_input = input('Nhập đoạn văn cần phân loại: ')

# Phân loại tựa đề video
predicted_label = classify_video_title(user_input, classifier, vectorizer)

# Tính độ tương đồng và gợi ý video
recommended_videos = compute_similarity(user_input, recommend_df['clean_title'], topic , predicted_label, vectorizer, re_title )


# Hiển thị kết quả gợi ý
print('Video thuộc topic:', predicted_label)
print('Gợi ý 5 video có độ tương đồng cao nhất:')
for i, video in enumerate(recommended_videos, 1):
    print(f'  {i}. {video}')

Nhập đoạn văn cần phân loại: ádafsdfasfasdassdbajsgdgjasdg
Video thuộc topic: Sports
Gợi ý 5 video có độ tương đồng cao nhất:
  1. I Watched Messi Win The World Cup LIVE
  2. Cricket Run out😍 whatsapp status || Indian cricket status || Cricket status 🌹Awesome #Cricket#Short
  3. Part-2 | Entry On Wwe Theme Song 🤣 #shorts
  4. Camera Man Beats Olympic Runners in a 10k Race!!! #shorts
  5. বাঙালি 🎇 ছেলের পায়ের 🔥অসাধারণ 😯 গোল ⚽ #messifootball #vairalvideo #bangal #football #volleyball
