In [None]:
!pip install sentence-transformers>=2.7.0 transformers>=4.51.0 accelerate -q

# IMPORT LIBRARIES

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re
import pickle
import json
import os
import torch
from google.colab import files

# MOUNT GOOGLE DRIVE

In [None]:
from google.colab import drive

# Directory under which the Drive contents are exposed; the dataset path used
# when loading the CSV lives below this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

print("Libraries Imported and Drive Mounted!!!")

Mounted at /content/drive
Libraries Imported and Drive Mounted!!!


# LOAD DATA

In [None]:
# Location of the K-drama CSV inside the mounted Google Drive.
file_path = '/content/drive/My Drive/TDATASETS/kdrama.csv'

try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    # A missing file is reported instead of raised; the cells that follow
    # check `df is not None` before doing any work.
    print("File not found!")
    df = None
else:
    print("Dataset loaded successfully!!!")
    print(f'Dataset Shape: {df.shape}')

Dataset loaded successfully!!!
Dataset Shape: (250, 17)


# CLEAN TEXT AND BUILD FEATURE STRINGS

In [None]:
def clean_text(text):
    """Normalise one raw text field before it is embedded.

    The value is converted to ``str``, every character that is not a word
    character, whitespace or a comma is removed, the result is lower-cased,
    and runs of whitespace are collapsed to single spaces.

    Args:
        text: Any value; it is passed through ``str()`` first.

    Returns:
        str: The cleaned text without leading or trailing whitespace.
    """
    text = str(text)
    text = re.sub(r'[^\w\s,]', '', text)
    text = text.lower()
    return re.sub(r'\s+', ' ', text).strip()


def _repeat_text(text, times):
    """Return ``text`` repeated ``times`` times, separated by single spaces."""
    return ' '.join([text] * times)


if df is not None:
    # Replace missing values with empty strings so string concatenation below
    # never sees NaN.
    feature_columns = ['Synopsis', 'Genre', 'Tags', 'Cast', 'Name']
    for col in feature_columns:
        df[col] = df[col].fillna('')

    synopsis_text = df['Synopsis'].apply(clean_text)
    genre_text = df['Genre'].apply(clean_text)
    tags_text = df['Tags'].apply(clean_text)
    cast_text = df['Cast'].apply(clean_text)

    # Synopsis appears three times and cast twice in the combined text.
    # Repetitions are space-separated: a bare `cast_text * 2` would glue the
    # last word of the first copy onto the first word of the second copy.
    df['clean_features'] = (
        synopsis_text.apply(_repeat_text, times=3) + ' ' +
        genre_text + ' ' +
        tags_text + ' ' +
        cast_text.apply(_repeat_text, times=2)
    )
    print("\n'clean_features' column created.")

    # Plain list of feature strings, in DataFrame row order, for the encoder.
    corpus = df['clean_features'].tolist()


'clean_features' column created.


# LOAD PRE-TRAINED MODEL AND GENERATE EMBEDDINGS

In [None]:
if df is not None:
    model_name = 'Qwen/Qwen3-Embedding-0.6B'
    print(f"\nLoading Sentence Transformer model: {model_name}...")

    # Run on the GPU when torch reports one, otherwise fall back to the CPU.
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
    print(f"Using device: {device}")

    model = SentenceTransformer(model_name, device=device)
    print("Model loaded.")

    print("\nGenerating embeddings... (This may take a few minutes)")
    # Encode the whole corpus in batches of 16 with a progress bar.
    encode_options = {'show_progress_bar': True, 'batch_size': 16}
    embeddings = model.encode(corpus, **encode_options)
    print(f"Embeddings generated successfully! Shape: {embeddings.shape}")


Loading Sentence Transformer model: Qwen/Qwen3-Embedding-0.6B...
Using device: cuda
Model loaded.

Generating embeddings... (This may take a few minutes)


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Embeddings generated successfully! Shape: (250, 1024)


# COMPUTE SIMILARITY MATRIX AND EXPORT JSON

In [None]:
if df is not None:
    print("\nCalculating cosine similarity matrix...")
    cosine_sim = cosine_similarity(embeddings)
    print(f"Similarity matrix calculated! Shape: {cosine_sim.shape}")

    # Convert both artefacts to plain Python containers that json can write.
    dramas_data = df[['Name', 'Genre', 'Tags']].to_dict('records')
    similarity_list = cosine_sim.tolist()

    output_data_json = 'kdrama_data_QWEN_V3.json'
    output_matrix_json = 'kdrama_similarity_matrix_QWEN_V3.json'

    def _write_json(payload, path):
        """Write ``payload`` as JSON to ``path``, replacing any existing file."""
        with open(path, 'w') as handle:
            json.dump(payload, handle)

    _write_json(dramas_data, output_data_json)
    print(f"\n K-Drama data saved as: {output_data_json}")

    _write_json(similarity_list, output_matrix_json)
    print(f"Similarity matrix saved as: {output_matrix_json}")

    # Hand both exported files to the browser, metadata first, then the matrix.
    print("\nDownloading files to your local computer...")
    for exported_path in (output_data_json, output_matrix_json):
        files.download(exported_path)
    print("Download complete!")


Calculating cosine similarity matrix...
Similarity matrix calculated! Shape: (250, 250)

 K-Drama data saved as: kdrama_data_QWEN_V3.json
Similarity matrix saved as: kdrama_similarity_matrix_QWEN_V3.json

Downloading files to your local computer...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download complete!
