<a href="https://colab.research.google.com/github/mollard05/kdrama-ai/blob/main/kdrama_ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [220]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [221]:
# Load the dataset

try:
    df = pd.read_csv('kdrama.csv')
except FileNotFoundError:
    print("Error: The CSV file was not found. Please upload your file and rename it, or update the file path.")
    # Creating a small dummy DataFrame for demonstration if the file is missing
    df = pd.DataFrame({
        'Title': ['Crash Landing on You', 'Goblin', 'Itaewon Class', 'Vincenzo', 'Hospital Playlist'],
        'Genre': ['Romance, Comedy', 'Fantasy, Romance', 'Business, Youth', 'Crime, Black Comedy', 'Life, Medical'],
        'Cast': ['Hyun Bin, Son Ye Jin', 'Gong Yoo, Kim Go Eun', 'Park Seo Joon, Kim Da Mi', 'Song Joong Ki, Jeon Yeo Been', 'Jo Jung Suk, Yoo Yeon Seok']
    })
    print("Using dummy data for demonstration.")

# Display the first few rows and check for necessary columns (Title, Genre, Cast, etc.)
df.head()

Unnamed: 0,Name,Aired Date,Year of release,Original Network,Aired On,Number of Episodes,Duration,Content Rating,Rating,Synopsis,Genre,Tags,Director,Screenwriter,Cast,Production companies,Rank
0,Move to Heaven,"May 14, 2021",2021,Netflix,Friday,10,52 min.,18+ Restricted (violence & profanity),9.2,Geu Roo is a young autistic man. He works for ...,"Life, Drama, Family","Autism, Uncle-Nephew Relationship, Death, Sava...",Kim Sung Ho,Yoon Ji Ryun,"Lee Je Hoon, Tang Jun Sang, Hong Seung Hee, Ju...","Page One Film, Number Three Pictures",#1
1,Flower of Evil,"Jul 29, 2020 - Sep 23, 2020",2020,tvN,"Wednesday, Thursday",16,1 hr. 10 min.,15+ - Teens 15 or older,9.1,Although Baek Hee Sung is hiding a dark secret...,"Thriller, Romance, Crime, Melodrama","Married Couple, Deception, Suspense, Family Se...","Kim Chul Gyu, Yoon Jong Ho",Yoo Jung Hee,"Lee Joon Gi, Moon Chae Won, Jang Hee Jin, Seo ...",Monster Union,#2
2,Hospital Playlist,"Mar 12, 2020 - May 28, 2020",2020,"Netflix, tvN",Thursday,12,1 hr. 30 min.,15+ - Teens 15 or older,9.1,The stories of people going through their days...,"Friendship, Romance, Life, Medical","Strong Friendship, Multiple Mains, Best Friend...",Shin Won Ho,Lee Woo Jung,"Jo Jung Suk, Yoo Yeon Seok, Jung Kyung Ho, Kim...","Egg Is Coming, CJ ENM",#3
3,Hospital Playlist 2,"Jun 17, 2021 - Sep 16, 2021",2021,"Netflix, tvN",Thursday,12,1 hr. 40 min.,15+ - Teens 15 or older,9.1,Everyday is extraordinary for five doctors and...,"Friendship, Romance, Life, Medical","Workplace, Strong Friendship, Best Friends, Mu...",Shin Won Ho,Lee Woo Jung,"Jo Jung Suk, Yoo Yeon Seok, Jung Kyung Ho, Kim...","Egg Is Coming, CJ ENM",#4
4,My Mister,"Mar 21, 2018 - May 17, 2018",2018,tvN,"Wednesday, Thursday",16,1 hr. 17 min.,15+ - Teens 15 or older,9.1,Park Dong Hoon is a middle-aged engineer who i...,"Psychological, Life, Drama, Family","Age Gap, Nice Male Lead, Strong Female Lead, H...","Kim Won Suk, Kim Sang Woo",Park Hae Young,"Lee Sun Kyun, IU, Park Ho San, Song Sae Byuk, ...",Chorokbaem Media,#5


In [222]:
# Drop any dramas with missing data in key columns
# print(df.columns)
df.dropna(subset=['Name', 'Genre', 'Tags','Synopsis','Cast'], inplace=True)
df.reset_index(drop=True, inplace=True)
# Function to combine relevant features into a single string
def combine_features(row):
    # Standardize the text and separate words by spaces (important for vectorization)
    genre = str(row['Genre']).replace(',', ' ').lower()
    tags = str(row['Tags']).replace(',', ' ').lower()
    cast = str(row['Cast']).replace(',', ' ').lower()
    synopsis = str(row['Synopsis']).replace(',', ' ').lower()
    # Add other fields like 'Synopsis' or 'Tags' if your dataset has them

    return f"{genre} {tags} {cast} {synopsis}"

# Apply the function to create the 'features' column
df['features'] = df.apply(combine_features, axis=1)

df[['Name', 'features']].head()

Unnamed: 0,Name,features
0,Move to Heaven,life drama family autism uncle-nephew re...
1,Flower of Evil,thriller romance crime melodrama marrie...
2,Hospital Playlist,friendship romance life medical strong ...
3,Hospital Playlist 2,friendship romance life medical workpla...
4,My Mister,psychological life drama family age gap...


In [223]:
# Initialize the TF-IDF Vectorizer
# stop_words='english' removes common English words like 'the', 'a', etc.
tfidf = TfidfVectorizer(stop_words='english')

# Construct the TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df['features'])

print(f"TF-IDF Matrix Shape: {tfidf_matrix.shape}")
# The shape is (Number of Dramas, Number of Unique Features)

TF-IDF Matrix Shape: (250, 4181)


In [224]:
# Compute the Cosine Similarity matrix
# Cosine Similarity measures the angle between two vectors (dramas).
# A smaller angle (closer to 1) means more similar.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

print(f"Cosine Similarity Matrix Shape: {cosine_sim.shape}")

Cosine Similarity Matrix Shape: (250, 250)


In [225]:
name_to_index = pd.Series(df.index, index=df['Name']).to_dict()

def get_multi_recommendations(watched_dramas_list, df=df, tfidf_matrix=tfidf_matrix, name_to_index=name_to_index, top_n=10):

    # 1. Get the indices of the watched dramas
    watched_indices = []
    found_titles = []

    for name in watched_dramas_list:
        if name in name_to_index:
            watched_indices.append(name_to_index[name])
            found_titles.append(name)
        else:
            print(f"Warning: '{name}' not found in the database. Skipping.")

    if not watched_indices:
        return "Error: None of the watched dramas were found in the database. Please check the spelling."

    # 2. Extract the TF-IDF vectors for all watched dramas
    # The tfidf_matrix is a sparse matrix, which is memory efficient.
    watched_vectors = tfidf_matrix[watched_indices]

    # 3. Calculate the "Super-Vector" by averaging the vectors
    # .mean(axis=0) averages the rows (dramas) to create one vector
    # .flatten() converts the result into a simple 1D array
    preference_vector = np.asarray(watched_vectors.mean(axis=0)).flatten()

    # 4. Calculate the Cosine Similarity between the Super-Vector and ALL dramas
    # Reshape preference_vector to (1, -1) to match the expected input format for cosine_similarity
    preference_vector_reshaped = preference_vector.reshape(1, -1)

    # Calculate similarity between the preference vector and all drama vectors
    # Note: We use tfidf_matrix here, not the pre-calculated cosine_sim matrix
    similarity_scores = cosine_similarity(preference_vector_reshaped, tfidf_matrix).flatten()

    # 5. Get the top N similar dramas (and their indices)
    # Enumerate scores to keep track of original index
    sim_scores = list(enumerate(similarity_scores))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # 6. Filter out the dramas the user has ALREADY watched
    recommended_dramas = []
    count = 0

    for i, score in sim_scores:
        drama_title = df['Name'].iloc[i]

        # Check if the drama is NOT in the list of watched titles
        if drama_title not in found_titles:
            recommended_dramas.append((drama_title, score))
            count += 1
            if count >= top_n:
                break

    # Return just the titles
    return [title for title, score in recommended_dramas]

In [226]:
# --- YOUR INPUT HERE ---
#from all those i watched
# my_watched_list = [
#     'Crash Landing on You',
#     'Itaewon Class',
#     'Reply 1988',
#     "It's Okay to Not Be Okay",
#     'Happiness',
#     'Tomorrow',
#     'The Uncanny Counter',
#     'Weightlifting Fairy Kim Bok Joo',
#     'Hometown Cha-Cha-Cha',
#     'Our Beloved Summer',
#     'Our Blues',
#     'A Business Proposal',
#     'Strong Woman Do Bong Soon',
#     'My Name',
#     'Hotel del Luna',
#     'Descendants of the Sun',
#     'Law School',
#     'My Liberation Notes',
#     'Little Women',
#     'Big Mouth',
#     'Just Between Lovers',
#     '365',
#     'Squid Game',
#     'Dali and the Cocky Prince',
#     'Work Later, Drink Now',
#     'Extracurricular',
#     "Yumi's Cells",
#     'From Now On, Showtime!',
#     'Extraordinary You',
#     'Pinocchio',
#     'Love All Play',
#     'Player',
#     'Rookie Cops',
#     'The Sound of Your Heart',
#     'Run On',
#     'Angry Mom',
#     'Suspicious Partner',
#     'Crazy Love'
# ]
#from those i really enjoyed
my_watched_list = [
    'My Name',
    'Strong Woman Do Bong Soon',
    'A Business Proposal',
    'Big Mouth',
    'Hometown Cha-Cha-Cha',
    'Reply 1988',
    'The Uncanny Counter',
    'Run On',
    'Crazy Love',
    'My Liberation Notes'
]
# -----------------------

recommendations = get_multi_recommendations(my_watched_list, top_n=10)

print(f"Because you watched '{my_watched_list}', you might enjoy the following:")
# print("------------------------------------------------------------------------")
# for i, drama in enumerate(recommendations):
#     print(f"{i+1}. {drama}")

Because you watched '['My Name', 'Strong Woman Do Bong Soon', 'A Business Proposal', 'Big Mouth', 'Hometown Cha-Cha-Cha', 'Reply 1988', 'The Uncanny Counter', 'Run On', 'Crazy Love', 'My Liberation Notes']', you might enjoy the following:


In [227]:
# 1. Re-calculate the preference vector from your watched list
# This part is similar to the beginning of get_multi_recommendations
watched_indices = []
for name in my_watched_list:
    if name in name_to_index:
        watched_indices.append(name_to_index[name])

if not watched_indices:
    print("Error: None of the watched dramas were found in the database. Cannot calculate similarity.")
else:
    watched_vectors = tfidf_matrix[watched_indices]
    preference_vector = np.asarray(watched_vectors.mean(axis=0)).flatten()
    preference_vector_reshaped = preference_vector.reshape(1, -1)

    print("Similarity scores for recommended dramas with your watched list:")
    print("------------------------------------------------------------------------")
    for drama_title in recommendations:
        # Get the index of the recommended drama
        if drama_title in name_to_index:
            drama_index = name_to_index[drama_title]
            drama_vector = tfidf_matrix[drama_index]

            # Calculate cosine similarity
            # Reshape drama_vector to (1, -1) if it's not already
            drama_vector_reshaped = drama_vector.reshape(1, -1)
            similarity = cosine_similarity(preference_vector_reshaped, drama_vector_reshaped)[0][0]
            print(f"- {drama_title}: {similarity:.4f}")
        else:
            print(f"Warning: '{drama_title}' not found in the database. Skipping similarity calculation.")

Similarity scores for recommended dramas with your watched list:
------------------------------------------------------------------------
- Cruel City: 0.2566
- Dr. Romantic: 0.2438
- What's Wrong with Secretary Kim: 0.2420
- Seasons of Blossom: 0.2366
- Itaewon Class: 0.2334
- Fight For My Way: 0.2327
- My Mister: 0.2307
- Be Melodramatic: 0.2279
- Hospital Playlist: 0.2259
- Ugly Alert: 0.2249
