# AI/Machine Learning Intern Challenge: Simple Content-Based Recommendation

In [1]:
#imports
import pandas as pd
!pip install scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer



## Load the Dataset
Download the dataset (CSV) from Kaggle using the link below:
https://www.kaggle.com/datasets/rajugc/imdb-top-250-movies-dataset

Store the path to the downloaded CSV file in the variable `pathname`.

In [2]:
# Path to the downloaded CSV, relative to the notebook's working directory so
# the notebook is not tied to one machine; edit it if the file lives elsewhere.
pathname = "IMDB_Top_250_Movies.csv"

In [3]:
# Load the CSV and immediately narrow it to the two columns the recommender
# uses, discarding any row where either of them is missing.
df = (
    pd.read_csv(pathname)[['name', 'tagline']]
    .dropna()
)
df.head()  # preview the first rows

Unnamed: 0,name,tagline
0,The Shawshank Redemption,Fear can hold you prisoner. Hope can set you f...
1,The Godfather,An offer you can't refuse.
2,The Dark Knight,Why So Serious?
3,The Godfather Part II,All the power on earth can't change destiny.
4,12 Angry Men,Life Is In Their Hands -- Death Is On Their Mi...


## Preprocess the text
Combine each movie's name and tagline into a single text field; the similarity is calculated on this combined text.

In [5]:
# Build the combined text with .assign(), which returns a new frame instead of
# writing a column into `df` in place. `df` was derived from a column selection
# of the raw frame, and in-place column assignment on such a frame is what
# pandas reports via SettingWithCopyWarning (presumably the "unexpected error"
# this cell previously said to ignore -- TODO confirm).
df = df.assign(text=df['name'] + " " + df['tagline'])

## Convert text to TF-IDF vectors

In [6]:
# Fit TF-IDF on the combined name + tagline text. English stop words are
# dropped so the weights focus on meaningful words, and lowercasing keeps
# matching consistent regardless of capitalisation.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
)
tfidf_matrix = vectorizer.fit_transform(df['text'])

## Calculate Similarity & Provide Recommendations

In [7]:
def recommend_movies(user_input, top_n=5):
    """Return the movies whose text best matches ``user_input``.

    Relies on the notebook-level ``vectorizer``, ``tfidf_matrix`` and ``df``
    created in the earlier cells.

    Args:
        user_input: Free-text description of the kind of movie wanted.
        top_n: Maximum number of recommendations to return (default 5).
            Must be non-negative; 0 yields an empty list.

    Returns:
        A list of ``(movie_name, similarity_score)`` tuples ordered from the
        highest to the lowest cosine similarity, with scores rounded to three
        decimals. The list holds at most ``top_n`` entries (fewer when the
        dataset contains fewer movies).

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Transform the input using the vectorizer fitted in the previous cell
    input_vec = vectorizer.transform([user_input])

    # Compute cosine similarity between the input and every movie
    similarity_scores = cosine_similarity(input_vec, tfidf_matrix).flatten()

    # Sort descending, then take the first top_n. Slicing with `[-top_n:]`
    # instead would select every row when top_n == 0, because `-0` is `0`.
    top_indices = similarity_scores.argsort()[::-1][:top_n]

    # Positional lookup: row i of tfidf_matrix corresponds to row i of df.
    names = df['name']
    recommendations = [
        (names.iloc[i], float(round(similarity_scores[i], 3)))
        for i in top_indices
    ]
    return recommendations

In [8]:
# Example usage
# `user_text` is the free-text description used for matching; edit it to
# reflect your own recommendation preferences.
user_text = "Fun thrilling space movie"
recommend_movies(user_text, top_n=5)

[('The Exorcist', 0.324),
 ('Alien', 0.312),
 ('2001: A Space Odyssey', 0.209),
 ('Finding Nemo', 0.2),
 ('The Apartment', 0.142)]

### Interpretation of Similarity Scores

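The similarity score is the cosine similarity between the TF-IDF vector of the user's input and the TF-IDF vector of each movie's text. Because TF-IDF weights are never negative, the score ranges from 0 to 1.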

A similarity score close to 1 signifies that the two text inputs (the user's description and the movie's combined name and tagline) are very **similar**. On the other hand, a score close to 0 signifies that the two text inputs are very **dissimilar**.