In [None]:
from google.colab import drive
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os

# --- 0. Mount Google Drive (best effort; the script continues if this fails) ---
print("Mounting Google Drive...")
try:
    drive.mount('/content/drive')
except Exception as mount_error:
    # Deliberately broad: any mount failure is reported and then tolerated so
    # the CSV can still be read from Colab session storage instead.
    print(f"Error mounting Google Drive: {mount_error}")
    print("Proceeding without Drive. Ensure your CSV is uploaded to Colab session storage if you are not using Drive.")
else:
    print("Google Drive mounted.")


# Folder that holds the CSV and receives the cached embeddings and saved model.
# Points at Google Drive by default.
project_folder_path = '/content/drive/My Drive/UdemyProject/'  # <--- ADJUST THIS PATH
# Not using Google Drive? Comment out the assignment above and enable this one:
# project_folder_path = '.'


# Only a Drive-backed folder is created on demand; any other path is used as-is.
on_drive = project_folder_path.startswith('/content/drive/')
if on_drive:
    os.makedirs(project_folder_path, exist_ok=True)
    print(f"Ensured project folder exists at: {project_folder_path}")


# --- 1. Load the dataset ---
# Reads the course catalogue into `df`; every later step depends on it.
csv_file_path = os.path.join(project_folder_path, 'udemy_course_data.csv')

try:
    df = pd.read_csv(csv_file_path)
except FileNotFoundError:
    print(f"Error: '{csv_file_path}' not found.")
    print("Please upload 'udemy_course_data.csv' to your specified path and try again.")
    # Stop here with a non-zero status. `exit()` is the interactive-shell
    # helper installed by `site`: called without arguments it reports success,
    # and NOTE(review): in a notebook kernel it may not halt the running cell,
    # which would let the code below fail on an undefined `df`.
    raise SystemExit(1) from None
else:
    print("Dataset 'udemy_course_data.csv' loaded successfully.")

# --- 2. NLP for Content Understanding (Generate/Load Embeddings and SAVE THE MODEL) ---
content_df = df[['course_id', 'course_title', 'subject', 'level']].copy()
# Normalise titles: lowercase, then drop everything except ASCII letters,
# digits and whitespace. Missing titles become '' so the encoder only ever
# receives strings. The pattern is a raw string so `\s` reaches the regex
# engine instead of being parsed as an (invalid) string escape.
content_df['cleaned_title'] = (
    content_df['course_title']
    .fillna('')
    .astype(str)
    .str.lower()
    .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
)

embeddings_file_path = os.path.join(project_folder_path, 'course_title_embeddings.npy')
model_save_path = os.path.join(project_folder_path, 'sentence_transformer_model') # New path for saving the model

# Load SentenceTransformer model once globally
print("Loading Sentence-BERT model...")
if os.path.exists(model_save_path):
    # Reuse the copy saved by a previous run.
    embedding_model = SentenceTransformer(model_save_path)
    print(f"Sentence-BERT model loaded from '{model_save_path}'.")
else:
    # First run: fetch the pretrained model, then persist it for later runs.
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    print("Sentence-BERT model downloaded.")
    embedding_model.save(model_save_path)
    print(f"Sentence-BERT model saved to '{model_save_path}'.")


# Reuse cached embeddings only when they still line up with the dataset:
# recommendations look courses up in `df` by embedding row position, so a
# cache built from a different CSV would return the wrong rows (or raise).
course_title_embeddings = None
if os.path.exists(embeddings_file_path):
    cached_embeddings = np.load(embeddings_file_path)
    if cached_embeddings.ndim == 2 and cached_embeddings.shape[0] == len(content_df):
        course_title_embeddings = cached_embeddings
        print("Loaded existing course title embeddings from .npy file.")
    else:
        print(
            f"Cached embeddings have shape {cached_embeddings.shape} but the dataset "
            f"has {len(content_df)} rows; regenerating."
        )

if course_title_embeddings is None:
    print("Generating course title embeddings... This might take a few minutes depending on dataset size.")
    course_title_embeddings = embedding_model.encode(content_df['cleaned_title'].tolist(), show_progress_bar=True)
    np.save(embeddings_file_path, course_title_embeddings)
    print(f"Embeddings generation complete and saved to '{embeddings_file_path}'.")

# --- 3. Define the Recommendation Function from Text Query ---
def get_recommendations_from_text_query(user_query, num_recommendations=5):
    """
    Recommend courses whose titles are most similar to a free-text query.

    The query is lowercased and stripped of everything except ASCII letters,
    digits and whitespace, embedded with the module-level ``embedding_model``,
    and compared by cosine similarity against ``course_title_embeddings``.
    Course details are read from the module-level ``df`` by row position.

    Args:
        user_query (str): The text input from the user (e.g., "I want to learn web development").
        num_recommendations (int): The number of top similar courses to recommend.
            Values of zero or below yield an empty list.

    Returns:
        list: A list of dictionaries, most similar first, each containing
              'course_id', 'course_title', 'url', 'is_paid', 'price',
              'content_duration', 'level', 'subject' and 'similarity_score'.
    """
    # Local import keeps this function self-contained; `re` is not among the
    # file's top-level imports.
    import re

    # `str.replace` treats its first argument literally, so a regex passed to
    # it would never match; `re.sub` applies the same character filter that
    # the course titles were cleaned with.
    cleaned_query = re.sub(r'[^a-zA-Z0-9\s]', '', user_query.lower())
    query_embedding = embedding_model.encode([cleaned_query])[0].reshape(1, -1)

    similarities = cosine_similarity(query_embedding, course_title_embeddings).flatten()

    # Highest similarity first, truncated to the requested count. Clamping at
    # zero keeps a non-positive request from returning the whole catalogue.
    top_k = max(int(num_recommendations), 0)
    recommended_indices = similarities.argsort()[::-1][:top_k]

    recommendations = []
    for rec_idx in recommended_indices:
        rec_course = df.iloc[rec_idx]
        # Cast NumPy scalars to plain Python types for the id, flag, price and score.
        recommendations.append({
            'course_id': int(rec_course['course_id']),
            'course_title': rec_course['course_title'],
            'url': rec_course['url'],
            'is_paid': bool(rec_course['is_paid']),
            'price': float(rec_course['price']),
            'content_duration': rec_course['content_duration'],
            'level': rec_course['level'],
            'subject': rec_course['subject'],
            'similarity_score': float(similarities[rec_idx])
        })

    return recommendations

# --- 4. Example Usage: prompt for a topic and print the matching courses ---
print("\n--- Personalized Course Recommendations (with full details) ---")
user_input = input("What do you want to learn about today? (e.g., 'Python programming for beginners', 'Financial analysis', 'Web design with HTML CSS'):\n")

recommended_courses = get_recommendations_from_text_query(user_input, num_recommendations=5)

if not recommended_courses:
    print("Sorry, no recommendations could be found for your query. Try a different input!")
else:
    print(f"\nHere are 5 courses recommended for you based on '{user_input}':")
    # Ranks are 1-based in the printed list.
    for rank, course in enumerate(recommended_courses, start=1):
        print(f"\n{rank}. **'{course['course_title']}'**")
        print(f"   - **URL:** {course['url']}")
        print(f"   - **Paid:** {course['is_paid']}")
        print(f"   - **Price:** ${course['price']}")
        print(f"   - **Duration:** {course['content_duration']}")
        print(f"   - **Level:** {course['level']}")
        print(f"   - **Subject:** {course['subject']}")
        print(f"   - **Similarity:** {course['similarity_score']:.3f}")

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted.
Ensured project folder exists at: /content/drive/My Drive/UdemyProject/
Dataset 'udemy_course_data.csv' loaded successfully.
Loading Sentence-BERT model...
Sentence-BERT model loaded from '/content/drive/My Drive/UdemyProject/sentence_transformer_model'.
Loaded existing course title embeddings from .npy file.

--- Personalized Course Recommendations (with full details) ---
What do you want to learn about today? (e.g., 'Python programming for beginners', 'Financial analysis', 'Web design with HTML CSS'):
Data scientist


  return forward_call(*args, **kwargs)



Here are 5 courses recommended for you based on 'Data scientist':

1. **'Financial Analysis, from Scratch to Professional!'**
   - **URL:** https://www.udemy.com/financial-analysis-from-scratch-to-professional-level/
   - **Paid:** True
   - **Price:** $65.0
   - **Duration:** 3.5 hours
   - **Level:** Beginner Level
   - **Subject:** Business Finance
   - **Similarity:** 0.399

2. **'Visualizing Data'**
   - **URL:** https://www.udemy.com/visualizing-data/
   - **Paid:** True
   - **Price:** $40.0
   - **Duration:** 6.5 hours
   - **Level:** Intermediate Level
   - **Subject:** Business Finance
   - **Similarity:** 0.393

3. **'Introduction To Data Analytics Using Microsoft Power BI'**
   - **URL:** https://www.udemy.com/data-analytics-powerbi/
   - **Paid:** True
   - **Price:** $45.0
   - **Duration:** 1.5 hours
   - **Level:** All Levels
   - **Subject:** Web Development
   - **Similarity:** 0.389

4. **'Charting for Beginners 101: Technical Analysis Demystified'**
   - **URL:** h

In [None]:
import os
import shutil # For zipping files

# IMPORTANT: this must be the folder that contains 'sentence_transformer_model'
# and 'course_title_embeddings.npy'.
project_folder_path = '/content/drive/My Drive/UdemyProject/'  # <--- ADJUST THIS PATH if needed

# Directory whose contents go into the archive.
model_folder_to_zip = os.path.join(project_folder_path, 'sentence_transformer_model')

# Archive base name; shutil appends the '.zip' extension itself.
output_zip_path = os.path.join(project_folder_path, 'sentence_transformer_model_archive')
zip_file_path = f"{output_zip_path}.zip"

print(f"Zipping '{model_folder_to_zip}' to '{zip_file_path}'...")

try:
    shutil.make_archive(output_zip_path, 'zip', model_folder_to_zip)
    print("Zipping complete.")

    if not project_folder_path.startswith('/content/drive/'):
        # Archive is not under the Drive mount, so hand it to the browser
        # via Colab's download helper.
        from google.colab import files
        files.download(zip_file_path)
        print(f"Downloading '{zip_file_path}' to your local machine.")
    else:
        # Archive was written under the Drive mount; point the user at Drive.
        print(f"The zip file is saved to your Google Drive at: {zip_file_path}")
        print("You can download it directly from your Google Drive web interface.")

except Exception as zip_error:
    # Report any zipping or download failure instead of aborting the cell.
    print(f"An error occurred during zipping or downloading: {zip_error}")
    print("Please ensure the 'model_folder_to_zip' path is correct and the folder exists.")

Zipping '/content/drive/My Drive/UdemyProject/sentence_transformer_model' to '/content/drive/My Drive/UdemyProject/sentence_transformer_model_archive.zip'...
Zipping complete.
The zip file is saved to your Google Drive at: /content/drive/My Drive/UdemyProject/sentence_transformer_model_archive.zip
You can download it directly from your Google Drive web interface.
