In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
import re
import neattext.functions as nfx

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
comments_data = pd.read_csv('./Comments.csv')
courses_data = pd.read_csv('./Course_info.csv')

courses_data = courses_data.head(len(courses_data)//4)
comments_data=comments_data.head(len(comments_data)//4)

print('Courses Data Shape is', courses_data.shape)
print('Comments Data Shape is', comments_data.shape)

Courses Data Shape is (52433, 20)
Comments Data Shape is (2352931, 6)


In [None]:
# Check data reliability
courses_data['title'].value_counts()
courses_data['headline'].value_counts()

headline
Treinamento Mental com Tecnologia de Ondas Cerebrais - Brainwave                                                            27
Study Guide for the California Police Academy (P.O.S.T.) Written Examination                                                23
Treinamento Mental - Tecnologia de Ondas Cerebrais - Brainwave                                                              18
Mejora tu salud, belleza y personalidad acorde con tu signo zodiacal.                                                       12
Practice Test with Explanations  -Reading Skills, Examination Skills, Tips and Guidance                                     11
                                                                                                                            ..
Crea estrategias en Pinescript  de la plataforma Trading View                                                                1
Uso de flash em eventos                                                                               

In [None]:
# Download stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to clean text
def clean_text(text):
    # Remove special characters (keeping only alphanumeric and spaces)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Remove stopwords
    text = ' '.join(word for word in text.split() if word.lower() not in stop_words)
    return text

courses_data['title_clean'] = courses_data['title'].apply(clean_text)

courses_data['headline'] = courses_data['headline'].astype(str)
courses_data['headline_clean'] = courses_data['headline'].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\okafo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
courses_index = pd.Series(courses_data.index, index=courses_data['title']).drop_duplicates()

In [6]:
# Vectorize course titles and headlines
model = SentenceTransformer('all-MiniLM-L6-v2')

title_count_matrix = model.encode(courses_data['title_clean'].tolist())
headline_count_matrix = model.encode(courses_data['headline_clean'].tolist())

In [18]:
# Assign weights to title and headline similarities to give title more importance
def search_courses(query, model, top_n=5, title_weight=0.7, headline_weight=0.3):
    query_vector = model.encode([query])

    title_sim_scores = cosine_similarity(query_vector, title_count_matrix)[0]
    headline_sim_scores = cosine_similarity(query_vector, headline_count_matrix)[0]

    # Combine similarities of title and headline
    combined_scores = (title_weight * title_sim_scores) + (headline_weight * headline_sim_scores)

    similar_indices = np.argsort(combined_scores)[::-1][:top_n]

    print(f"Recommendations done for: {query}\n")
    
    result_table = courses_data.iloc[similar_indices][['title', 'headline']]

    return result_table



def search_courses_by_category(query, model, category=None, sub_category=None, top_n=5, title_weight=0.6, headline_weight=0.4):
    query_vector = model.encode([query])

    filtered_courses = courses_data
    if category:
        filtered_courses = filtered_courses[filtered_courses['category'] == category]
    if sub_category:
        filtered_courses = filtered_courses[filtered_courses['subcategory'] == sub_category]

    if filtered_courses.empty:
        return pd.DataFrame(columns=['title', 'headline'])

    title_matrix = model.encode(filtered_courses['title_clean'].tolist())
    headline_matrix = model.encode(filtered_courses['headline_clean'].tolist())

    title_sim = cosine_similarity(query_vector, title_matrix)[0]
    headline_sim = cosine_similarity(query_vector, headline_matrix)[0]

    combined_score = title_weight * title_sim + headline_weight * headline_sim
    top_indices = np.argsort(combined_score)[::-1][:top_n]

    print("Recommendations done for", query)
    if category:
        print("Category:", category)
    if sub_category:
        print("Sub Category:", sub_category)

    return filtered_courses.iloc[top_indices][['title', 'headline', 'category', 'subcategory']]

In [26]:
# Example: Search for courses related to "Gift Making"
search_courses("Gift Making", model, top_n=3)

Recommendations done for: Gift Making



Unnamed: 0,title,headline
8150,Gift Basket Making in 10 Simple Steps,"I'm Denise Riley ""The Gift Basket Lady"". I wil..."
3186,How to Make an Exploding Gift Box,A step by step guide to creating the perfect g...
40688,Design a Gift Box in Fusion 360,Sharpen your Fusion 360 skills while modeling ...


In [27]:
# Example: Search for courses related to "Machine Learning"
search_courses("Machine Learning Basics", model, top_n=3)

Recommendations done for: Machine Learning Basics



Unnamed: 0,title,headline
40678,Machine Learning : A Beginner's Basic Introduc...,Learn Machine Learning Basics with a Practica...
34829,Machine Learning for beginners,Azure Machine Learning
27172,Machine Learning with Python,All about Machine learning


In [38]:
# Example: Search for courses related to "Excel" without category filter
search_courses("Excel for Experts", model, top_n=3)

Recommendations done for: Excel for Experts



Unnamed: 0,title,headline
49440,エクセルで家計簿を作ろう！~初級編~　簡単な操作だけで家計簿が作れちゃう！Excel初心者、...,簡単な関数だけを使ってExcelで家計簿を作成していきましょう！エクセルが初めてという方やこ...
34833,Хитрости работы с EXCEL,Лайфхаки начинающим пользователям EXCEL для бо...
14994,まるで手品！集計に最適！Excelのピボットテーブル活用方法！,Excelで集計するならピボットテーブル！大量のデータも一発で集計！数秒のマウス操作で、管理...


In [37]:
# Example: Search for courses related to "Excel" with category filter
search_courses_by_category("Excel for Experts", model, category='Office Productivity', top_n=3)

Recommendations done for Excel for Experts
Category: Office Productivity


Unnamed: 0,title,headline,category,subcategory
14994,まるで手品！集計に最適！Excelのピボットテーブル活用方法！,Excelで集計するならピボットテーブル！大量のデータも一発で集計！数秒のマウス操作で、管理...,Office Productivity,Microsoft
26225,Excel in Microsoft Excel 2: Intermediate to Ex...,Get Expert certified! Levels 6-10 for Microsof...,Office Productivity,Microsoft
37435,Microsoft Excel - Продвинутый пользователь Excel,"Станьте мастером Excel, изучая продвинутые воз...",Office Productivity,Microsoft
