<a href="https://colab.research.google.com/github/nv-hiep/Hoola_RecommendationSystem/blob/main/Recommendation_System_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

A content-based course recommendation system built on the cosine similarity matrix of course titles.

# 1. Libraries

In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= '0.20'

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    IS_COLAB = True
except Exception:
    IS_COLAB = False

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= '2.0'

if not tf.config.list_physical_devices('GPU'):
    print('No GPU was detected. LSTMs and CNNs can be very slow without a GPU.')
    if IS_COLAB:
        print('Go to Runtime > Change runtime and select a GPU hardware accelerator.')

# Common imports
import os
import shutil
import itertools
import glob
import numpy as np
import matplotlib.image as mpimg
import pandas as pd

import string
import pickle

from tqdm import tqdm
from time import time
from PIL import Image


# Seed the global NumPy and TensorFlow random generators so that any random
# draws made later in the notebook are repeatable across runs.
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# mpl.rc('axes', labelsize=14)
# mpl.rc('xtick', labelsize=12)
# mpl.rc('ytick', labelsize=12)

In [None]:
# For text mining
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity,linear_kernel

# 2. Connect to Google Drive

In [None]:
# Check whether an NVIDIA GPU is attached to this runtime: `nvidia-smi` is run
# through the shell and prints the driver/CUDA versions and the visible GPU(s).
!nvidia-smi

Wed Aug 25 18:17:27 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8    28W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive')
# !ls

In [None]:
# %cd /content/gdrive/My Drive/
# !ls

In [None]:
# Print the shell's current working directory.
! pwd

/content


# 3. Paths to Data Directories

In [None]:
# Directory that holds the input CSV; the data-loading cell joins the file name
# onto it.
# NOTE(review): this is an absolute, Colab-specific path — presumably the CSV
# has to be uploaded there before running; confirm or make it configurable.
DATA_PATH = '/content/sample_data/'

# 4. Read data

In [None]:
# Load the cleaned course table from DATA_PATH and preview its first 4 rows.
data_file = os.path.join(DATA_PATH, "courses_cleaned_data.csv")
df = pd.read_csv(data_file)
df.head(4)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,duration,duration_unit,published_timestamp,subject,year,cleaned_title
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,90,mins,2017-01-18T20:58:58Z,Business Finance,2017,ultimate investment banking course
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,2340,mins,2017-03-09T16:34:20Z,Business Finance,2017,complete gst course certification grow ca p...
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,150,mins,2016-12-19T19:26:30Z,Business Finance,2016,financial modeling business analysts consult...
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,180,mins,2017-05-30T20:07:24Z,Business Finance,2017,beginner pro financial analysis excel


In [None]:
# List the column names of the course table.
df.columns

Index(['course_id', 'course_title', 'url', 'is_paid', 'price',
       'num_subscribers', 'num_reviews', 'num_lectures', 'level', 'duration',
       'duration_unit', 'published_timestamp', 'subject', 'year',
       'cleaned_title'],
      dtype='object')

In [None]:
# (number of rows, number of columns) of the course table.
df.shape

(3682, 15)

# 5. Text cosine similarity

In [None]:
# Vectorize the titles: learn the vocabulary of `cleaned_title` and encode each
# title as a row of token counts (one row per course, one column per term).
count_vect = CountVectorizer()
count_vect_matrix = count_vect.fit_transform(df['cleaned_title'])

In [None]:
# (number of titles, vocabulary size)
count_vect_matrix.shape

(3682, 3556)

In [None]:
# Display the matrix summary (storage format, dtype, number of stored elements).
count_vect_matrix

<3682x3556 sparse matrix of type '<class 'numpy.int64'>'
	with 18578 stored elements in Compressed Sparse Row format>

In [None]:
# Compute the pairwise cosine similarity between all title count-vectors.
# Entry [i, j] is the similarity between title i and title j, so the result is
# a square array with one row and one column per course.
cosine_sim_matrix = cosine_similarity(count_vect_matrix)
print("Shape of cosine_sim_matrix: ", cosine_sim_matrix.shape)
cosine_sim_matrix

Shape of cosine_sim_matrix:  (3682, 3682)


array([[1.        , 0.18898224, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.18898224, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.37796447],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.37796447, 0.        ,
        1.        ]])

# 6. Case of the first course (row_id = 0)

In [None]:
# Select the first course (positional row 0) and show its full record.
row_id = 0
df.iloc[row_id]

course_id                                                        1070968
course_title                          Ultimate Investment Banking Course
url                    https://www.udemy.com/ultimate-investment-bank...
is_paid                                                             True
price                                                                200
num_subscribers                                                     2147
num_reviews                                                           23
num_lectures                                                          51
level                                                         All Levels
duration                                                              90
duration_unit                                                       mins
published_timestamp                                 2017-01-18T20:58:58Z
subject                                                 Business Finance
year                                               

In [None]:
# The first course again, shown as a one-row table.
df.head(1)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,duration,duration_unit,published_timestamp,subject,year,cleaned_title
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,90,mins,2017-01-18T20:58:58Z,Business Finance,2017,ultimate investment banking course


In [None]:
# Row `row_id` of the similarity matrix: the similarity of the selected course
# to every course in the table (including itself).
cosine_similarity_scores = cosine_sim_matrix[row_id]
cosine_similarity_scores

array([1.        , 0.18898224, 0.        , ..., 0.        , 0.        ,
       0.        ])

The first element of `cosine_sim_matrix[row_id]` (with `row_id = 0`) is 1 because the first title is being compared with itself.

In [None]:
# Pair every score with the row position of its course: (row_index, score).
# The list is built straight from the similarity matrix instead of overwriting
# `cosine_similarity_scores` with a transformed copy of itself, so re-running
# this cell always produces the same flat list of pairs.
cosine_similarity_scores = list(enumerate(cosine_sim_matrix[row_id]))
cosine_similarity_scores[:5]

[(0, 1.0), (1, 0.1889822365046136), (2, 0.0), (3, 0.0), (4, 0.0)]

In [None]:
# Sort the (row_index, score) pairs, most similar course first
sorted_scores = sorted(cosine_similarity_scores, key=lambda x:x[1], reverse=True)

# Remove the selected course itself. It is dropped by its row index rather than
# by assuming it sorted first: a title with an all-zero vector scores 0 against
# itself, and an identical title stored earlier in the table would tie at 1.
sorted_scores = [pair for pair in sorted_scores if pair[0] != row_id]
sorted_scores[:10]

[(39, 0.75),
 (3478, 0.5773502691896258),
 (242, 0.5),
 (419, 0.5),
 (2718, 0.5),
 (2806, 0.5),
 (659, 0.4472135954999579),
 (1072, 0.4472135954999579),
 (1214, 0.4472135954999579),
 (2647, 0.4472135954999579)]

In [None]:
# Row indices of all the other courses, ordered from most to least similar
other_course_indices = [course_idx for course_idx, _score in sorted_scores]

# Summary of the selected course and the size of the candidate list
print("Title of the 1st course: ", df.loc[row_id, 'course_title'])
print()
print("Total number of course titles N = ", len(df))
print("Length of the other_course_indices = (N - 1): ", len(other_course_indices))

# Row indices of the five best matches
print()
print("Id of 05 most similar courses: ", other_course_indices[:5])

# Titles of the five best matches
print()
print("Titles of the 05 most similar courses: ")
print(df['course_title'].iloc[other_course_indices[:5]])

Title of the 1st course:  Ultimate Investment Banking Course

Total number of course titles N =  3682
Length of the other_course_indices = (N - 1):  3681

Id of 05 most similar courses:  [39, 3478, 242, 419, 2718]

Titles of the 05 most similar courses: 
39      The Complete Investment Banking Course 2017
3478                     The Ultimate jQuery Course
242      Advanced Accounting for Investment Banking
419       The Investment Banking Recruitment Series
2718            The Ultimate Web Development Course
Name: course_title, dtype: object


In [None]:
# Titles and scores of all the other courses, both in the best-first order of
# `sorted_scores`
recommended_courses = df.iloc[other_course_indices]['course_title']
recommended_course_scores = [score for _idx, score in sorted_scores]

In [None]:
# One row per candidate course: its title next to its similarity score.
# The row labels come from `recommended_courses`, i.e. the original row labels.
recommendation_df = pd.DataFrame(dict(
    course_title=recommended_courses,
    cosine_similarity_scores=recommended_course_scores,
))
recommendation_df

Unnamed: 0,course_title,cosine_similarity_scores
39,The Complete Investment Banking Course 2017,0.75000
3478,The Ultimate jQuery Course,0.57735
242,Advanced Accounting for Investment Banking,0.50000
419,The Investment Banking Recruitment Series,0.50000
2718,The Ultimate Web Development Course,0.50000
...,...,...
3677,Learn jQuery from Scratch - Master of JavaScri...,0.00000
3678,How To Design A WordPress Website With No Codi...,0.00000
3679,Learn and Build using Polymer,0.00000
3680,CSS Animations: Create Amazing Effects on Your...,0.00000


In [None]:
# Number of courses to recommend
n_recommendations = 10

In [None]:
# Keep only the top `n_recommendations` rows (the frame is already ordered by
# descending similarity score).
recommendation_df.head(n_recommendations)

Unnamed: 0,course_title,cosine_similarity_scores
39,The Complete Investment Banking Course 2017,0.75
3478,The Ultimate jQuery Course,0.57735
242,Advanced Accounting for Investment Banking,0.5
419,The Investment Banking Recruitment Series,0.5
2718,The Ultimate Web Development Course,0.5
2806,Ultimate WordPress Plugin Course,0.5
659,Financial Accounting - The Ultimate Beginner C...,0.447214
1072,Managerial Accounting - The Ultimate Beginner ...,0.447214
1214,The Ultimate Drawing Course - Beginner to Adva...,0.447214
2647,The Ultimate Vue JS 2 Developers Course,0.447214


# 7. Putting it all together

In [None]:
def recommend_courses(df, title, cosine_similarity_scores, n_recommendations = 10, verbose = False):
  """
  Recommend the courses whose titles are most similar to a given course.

  Parameters
  ----------
  df : pandas.DataFrame
      Course table with a 'course_title' column. The i-th row of `df`
      (by position) corresponds to row/column i of the similarity matrix.
  title : str
      Exact 'course_title' of the course to find recommendations for.
      If several rows share this title, the first of them is used.
  cosine_similarity_scores : array-like of shape (len(df), len(df))
      Pairwise similarity matrix of the courses. For backward compatibility
      with calls that passed something else in this position, an argument
      that is not a square len(df) x len(df) numeric array is ignored and the
      notebook-level `cosine_sim_matrix` is used instead.
  n_recommendations : int, default 10
      Maximum number of courses to return.
  verbose : bool, default False
      If True, print the record of the selected course first.

  Returns
  -------
  pandas.DataFrame
      Columns 'course_title' and 'cosine_similarity_scores', ordered by
      descending score (ties keep table order), labelled with the original
      row labels of `df`. The selected course itself is never included.

  Raises
  ------
  KeyError
      If `title` does not occur in df['course_title'].
  """
  titles = df['course_title']

  # Title -> positional row number. Positions (not index labels) are stored
  # because they are used with `iloc` and to address the similarity matrix.
  # Only the first row of each duplicated title is kept, so a lookup always
  # yields a single position.
  positions = pd.Series(np.arange(len(df)), index=titles)
  title_indx_df = positions[~positions.index.duplicated(keep='first')]

  if title not in title_indx_df.index:
    raise KeyError("Course title not found: {!r}".format(title))
  course_idx = int(title_indx_df[title])

  # Details of the course in consideration
  if verbose :
    print("Details of the course (id = ", course_idx, "):")
    print(df.iloc[course_idx])
    print()

  # Use the matrix that was passed in when it has the expected shape;
  # otherwise fall back to the notebook-level matrix (legacy behaviour).
  n_courses = len(df)
  try:
    sim_matrix = np.asarray(cosine_similarity_scores, dtype=float)
  except (TypeError, ValueError):
    sim_matrix = None
  if sim_matrix is None or sim_matrix.shape != (n_courses, n_courses):
    sim_matrix = cosine_sim_matrix

  # Similarity of the selected course to every course
  row = np.asarray(sim_matrix[course_idx], dtype=float)

  # Descending by score; the stable sort keeps table order among ties
  order = np.argsort(-row, kind='stable')

  # Drop the course itself by position. It is not guaranteed to sort first:
  # an all-zero title vector scores 0 against itself, and an identical title
  # stored earlier in the table ties at 1.
  order = order[order != course_idx]

  # Keep the n_recommendations best matches
  top = order[:n_recommendations]

  recommendation_df = pd.DataFrame({
      "course_title" : titles.iloc[top],
      "cosine_similarity_scores" : row[top]
  })

  return recommendation_df

In [None]:
# Course to get recommendations for. `course_title` was not defined anywhere
# above, so the call only worked with leftover kernel state; row 3677 is the
# course whose details are shown in this cell's output.
course_title = df['course_title'].iloc[3677]

# Pass the full similarity matrix (not the per-course list of scores built for
# the first course earlier in the notebook).
recommend_courses(df, course_title, cosine_sim_matrix, n_recommendations = 10, verbose = True)

Details of the course (id =  3677 ):
course_id                                                         775618
course_title           Learn jQuery from Scratch - Master of JavaScri...
url                    https://www.udemy.com/easy-jquery-for-beginner...
is_paid                                                             True
price                                                                100
num_subscribers                                                     1040
num_reviews                                                           14
num_lectures                                                          21
level                                                         All Levels
duration                                                             120
duration_unit                                                       mins
published_timestamp                                 2016-06-14T17:36:46Z
subject                                                  Web Development
year          

Unnamed: 0,course_title,cosine_similarity_scores
2623,Learn Javascript & JQuery From Scratch,0.816497
2774,Learn JavaScript from scratch,0.707107
2630,JavaScript For Beginners : Learn JavaScript Fr...,0.617213
3495,JQuery Basics - Learn JQuery From Scratch,0.617213
3001,Learn Jquery from Scratch 2-Hour Training,0.547723
3533,Master Riot: Learn Riot.js from Scratch,0.547723
2824,Learn React : The World's Most Lucrative JavaS...,0.5
3381,Master EmberJS : Learn Ember JS From Scratch,0.5
615,Learn Bookkeeping From Scratch,0.471405
2494,Learn JavaScript for beginners,0.471405
