In [1]:
import pandas as pd
import numpy as np
import neattext.functions as nfx
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../data/udemy_course_data.csv')
df.head()

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit,published_date,published_time,year,month,day
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,2017-01-18T20:58:58Z,Business Finance,429400,2017-01-18,20:58:58Z,2017,1,18
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,2017-03-09T16:34:20Z,Business Finance,209400,2017-03-09,16:34:20Z,2017,3,9
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5 hours,2016-12-19T19:26:30Z,Business Finance,97830,2016-12-19,19:26:30Z,2016,12,19
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3 hours,2017-05-30T20:07:24Z,Business Finance,232845,2017-05-30,20:07:24Z,2017,5,30
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2 hours,2016-12-13T14:57:18Z,Business Finance,255200,2016-12-13,14:57:18Z,2016,12,13


# Remove stop words and special characters

In [3]:
df['Clean_Title'] = df['course_title'].apply(nfx.remove_stopwords)
df['Clean_Title'] = df['course_title'].apply(nfx.remove_special_characters)
df.head()

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit,published_date,published_time,year,month,day,Clean_Title
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,2017-01-18T20:58:58Z,Business Finance,429400,2017-01-18,20:58:58Z,2017,1,18,Ultimate Investment Banking Course
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,2017-03-09T16:34:20Z,Business Finance,209400,2017-03-09,16:34:20Z,2017,3,9,Complete GST Course Certification Grow Your ...
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5 hours,2016-12-19T19:26:30Z,Business Finance,97830,2016-12-19,19:26:30Z,2016,12,19,Financial Modeling for Business Analysts and C...
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3 hours,2017-05-30T20:07:24Z,Business Finance,232845,2017-05-30,20:07:24Z,2017,5,30,Beginner to Pro Financial Analysis in Excel 2017
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2 hours,2016-12-13T14:57:18Z,Business Finance,255200,2016-12-13,14:57:18Z,2016,12,13,How To Maximize Your Profits Trading Options


# Vectorise the clean title

In [4]:
countVect = CountVectorizer()
cvmat = countVect.fit_transform(df['Clean_Title'])
cvmat

<3683x3680 sparse matrix of type '<class 'numpy.int64'>'
	with 23448 stored elements in Compressed Sparse Row format>

# Cosin Similarity

In [5]:
cos_sim = cosine_similarity(cvmat)
cos_sim.shape

(3683, 3683)

In [6]:
cos_sim.shape

(3683, 3683)

# Recommand a Course

In [11]:
exampleTitle = 'Visualizing Data'

In [12]:
course_index = pd.Series(df.index, index=df['Clean_Title']).drop_duplicates()
index = course_index[exampleTitle]

# index, cos_sim
scores = list(enumerate(cos_sim[index]))
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True) # first item the course itself
recommanded_course_index = [x[0] for x in sorted_scores[1:]]
recommanded_course_score = [x[1] for x in sorted_scores[1:]]

recommanded_df = df.loc[recommanded_course_index]
recommanded_df['Similarity_Score'] = recommanded_course_score

In [9]:
df.to_csv('..\data\clean_title_udemy_course_data.csv', index=False)