In [44]:
import numpy as np
import pandas as pd
import neattext.functions as nfx 

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [45]:
df = pd.read_csv('udemy_course_data.csv')
df.head()

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit,published_date,published_time,year,month,day
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,2017-01-18T20:58:58Z,Business Finance,429400,2017-01-18,20:58:58Z,2017,1,18
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,2017-03-09T16:34:20Z,Business Finance,209400,2017-03-09,16:34:20Z,2017,3,9
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5 hours,2016-12-19T19:26:30Z,Business Finance,97830,2016-12-19,19:26:30Z,2016,12,19
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3 hours,2017-05-30T20:07:24Z,Business Finance,232845,2017-05-30,20:07:24Z,2017,5,30
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2 hours,2016-12-13T14:57:18Z,Business Finance,255200,2016-12-13,14:57:18Z,2016,12,13


In [46]:
df['clean_title'] = df['course_title'].apply(nfx.remove_stopwords)
df['clean_title'] = df['clean_title'].apply(nfx.remove_special_characters)

df.head()

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit,published_date,published_time,year,month,day,clean_title
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,2017-01-18T20:58:58Z,Business Finance,429400,2017-01-18,20:58:58Z,2017,1,18,Ultimate Investment Banking Course
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,2017-03-09T16:34:20Z,Business Finance,209400,2017-03-09,16:34:20Z,2017,3,9,Complete GST Course Certification Grow Practice
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5 hours,2016-12-19T19:26:30Z,Business Finance,97830,2016-12-19,19:26:30Z,2016,12,19,Financial Modeling Business Analysts Consultants
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3 hours,2017-05-30T20:07:24Z,Business Finance,232845,2017-05-30,20:07:24Z,2017,5,30,Beginner Pro Financial Analysis Excel 2017
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2 hours,2016-12-13T14:57:18Z,Business Finance,255200,2016-12-13,14:57:18Z,2016,12,13,Maximize Profits Trading Options


Vectorise the Clean Title

In [47]:
countvect = CountVectorizer()
cvmat = countvect.fit_transform(df['clean_title'])
cvmat

<3683x3564 sparse matrix of type '<class 'numpy.int64'>'
	with 18364 stored elements in Compressed Sparse Row format>

Cosine Similary

In [48]:
cossim = cosine_similarity(cvmat)
cossim

array([[1.        , 0.20412415, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.20412415, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.23570226],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.23570226, 0.        ,
        1.        ]])

In [49]:
cossim.shape

(3683, 3683)

Recommend Course

In [50]:
course_index = pd.Series(df.index, index=df['course_title']).drop_duplicates()
course_index

course_title
Ultimate Investment Banking Course                                0
Complete GST Course & Certification - Grow Your CA Practice       1
Financial Modeling for Business Analysts and Consultants          2
Beginner to Pro - Financial Analysis in Excel 2017                3
How To Maximize Your Profits Trading Options                      4
                                                               ... 
Learn jQuery from Scratch - Master of JavaScript library       3678
How To Design A WordPress Website With No Coding At All        3679
Learn and Build using Polymer                                  3680
CSS Animations: Create Amazing Effects on Your Website         3681
Using MODX CMS to Build Websites: A Beginner's Guide           3682
Length: 3683, dtype: int64

In [51]:
test = df[df['course_title'].str.contains('Profit')]
test.head()

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit,published_date,published_time,year,month,day,clean_title
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2 hours,2016-12-13T14:57:18Z,Business Finance,255200,2016-12-13,14:57:18Z,2016,12,13,Maximize Profits Trading Options
8,476268,Options Trading 3 : Advanced Stock Profit and ...,https://www.udemy.com/day-trading-stock-option...,True,195,5172,34,38,Expert Level,2.5 hours,2015-05-28T00:14:03Z,Business Finance,1008540,2015-05-28,00:14:03Z,2015,5,28,Options Trading 3 Advanced Stock Profit Succe...
18,606928,7 Deadly Mistakes of Investing that Will Slash...,https://www.udemy.com/7-deadly-mistakes-of-inv...,True,50,5354,24,23,All Levels,1.5 hours,2015-09-21T18:10:34Z,Business Finance,267700,2015-09-21,18:10:34Z,2015,9,21,7 Deadly Mistakes Investing Slash Profits
51,325834,Learn to Trade for Profit: Find and Trade Winn...,https://www.udemy.com/trading-stocks-successfu...,True,95,10605,71,77,All Levels,3 hours,2014-10-24T18:13:49Z,Business Finance,1007475,2014-10-24,18:13:49Z,2014,10,24,Learn Trade Profit Find Trade Winning Stocks
74,285638,Learn to Trade for Profit:Trading with Japanes...,https://www.udemy.com/introduction-to-japanese...,True,60,16385,273,45,Beginner Level,3 hours,2014-09-02T03:53:13Z,Business Finance,983100,2014-09-02,03:53:13Z,2014,9,2,Learn Trade ProfitTrading Japanese Candlesticks


In [52]:
top6 = test.sort_values(by='num_subscribers', ascending=False).head(6)
top6

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit,published_date,published_time,year,month,day,clean_title
74,285638,Learn to Trade for Profit:Trading with Japanes...,https://www.udemy.com/introduction-to-japanese...,True,60,16385,273,45,Beginner Level,3 hours,2014-09-02T03:53:13Z,Business Finance,983100,2014-09-02,03:53:13Z,2014,9,2,Learn Trade ProfitTrading Japanese Candlesticks
51,325834,Learn to Trade for Profit: Find and Trade Winn...,https://www.udemy.com/trading-stocks-successfu...,True,95,10605,71,77,All Levels,3 hours,2014-10-24T18:13:49Z,Business Finance,1007475,2014-10-24,18:13:49Z,2014,10,24,Learn Trade Profit Find Trade Winning Stocks
154,377370,Learn to Trade the Stock Market without Blowin...,https://www.udemy.com/learn-to-trade-the-stock...,True,60,6967,16,34,All Levels,1 hour,2017-05-02T20:53:24Z,Business Finance,418020,2017-05-02,20:53:24Z,2017,5,2,Learn Trade Stock Market Blowing Profits
2689,309370,Bootstrap 3 -> Profitable WordPress Theme Deve...,https://www.udemy.com/bootstrap-wordpress/,True,40,5547,80,40,All Levels,3.5 hours,2014-10-22T17:39:21Z,Web Development,221880,2014-10-22,17:39:21Z,2014,10,22,Bootstrap 3 Profitable WordPress Theme Develo...
18,606928,7 Deadly Mistakes of Investing that Will Slash...,https://www.udemy.com/7-deadly-mistakes-of-inv...,True,50,5354,24,23,All Levels,1.5 hours,2015-09-21T18:10:34Z,Business Finance,267700,2015-09-21,18:10:34Z,2015,9,21,7 Deadly Mistakes Investing Slash Profits
8,476268,Options Trading 3 : Advanced Stock Profit and ...,https://www.udemy.com/day-trading-stock-option...,True,195,5172,34,38,Expert Level,2.5 hours,2015-05-28T00:14:03Z,Business Finance,1008540,2015-05-28,00:14:03Z,2015,5,28,Options Trading 3 Advanced Stock Profit Succe...


In [53]:
index = course_index['Excel functions to analyze and visualize data']

scores = list(enumerate(cossim[index]))
scores

[(0, 0.0),
 (1, 0.0),
 (2, 0.0),
 (3, 0.18257418583505539),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.0),
 (18, 0.0),
 (19, 0.0),
 (20, 0.0),
 (21, 0.0),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.0),
 (33, 0.0),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.0),
 (38, 0.18257418583505539),
 (39, 0.0),
 (40, 0.0),
 (41, 0.18257418583505539),
 (42, 0.0),
 (43, 0.0),
 (44, 0.0),
 (45, 0.0),
 (46, 0.0),
 (47, 0.0),
 (48, 0.0),
 (49, 0.0),
 (50, 0.0),
 (51, 0.0),
 (52, 0.0),
 (53, 0.0),
 (54, 0.0),
 (55, 0.0),
 (56, 0.0),
 (57, 0.0),
 (58, 0.0),
 (59, 0.0),
 (60, 0.2981423969999719),
 (61, 0.0),
 (62, 0.0),
 (63, 0.0),
 (64, 0.0),
 (65, 0.1690308509457033),
 (66, 0.0),
 (67, 0.0),
 (68, 0.0),
 (69, 0.0),
 (70, 0.0),
 (71, 0.0),
 (72, 0.0),
 (73, 0.0),
 (74, 0.0),
 (75, 0.0),
 (76, 0.0),
 (77, 0.

In [54]:
sorted_score = sorted(scores,key = lambda x:x[1],reverse=True)
sorted_score

[(131, 0.9999999999999999),
 (265, 0.5163977794943223),
 (1099, 0.4472135954999579),
 (731, 0.36514837167011077),
 (3581, 0.36514837167011077),
 (480, 0.3162277660168379),
 (60, 0.2981423969999719),
 (237, 0.282842712474619),
 (741, 0.25819888974716115),
 (132, 0.22360679774997896),
 (545, 0.22360679774997896),
 (728, 0.22360679774997896),
 (773, 0.22360679774997896),
 (831, 0.22360679774997896),
 (938, 0.22360679774997896),
 (746, 0.19999999999999998),
 (1069, 0.19999999999999998),
 (2537, 0.19999999999999998),
 (2661, 0.19999999999999998),
 (3242, 0.19999999999999998),
 (3, 0.18257418583505539),
 (38, 0.18257418583505539),
 (41, 0.18257418583505539),
 (270, 0.18257418583505539),
 (321, 0.18257418583505539),
 (809, 0.18257418583505539),
 (912, 0.18257418583505539),
 (1084, 0.18257418583505539),
 (2614, 0.18257418583505539),
 (2880, 0.18257418583505539),
 (3277, 0.18257418583505539),
 (3355, 0.18257418583505539),
 (3356, 0.18257418583505539),
 (3426, 0.18257418583505539),
 (65, 0.16903

In [55]:
sorted_indices = [i[0] for i in sorted_score[1:]]
sorted_values = [i[1] for i in sorted_score[1:]]

sorted_values

[0.5163977794943223,
 0.4472135954999579,
 0.36514837167011077,
 0.36514837167011077,
 0.3162277660168379,
 0.2981423969999719,
 0.282842712474619,
 0.25819888974716115,
 0.22360679774997896,
 0.22360679774997896,
 0.22360679774997896,
 0.22360679774997896,
 0.22360679774997896,
 0.22360679774997896,
 0.19999999999999998,
 0.19999999999999998,
 0.19999999999999998,
 0.19999999999999998,
 0.19999999999999998,
 0.18257418583505539,
 0.18257418583505539,
 0.18257418583505539,
 0.18257418583505539,
 0.18257418583505539,
 0.18257418583505539,
 0.18257418583505539,
 0.18257418583505539,
 0.18257418583505539,
 0.18257418583505539,
 0.18257418583505539,
 0.18257418583505539,
 0.18257418583505539,
 0.18257418583505539,
 0.1690308509457033,
 0.1690308509457033,
 0.1690308509457033,
 0.1690308509457033,
 0.1690308509457033,
 0.15811388300841894,
 0.14907119849998596,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,

In [56]:
recommended_df = df.iloc[sorted_indices]
recommended_df

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit,published_date,published_time,year,month,day,clean_title
265,722682,Advanced Excel functions,https://www.udemy.com/advanced-excel-functions...,True,20,1964,47,15,All Levels,1.5 hours,2016-01-12T20:03:45Z,Business Finance,39280,2016-01-12,20:03:45Z,2016,1,12,Advanced Excel functions
1099,1206414,Excelを使ってビジネスシミュレーション：基礎編,https://www.udemy.com/excel-zw/,True,85,22,5,47,Beginner Level,5 hours,2017-05-18T22:31:23Z,Business Finance,1870,2017-05-18,22:31:23Z,2017,5,18,Excel
731,760990,Excel 4 Accounting & Bookkeeping - Master Look...,https://www.udemy.com/excel-4-accountants-book...,True,35,2638,140,12,All Levels,1 hour,2016-02-13T03:19:09Z,Business Finance,92330,2016-02-13,03:19:09Z,2016,2,13,Excel 4 Accounting Bookkeeping Master Lookup...
3581,1250934,Display and analyze GIS data on the web with L...,https://www.udemy.com/display-and-analyze-gis-...,True,100,25,4,66,Intermediate Level,12.5 hours,2017-06-15T21:17:43Z,Web Development,2500,2017-06-15,21:17:43Z,2017,6,15,Display analyze GIS data web Leafletjs
480,892446,Visualizing Data,https://www.udemy.com/visualizing-data/,True,40,149,2,63,Intermediate Level,6.5 hours,2016-07-22T13:09:03Z,Business Finance,5960,2016-07-22,13:09:03Z,2016,7,22,Visualizing Data
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3678,775618,Learn jQuery from Scratch - Master of JavaScri...,https://www.udemy.com/easy-jquery-for-beginner...,True,100,1040,14,21,All Levels,2 hours,2016-06-14T17:36:46Z,Web Development,104000,2016-06-14,17:36:46Z,2016,6,14,Learn jQuery Scratch Master JavaScript library
3679,1088178,How To Design A WordPress Website With No Codi...,https://www.udemy.com/how-to-make-a-wordpress-...,True,25,306,3,42,Beginner Level,3.5 hours,2017-03-10T22:24:30Z,Web Development,7650,2017-03-10,22:24:30Z,2017,3,10,Design WordPress Website Coding
3680,635248,Learn and Build using Polymer,https://www.udemy.com/learn-and-build-using-po...,True,40,513,169,48,All Levels,3.5 hours,2015-12-30T16:41:42Z,Web Development,20520,2015-12-30,16:41:42Z,2015,12,30,Learn Build Polymer
3681,905096,CSS Animations: Create Amazing Effects on Your...,https://www.udemy.com/css-animations-create-am...,True,50,300,31,38,All Levels,3 hours,2016-08-11T19:06:15Z,Web Development,15000,2016-08-11,19:06:15Z,2016,8,11,CSS Animations Create Amazing Effects Website


In [57]:
recommended_df['similarity_score'] = np.array(sorted_values)
recommended_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommended_df['similarity_score'] = np.array(sorted_values)


Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit,published_date,published_time,year,month,day,clean_title,similarity_score
265,722682,Advanced Excel functions,https://www.udemy.com/advanced-excel-functions...,True,20,1964,47,15,All Levels,1.5 hours,2016-01-12T20:03:45Z,Business Finance,39280,2016-01-12,20:03:45Z,2016,1,12,Advanced Excel functions,0.516398
1099,1206414,Excelを使ってビジネスシミュレーション：基礎編,https://www.udemy.com/excel-zw/,True,85,22,5,47,Beginner Level,5 hours,2017-05-18T22:31:23Z,Business Finance,1870,2017-05-18,22:31:23Z,2017,5,18,Excel,0.447214
731,760990,Excel 4 Accounting & Bookkeeping - Master Look...,https://www.udemy.com/excel-4-accountants-book...,True,35,2638,140,12,All Levels,1 hour,2016-02-13T03:19:09Z,Business Finance,92330,2016-02-13,03:19:09Z,2016,2,13,Excel 4 Accounting Bookkeeping Master Lookup...,0.365148
3581,1250934,Display and analyze GIS data on the web with L...,https://www.udemy.com/display-and-analyze-gis-...,True,100,25,4,66,Intermediate Level,12.5 hours,2017-06-15T21:17:43Z,Web Development,2500,2017-06-15,21:17:43Z,2017,6,15,Display analyze GIS data web Leafletjs,0.365148
480,892446,Visualizing Data,https://www.udemy.com/visualizing-data/,True,40,149,2,63,Intermediate Level,6.5 hours,2016-07-22T13:09:03Z,Business Finance,5960,2016-07-22,13:09:03Z,2016,7,22,Visualizing Data,0.316228
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3678,775618,Learn jQuery from Scratch - Master of JavaScri...,https://www.udemy.com/easy-jquery-for-beginner...,True,100,1040,14,21,All Levels,2 hours,2016-06-14T17:36:46Z,Web Development,104000,2016-06-14,17:36:46Z,2016,6,14,Learn jQuery Scratch Master JavaScript library,0.000000
3679,1088178,How To Design A WordPress Website With No Codi...,https://www.udemy.com/how-to-make-a-wordpress-...,True,25,306,3,42,Beginner Level,3.5 hours,2017-03-10T22:24:30Z,Web Development,7650,2017-03-10,22:24:30Z,2017,3,10,Design WordPress Website Coding,0.000000
3680,635248,Learn and Build using Polymer,https://www.udemy.com/learn-and-build-using-po...,True,40,513,169,48,All Levels,3.5 hours,2015-12-30T16:41:42Z,Web Development,20520,2015-12-30,16:41:42Z,2015,12,30,Learn Build Polymer,0.000000
3681,905096,CSS Animations: Create Amazing Effects on Your...,https://www.udemy.com/css-animations-create-am...,True,50,300,31,38,All Levels,3 hours,2016-08-11T19:06:15Z,Web Development,15000,2016-08-11,19:06:15Z,2016,8,11,CSS Animations Create Amazing Effects Website,0.000000


In [58]:
usedf = recommended_df[['clean_title','similarity_score']]
usedf

Unnamed: 0,clean_title,similarity_score
265,Advanced Excel functions,0.516398
1099,Excel,0.447214
731,Excel 4 Accounting Bookkeeping Master Lookup...,0.365148
3581,Display analyze GIS data web Leafletjs,0.365148
480,Visualizing Data,0.316228
...,...,...
3678,Learn jQuery Scratch Master JavaScript library,0.000000
3679,Design WordPress Website Coding,0.000000
3680,Learn Build Polymer,0.000000
3681,CSS Animations Create Amazing Effects Website,0.000000


In [62]:
def recommend_course(title,numrec = 10):
    
    course_index = pd.Series(
        df.index, index=df['course_title']).drop_duplicates()

    index = course_index[title]

    scores = list(enumerate(cossim [index]))

    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)

    selected_course_index = [i[0] for i in sorted_scores[1:]]

    selected_course_score = [i[1] for i in sorted_scores[1:]]

    recdf = df.iloc[selected_course_index]

    recdf['similarity_score'] = selected_course_score

    recommends = recdf[[
        'course_title', 'similarity_score', 'url', 'price', 'num_subscribers']]

    return recommends.head(numrec)

rec = recommend_course('Financial Statements Made Easy',20)

rec

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recdf['similarity_score'] = selected_course_score


Unnamed: 0,course_title,similarity_score,url,price,num_subscribers
101,Financial Statements Basics,0.666667,https://www.udemy.com/financial-statements-bas...,20,1374
226,Interpreting Financial Statements,0.666667,https://www.udemy.com/financialstatements/,20,1449
721,Introduction to Financial Statements,0.666667,https://www.udemy.com/introduction-to-financia...,40,112
780,Understanding Financial Statements,0.666667,https://www.udemy.com/understanding-financial-...,25,0
897,Understanding Financial Statements,0.666667,https://www.udemy.com/understanding-financial-...,25,0
931,How to Read Financial Statements,0.666667,https://www.udemy.com/how-to-read-financial-st...,20,5
946,Interpreting financial statements,0.666667,https://www.udemy.com/financial-statements-foc...,0,2119
1160,Financial Accounting Made Easy for All,0.666667,https://www.udemy.com/financialaccounting/,20,87
475,How to Read Financial Statements: Build Finan...,0.612372,https://www.udemy.com/learn-how-to-read-financ...,25,188
132,Building Financial Statements in Excel,0.57735,https://www.udemy.com/guide-to-building-financ...,35,1181


Save the cleaned file into a new file

In [65]:
df.to_csv('udemy_cleaned.csv',index = None)