In [1]:
import pyspark
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
#Create SparkSession
spark = SparkSession.builder.master("local[1]").appName("SparkByExamples.com").getOrCreate()

# Data Processing

In [2]:
import os
UdemyDF = spark.read.csv('udemy_tech.csv',header=True, inferSchema=True )
UdemyDF=UdemyDF.withColumn("Enrollment",UdemyDF.Enrollment.cast('int'))
UdemyDF=UdemyDF.withColumn("Stars",UdemyDF.Stars.cast('double'))
UdemyDF=UdemyDF.withColumn("Rating",UdemyDF.Rating.cast('int'))
UdemyDF.printSchema()
UdemyDF.show()

root
 |-- index: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Enrollment: integer (nullable = true)
 |-- Stars: double (nullable = true)
 |-- Rating: integer (nullable = true)
 |-- Link: string (nullable = true)

+-----+--------------------+--------------------+----------+--------+------+--------------------+
|index|               Title|             Summary|Enrollment|   Stars|Rating|                Link|
+-----+--------------------+--------------------+----------+--------+------+--------------------+
|    0|2020 Complete Pyt...|Learn Python like...|   1100093|     4.6|318066|https://www.udemy...|
|    1|The Web Developer...|The only course y...|    596726|     4.6|182997|https://www.udemy...|
|    2|Machine Learning ...|Learn to create M...|    692812|     4.5|132228|https://www.udemy...|
|    3|Angular - The Com...|"Master Angular 1...|      null|422557.0|     4|              129984|
|    4|Java Programming ...|Learn Java 

In [3]:
CourseraDF = spark.read.csv('Coursera.csv',header=True, inferSchema=True )
CourseraDF.show()

+--------------------+--------------------+----------------+--------------+--------------------+--------------------+--------------------+
|         Course Name|          University|Difficulty Level| Course Rating|          Course URL|  Course Description|              Skills|
+--------------------+--------------------+----------------+--------------+--------------------+--------------------+--------------------+
|Write A Feature L...|Michigan State Un...|        Beginner|           4.8|https://www.cours...|"Write a Full Len...| writing! You wil...|
|Business Strategy...|Coursera Project ...|        Beginner|           4.8|https://www.cours...|By the end of thi...|Finance  business...|
|Silicon Thin Film...| �cole Polytechnique|        Advanced|           4.1|https://www.cours...|"This course cons...| thin film semico...|
|Finance for Managers|IESE Business School|    Intermediate|           4.8|https://www.cours...|When it comes to ...|accounts receivab...|
|Retrieve Data usi...|Cours

In [4]:
CourseraDF.printSchema()

root
 |-- Course Name: string (nullable = true)
 |-- University: string (nullable = true)
 |-- Difficulty Level: string (nullable = true)
 |-- Course Rating: string (nullable = true)
 |-- Course URL: string (nullable = true)
 |-- Course Description: string (nullable = true)
 |-- Skills: string (nullable = true)



In [5]:
CourseraDF = CourseraDF\
  .withColumn('Course Rating', CourseraDF['Course Rating'].cast('double'))\
   .withColumnRenamed('Course Name', 'Title')\
.withColumnRenamed('Course Description', 'Summary')\
.withColumnRenamed('Course Rating', 'Stars')\
.withColumnRenamed('Course URL', 'Link')


In [6]:
CourseraDF.printSchema()


root
 |-- Title: string (nullable = true)
 |-- University: string (nullable = true)
 |-- Difficulty Level: string (nullable = true)
 |-- Stars: double (nullable = true)
 |-- Link: string (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Skills: string (nullable = true)



In [7]:
UdemyDF.printSchema()

root
 |-- index: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Enrollment: integer (nullable = true)
 |-- Stars: double (nullable = true)
 |-- Rating: integer (nullable = true)
 |-- Link: string (nullable = true)



In [8]:
from pyspark.sql.functions import monotonically_increasing_id,lit
CourseraDF = CourseraDF.withColumn("index", lit(9964) + monotonically_increasing_id())

In [9]:
CourseraDF.printSchema()

root
 |-- Title: string (nullable = true)
 |-- University: string (nullable = true)
 |-- Difficulty Level: string (nullable = true)
 |-- Stars: double (nullable = true)
 |-- Link: string (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Skills: string (nullable = true)
 |-- index: long (nullable = false)



In [10]:
CourseraDF = CourseraDF.select([c for c in CourseraDF.columns if (c != 'University' and c != 'Difficulty Level' and c != 'Skills')])

In [11]:
CourseraDF.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Stars: double (nullable = true)
 |-- Link: string (nullable = true)
 |-- Summary: string (nullable = true)
 |-- index: long (nullable = false)



In [12]:
UdemyDF = UdemyDF.select([c for c in UdemyDF.columns if (c != 'Enrollment' and c != 'Rating')])

In [13]:
UdemyDF.printSchema()

root
 |-- index: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Stars: double (nullable = true)
 |-- Link: string (nullable = true)



In [14]:
CourseraDF = CourseraDF.select("index", "Title", "Summary","Stars","Link")

In [15]:
CourseraDF.printSchema()

root
 |-- index: long (nullable = false)
 |-- Title: string (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Stars: double (nullable = true)
 |-- Link: string (nullable = true)



In [16]:
from pyspark.sql.functions import lit

CourseraDF = CourseraDF.withColumn("Source", lit('Coursera'))
UdemyDF = UdemyDF.withColumn("Source", lit('Udemy'))

In [17]:
CourseraDF.printSchema()
UdemyDF.printSchema()

root
 |-- index: long (nullable = false)
 |-- Title: string (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Stars: double (nullable = true)
 |-- Link: string (nullable = true)
 |-- Source: string (nullable = false)

root
 |-- index: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Stars: double (nullable = true)
 |-- Link: string (nullable = true)
 |-- Source: string (nullable = false)



In [18]:
CourseraDF.show()

+-----+--------------------+--------------------+-----+--------------------+--------+
|index|               Title|             Summary|Stars|                Link|  Source|
+-----+--------------------+--------------------+-----+--------------------+--------+
| 9964|Write A Feature L...|"Write a Full Len...|  4.8|https://www.cours...|Coursera|
| 9965|Business Strategy...|By the end of thi...|  4.8|https://www.cours...|Coursera|
| 9966|Silicon Thin Film...|"This course cons...|  4.1|https://www.cours...|Coursera|
| 9967|Finance for Managers|When it comes to ...|  4.8|https://www.cours...|Coursera|
| 9968|Retrieve Data usi...|In this course yo...|  4.6|https://www.cours...|Coursera|
| 9969|Building Test Aut...|Selenium is one o...|  4.7|https://www.cours...|Coursera|
| 9970|Doing Business in...|Doing Business in...|  3.3|https://www.cours...|Coursera|
| 9971|Programming Langu...|This course is an...|  4.9|https://www.cours...|Coursera|
| 9972|The Roles and Res...|This course provi...|  4.3

In [19]:
UdemyDF.show()

+-----+--------------------+--------------------+--------+--------------------+------+
|index|               Title|             Summary|   Stars|                Link|Source|
+-----+--------------------+--------------------+--------+--------------------+------+
|    0|2020 Complete Pyt...|Learn Python like...|     4.6|https://www.udemy...| Udemy|
|    1|The Web Developer...|The only course y...|     4.6|https://www.udemy...| Udemy|
|    2|Machine Learning ...|Learn to create M...|     4.5|https://www.udemy...| Udemy|
|    3|Angular - The Com...|"Master Angular 1...|422557.0|              129984| Udemy|
|    4|Java Programming ...|Learn Java In Thi...|     4.6|https://www.udemy...| Udemy|
|    5|React - The Compl...|Dive in and learn...|     4.6|https://www.udemy...| Udemy|
|    6|The Complete 2020...|Become a full-sta...|     4.7|https://www.udemy...| Udemy|
|    7|Python for Data S...|Learn how to use ...|     4.6|https://www.udemy...| Udemy|
|    8|The Complete Java...|Master JavaScri

In [20]:
DF = UdemyDF.union(CourseraDF)

In [21]:
DF.show(20000)

+-----+--------------------+--------------------+--------+--------------------+--------+
|index|               Title|             Summary|   Stars|                Link|  Source|
+-----+--------------------+--------------------+--------+--------------------+--------+
|    0|2020 Complete Pyt...|Learn Python like...|     4.6|https://www.udemy...|   Udemy|
|    1|The Web Developer...|The only course y...|     4.6|https://www.udemy...|   Udemy|
|    2|Machine Learning ...|Learn to create M...|     4.5|https://www.udemy...|   Udemy|
|    3|Angular - The Com...|"Master Angular 1...|422557.0|              129984|   Udemy|
|    4|Java Programming ...|Learn Java In Thi...|     4.6|https://www.udemy...|   Udemy|
|    5|React - The Compl...|Dive in and learn...|     4.6|https://www.udemy...|   Udemy|
|    6|The Complete 2020...|Become a full-sta...|     4.7|https://www.udemy...|   Udemy|
|    7|Python for Data S...|Learn how to use ...|     4.6|https://www.udemy...|   Udemy|
|    8|The Complete J

In [22]:
DF = DF.na.fill({'Title':0.0,'Summary':0.0,'Stars':0.0,'Link':0.0}).na.fill('')

In [23]:
df=DF

# TF IDF


In [24]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
columns = ['Title', 'Summary']
minDFs = {'Title':2.0, 'Summary':4.0}
preProcStages = []
for col in columns:
    regexTokenizer = RegexTokenizer(gaps=False, pattern='\w+', inputCol=col, outputCol=col+'Token')
    stopWordsRemover = StopWordsRemover(inputCol=col+'Token', outputCol=col+'SWRemoved')
    countVectorizer = CountVectorizer(minDF=minDFs[col], inputCol=col+'SWRemoved', outputCol=col+'TF')
    idf = IDF(inputCol=col+'TF', outputCol=col+'IDF') 
    preProcStages += [regexTokenizer, stopWordsRemover, countVectorizer, idf]
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=preProcStages)

In [25]:
model = pipeline.fit(df)
df = model.transform(df)
df = df.select('index', 'TitleIDF', 'SummaryIDF','Stars','Link','Source')


In [26]:
df.sample(False, .01).show()

+-----+--------------------+--------------------+-----+--------------------+------+
|index|            TitleIDF|          SummaryIDF|Stars|                Link|Source|
+-----+--------------------+--------------------+-----+--------------------+------+
|   16|(3548,[3,6,15,34,...|(7646,[0,43,63,15...|  4.8|https://www.udemy...| Udemy|
|  202|(3548,[3,7,17,56,...|(7646,[2,27,60,22...|  4.4|https://www.udemy...| Udemy|
|  361|(3548,[21,33,58,2...|(7646,[3,7,39,70,...|  4.3|https://www.udemy...| Udemy|
|  431|(3548,[0,5,12,20,...|(7646,[10,24,25,5...|  4.1|https://www.udemy...| Udemy|
|  598|(3548,[17,31,41,9...|(7646,[1,9,17,22,...|  4.7|https://www.udemy...| Udemy|
|  612|(3548,[8,21,48,63...|(7646,[125,146,19...|  4.7|https://www.udemy...| Udemy|
|  695|(3548,[10,52,59,1...|(7646,[0,6,29,35,...|  4.4|https://www.udemy...| Udemy|
|  880|(3548,[1,21,30,76...|(7646,[16,345,603...|  4.8|https://www.udemy...| Udemy|
|  925|(3548,[5,10,16,83...|(7646,[3,10,11,24...|  4.5|https://www.udemy...|

# Cosine similarity

In [27]:
import math

def cosine_similarityy(X, Y):
    denom = X.norm(2) * Y.norm(2)
    if denom == 0.0:
        return -1.0
    else:
        return X.dot(Y) / float(denom)

# Search

In [28]:
DF_pandas=DF
DF_pandas = DF_pandas.toPandas()
TDF=DF.toPandas()
RDF=DF.toPandas()


In [29]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cs
string = input()


python


In [None]:

def search(string,DF_pandas):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(DF_pandas['Title'])
    string_vector = vectorizer.transform([string])
    cosine_sim = cs(string_vector, vectors)
    cos=[]
    for i in range(len(DF_pandas['Title'])):
        cos.append(cosine_sim[0][i])
    DF_pandas['cosine_sim']=cos
    DF_pandas=DF_pandas.sort_values(by=["cosine_sim"], ascending=False)
    return(DF_pandas.head(20))
search(string,DF_pandas)


In [None]:
plotDF=DF_pandas.iloc[:15,:]

plt.plot(plotDF['index'],plotDF['cosine_sim'])
plt.xlabel('course index')
plt.ylabel('cosine similarity')
plt.title('Index vs cosine similarity for Search ')

# Title based recommendation

In [None]:
x=int(input())
cdf=DF.filter(DF['index'] == x)\
    .select('index','Title', 'Summary','Stars','Link')
cdf.show(truncate=False)

In [None]:
data_collect = df.collect()

In [None]:
def Sort(sub_li):
    l = len(sub_li)
    for i in range(0, l):
        for j in range(0, l-i-1):
            if (sub_li[j][1] < sub_li[j + 1][1]):
                tempo = sub_li[j]
                sub_li[j]= sub_li[j + 1]
                sub_li[j + 1]= tempo
    return sub_li

In [None]:
def recomm(x,TDF):
    gProd1 = df.filter(df['index'] == x).collect()[0]
    l=[]
    for row in data_collect:
        c=cosine_similarityy(row['TitleIDF'], gProd1['TitleIDF'])
        i=row['index']
        l+=[c]
    tit=[]
      
    for i in range(len(TDF['Title'])):
        tit.append(l[i])
    TDF['titlesim']=tit
    TDF1=TDF.sort_values(by=["titlesim"], ascending=False)
    TDF1=TDF1.iloc[1:,:]
    
    return(TDF1.head(10))
    
            
        

In [None]:
recomm(x,TDF)

In [None]:
plotDF1=TDF.iloc[:15,:]

plt.plot(plotDF1['index'],plotDF1['titlesim'])
plt.xlabel('course index')
plt.ylabel('Title similarity')
plt.title('Index vs Title similarity for Recommendation ')

# content based recommendation

In [None]:
from pyspark.sql.types import *
columns = StructType([StructField('index',
                                  StringType(), True),
                    StructField('Title',
                                StringType(), True),
                    StructField('Summary',
                                StringType(), True),
                    StructField('Stars',
                                StringType(), True),
                     StructField('Link',
                                StringType(), True),
                     StructField('Source',
                                StringType(), True)])

df_combined = spark.createDataFrame([], schema=columns)

In [None]:
import pyspark.sql.functions as f
def recommend(x,RDF):
    gProd1 = df.filter(df['index'] == x).collect()[0]
    l=[]
    for row in data_collect:
        c=cosine_similarityy(row['SummaryIDF'], gProd1['SummaryIDF'])
        i=row['index']
        l+=[(c)]
    rec=[]
      
    for i in range(len(RDF['Title'])):
        rec.append(l[i])
    RDF['sim']=rec
    RDF1=RDF.sort_values(by=["sim"], ascending=False)
    RDF1=RDF1.iloc[1:,:]
    
    return(RDF1.head(10))
        

In [None]:
ci=int(input())
print(DF.filter(DF['index'] == ci )\
              .select('index','Title', 'Summary','Stars','Link','Source')\
             .show(truncate=False))


In [None]:
recommend(ci,RDF)

In [None]:
plotDF2=RDF.iloc[:15,:]

plt.plot(plotDF2['index'],plotDF2['sim'])
plt.xlabel('course index')
plt.ylabel('Summary similarity')
plt.title('Index vs Summary similarity for Recommendation ')

# Top rated courses

In [None]:
def toprated(ci,RDF):
    return(recommend(ci,RDF).sort_values(by=["Stars"], ascending=False))

In [None]:
toprated(ci,RDF)

# Compare

In [39]:
def compare(x,y):
    dff=TDF.iloc[x].values
    df2=TDF.iloc[y].values
    data=[]
    for i in range(len(dff)):
        data.append(str(dff[i]))
    data.pop(0)
    data.pop()
    cdf=pd.DataFrame(['Tile','Summary', 'Rating','Link','Source'])
    cdf['ind']=['Tile','Summary', 'Rating','Link','Source']
    data2=[]
    for i in range(len(df2)):
        data2.append(str(df2[i]))
    data2.pop(0)
    data2.pop()
    cdf[1]=data2
    cdf[2]=data
    return cdf

In [42]:
x=compare(1,2).set_index('ind')
x

Unnamed: 0_level_0,0,1,2
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tile,Tile,Machine Learning A-Z™: Hands-On Python & R In ...,The Web Developer Bootcamp
Summary,Summary,Learn to create Machine Learning Algorithms in...,The only course you need to learn web developm...
Rating,Rating,4.5,4.6
Link,Link,https://www.udemy.com/course/machinelearning/,https://www.udemy.com/course/the-web-developer...
Source,Source,Udemy,Udemy


In [None]:
c=cosine_similarityy(data_collect[780]['SummaryIDF'], data_collect[3288]['SummaryIDF'])

In [None]:
print( "similarity score",c)