In [1]:
# import modules
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum
from pyspark.sql.functions import regexp_replace, lower
from pyspark.sql.types import IntegerType

In [2]:
spark = SparkSession.builder.appName("CourseraCourseAnalytics").getOrCreate()


24/03/06 15:32:56 WARN Utils: Your hostname, paulet-HP-EliteBook-Folio-9470m resolves to a loopback address: 127.0.1.1; using 172.17.0.1 instead (on interface docker0)
24/03/06 15:32:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/06 15:33:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# load dataset
df = spark.read.csv("CourseraDataset-Unclean.csv", header=True, inferSchema=True)

                                                                                

In [4]:
# show schema
print('Displaying Dataframe schema')
df.printSchema()

Displaying Dataframe schema
root
 |-- Course Title: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Level: string (nullable = true)
 |-- Duration: string (nullable = true)
 |-- Schedule: string (nullable = true)
 |-- Review: string (nullable = true)
 |-- What you will learn: string (nullable = true)
 |-- Skill gain: string (nullable = true)
 |-- Modules: string (nullable = true)
 |-- Instructor: string (nullable = true)
 |-- Offered By: string (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Course Url: string (nullable = true)



In [5]:
# display first few rows
df.show(5)

                                                                                

+--------------------+------+--------------+--------------------+-----------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+
|        Course Title|Rating|         Level|            Duration|         Schedule|       Review| What you will learn|          Skill gain|             Modules|          Instructor|          Offered By|            Keyword|          Course Url|
+--------------------+------+--------------+--------------------+-----------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+
|   Fashion as Design|   4.8|Beginner level|20 hours (approxi...|Flexible schedule|2,813 reviews|                NULL|['Art History', '...|['Introduction', ...|['Anna Burckhardt...|['The Museum of M...|Arts and Humanities|https://www.cours...|
|Modern American P...|  

In [6]:
df.describe().show(5)

24/03/06 15:34:45 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 5:>                                                          (0 + 1) / 1]

+-------+------------------------------+--------------------+----------------+--------------------+--------------------+------------------+---------------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|summary|                  Course Title|              Rating|           Level|            Duration|            Schedule|            Review|              What you will learn|          Skill gain|             Modules|          Instructor|          Offered By|             Keyword|          Course Url|
+-------+------------------------------+--------------------+----------------+--------------------+--------------------+------------------+---------------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  count|                          9834|                8305|            8460|                9455| 

                                                                                

## data Cleaning and preprocessing

In [7]:
null_counts = df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns])
null_counts.show()



+------------+------+-----+--------+--------+------+-------------------+----------+-------+----------+----------+-------+----------+
|Course Title|Rating|Level|Duration|Schedule|Review|What you will learn|Skill gain|Modules|Instructor|Offered By|Keyword|Course Url|
+------------+------+-----+--------+--------+------+-------------------+----------+-------+----------+----------+-------+----------+
|           5|  1534| 1379|     384|     806|  1566|               4734|       288|    324|       334|       345|    348|       349|
+------------+------+-----+--------+--------+------+-------------------+----------+-------+----------+----------+-------+----------+



                                                                                

In [7]:
# drop missing values 
df = df.na.drop()

In [8]:
# drop duplicates
df = df.dropDuplicates()

In [9]:
# standardize course title ext
df = df.withColumn("Course Title", lower(regexp_replace(df["Course Title"], "[^a-zA-Z0-9\\s]", "")))
df.select("Course Title").show()

[Stage 8:>                                                          (0 + 1) / 1]

+--------------------+
|        Course Title|
+--------------------+
|lean management f...|
|mathematics for e...|
|artificial intell...|
|   google profess...|
|data science and ...|
|data storage in m...|
|ciberseguridad sp...|
|      google prof...|
|ai capstone proje...|
|linux system prog...|
|    learn javascript|
|health after canc...|
|population health...|
|clinical trials o...|
|children acquirin...|
|operating systems...|
|exploring data tr...|
|cloud computing p...|
|splunk search exp...|
|essential linear ...|
+--------------------+
only showing top 20 rows



                                                                                

In [10]:
# remove reviews from "reviews" and 
df = df.withColumn("Review", regexp_replace("Review", "reviews", '').cast(IntegerType()))
df.select('Review').show()

[Stage 11:>                                                         (0 + 1) / 1]

+------+
|Review|
+------+
|    10|
|   572|
|   392|
|   116|
|    20|
|   156|
|   221|
|  NULL|
|   502|
|    51|
|    49|
|    75|
|    17|
|   228|
|   162|
|    24|
|   737|
|    32|
|    34|
|   126|
+------+
only showing top 20 rows



                                                                                

### Data Analysis

In [11]:
df.filter(col('Rating') > 4).show()



+--------------------+------+------------------+--------------------+-----------------+------+-------------------------------+----------------------------+-----------------------------+--------------------+--------------------+--------------------+--------------------+
|        Course Title|Rating|             Level|            Duration|         Schedule|Review|            What you will learn|                  Skill gain|                      Modules|          Instructor|          Offered By|             Keyword|          Course Url|
+--------------------+------+------------------+--------------------+-----------------+------+-------------------------------+----------------------------+-----------------------------+--------------------+--------------------+--------------------+--------------------+
|cyber security  t...|   5.0|    Beginner level|4 months at 5 hou...|Flexible schedule|    17|           Some of the issue...|        ['Computer Securi...|         ['Introduction to...|['Pro

                                                                                

In [13]:
df.groupBy("Course Title").avg("Review").show()

[Stage 17:>                                                         (0 + 1) / 1]

+--------------------+-----------+
|        Course Title|avg(Review)|
+--------------------+-----------+
|history of mental...|       99.0|
|recommender syste...|      697.0|
|scrum master cert...|      723.0|
|transforming the ...|      390.0|
|applied data scie...|       NULL|
|handson internet ...|      132.0|
|introduction to c...|      696.0|
|managing google w...|      952.0|
|gpt vision seeing...|       11.0|
|sensor manufactur...|      255.0|
|      what and wh...|      214.0|
|transistor  field...|       73.0|
|objetivos de desa...|      665.0|
|songwriting writi...|      845.0|
|point of care tes...|       16.0|
|artificial intell...|      167.0|
|hardware descript...|      546.0|
|analitik data goo...|       NULL|
|       digital twins|       55.0|
|introduction to g...|       NULL|
+--------------------+-----------+
only showing top 20 rows



                                                                                

In [15]:
df.orderBy("Rating", ascending=True).show()



+--------------------+--------------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|        Course Title|              Rating|               Level|            Duration|            Schedule|Review| What you will learn|          Skill gain|             Modules|          Instructor|          Offered By|             Keyword|          Course Url|
+--------------------+--------------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|        data science|          'Big Data'| 'Python Programm...| 'Bash (Unix Shell)'|   'Database (DBMS)'|  NULL|               'SQL'| 'Web Application']"|['Python and Pand...|['Alfredo Deza', ...| ['Duke University']

                                                                                

Partitioning allows you to control the distribution of data across partitions for efficient processing. It ensures that related data is stored together, reducing shuffle operations during transformations.

In [16]:
df_repartitioned = df.repartition("Course Title")
num_partitions = df_repartitioned.rdd.getNumPartitions()
print("Number of partitions:", num_partitions)


[Stage 29:>                                                         (0 + 1) / 1]

Number of partitions: 1


Caching allows you to persist DataFrame in memory or disk for faster access during iterative or repeated computations. It is useful when you need to reuse a DataFrame across multiple actions or transformations.

In [24]:
df.cache()

DataFrame[Course Title: string, Rating: string, Level: string, Duration: string, Schedule: string, Review: int, What you will learn: string, Skill gain: string, Modules: string, Instructor: string, Offered By: string, Keyword: string, Course Url: string]

In [26]:
df.unpersist() #unperist from cache

DataFrame[Course Title: string, Rating: string, Level: string, Duration: string, Schedule: string, Review: int, What you will learn: string, Skill gain: string, Modules: string, Instructor: string, Offered By: string, Keyword: string, Course Url: string]