# Libraries

In [106]:
import pandas as pd
import numpy as np
import math

from textblob import TextBlob

import importlib as imp

import matplotlib.pyplot as plt

# Read data

In [2]:
# Load the raw Udemy course export and preview its first rows.
raw_csv_path = './data/udemy_courses.csv'
data = pd.read_csv(raw_csv_path)
display(data.head())

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,2017-01-18T20:58:58Z,Business Finance
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,2017-03-09T16:34:20Z,Business Finance
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5 hours,2016-12-19T19:26:30Z,Business Finance
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3 hours,2017-05-30T20:07:24Z,Business Finance
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2 hours,2016-12-13T14:57:18Z,Business Finance


# Exploring data

In [161]:
# Bring in the project's helper and pipeline modules, then reload each one so
# that edits made to them on disk are picked up without restarting the kernel.
from source import helper, pipeline

for _module in (helper, pipeline):
    imp.reload(_module)

## Initial findings

__Findings__:
* The `course_id` field is of type int64. However, its maximum value is only 1282064, so we can downcast it to int32 to save memory.
* The `is_paid` field appears to be a Boolean field containing only True and False. If so, we can convert it to int8 to save memory.
* The `price` field would normally be an int or float, but here it is of type object, so it needs further investigation.
* The three count fields `num_subscribers`, `num_reviews`, and `num_lectures` are all int64, but their maximum values differ considerably, so each can be downcast to the smallest integer type that holds its range:
    * `num_subscribers`: int64 -> int32
    * `num_reviews`: int64 -> int16
    * `num_lectures`: int64 -> int16
* The `level` field might be a categorical variable. We will need to check whether it contains unusual data points such as empty strings, non-readable strings, etc.
* The `content_duration` field would normally be an int or float, but here it is of type object, so we need to investigate how to transform it.
* For now, the most useful part of the `published_timestamp` field is probably the year, because it lets us look for the technical topics that were popular in a particular year.
* The `subject` field might be a categorical variable. We will need to check whether it contains unusual data points such as empty strings, non-readable strings, etc. This field can also be used for the evaluation.

In [4]:
# Show each column's dtype and non-null count to spot columns stored with an unexpected type.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   course_id            3683 non-null   int64 
 1   course_title         3683 non-null   object
 2   url                  3683 non-null   object
 3   is_paid              3683 non-null   object
 4   price                3683 non-null   object
 5   num_subscribers      3683 non-null   int64 
 6   num_reviews          3683 non-null   int64 
 7   num_lectures         3683 non-null   int64 
 8   level                3683 non-null   object
 9   content_duration     3683 non-null   object
 10  published_timestamp  3683 non-null   object
 11  subject              3683 non-null   object
dtypes: int64(4), object(8)
memory usage: 345.4+ KB


In [5]:
# Summary statistics for the three count columns; the max values indicate the
# smallest integer dtype that can safely hold each column.
data[['num_subscribers', 'num_reviews', 'num_lectures']].describe()

Unnamed: 0,num_subscribers,num_reviews,num_lectures
count,3683.0,3683.0,3683.0
mean,3193.371165,156.448004,40.062178
std,9498.231406,935.078241,50.366788
min,0.0,0.0,0.0
25%,110.0,4.0,15.0
50%,911.0,18.0,25.0
75%,2537.5,67.0,45.0
max,268923.0,27445.0,779.0


__Findings__:
* There are no NaN or missing values in the initial data.

In [6]:
# Per-column NaN counts as reported by the project helper
# (presumably a wrapper around isna().sum() — implementation not shown here; confirm).
helper.check_nan(data)

course_id              0
course_title           0
url                    0
is_paid                0
price                  0
num_subscribers        0
num_reviews            0
num_lectures           0
level                  0
content_duration       0
published_timestamp    0
subject                0
dtype: int64

## Going deeper into each feature

__course_id feature__:
* There are duplicate courses in the data, so courses that appear more than once need to be merged.
* The type of the feature needs to be converted to np.int32.

In [7]:
# # Pre-data
# print("Pre-data:")
# display(helper.describe_freq(data, 'course_id', 2))

# # Process
# data = helper.merge_duplicate_row(data, 'course_id')
# data['course_id'] = data['course_id'].astype(np.int32)

# # Post-data
# print("Post-data:")
# display(helper.describe_freq(data, 'course_id', 2))

__is_paid feature__:
* There is an unusual value; it needs to be replaced with np.nan.

In [8]:
# # Pre-data
# print("Pre-data:")
# display(helper.describe_freq(data, 'is_paid', 1))

# # Process
# data['is_paid'] = data['is_paid'].apply(lambda x: x.lower()) \
#                                     .apply(lambda x: helper.convert_bool_to_int(x))
# data.dropna(subset=['is_paid'], inplace=True)
# data['is_paid'] = data['is_paid'].astype(np.int8)

# # Post-data
# print("Post-data:")
# display(helper.describe_freq(data, 'is_paid', 1))
# print("NaN values: {}".format(helper.check_nan(data['is_paid'])))

__course_title feature:__
* Found that courses whose title has a length of less than 3 might be multilingual courses. Therefore, those titles need to be replaced by the text from the url feature.

__url feature:__
* Found that some URLs contain multiple words run together without spacing.

In [9]:
# print("Title length: 1")
# display(helper.check_text_length(data, 'course_title', 1).head())
# print("Title length: 2")
# display(helper.check_text_length(data, 'course_title', 2).head())

In [10]:
# data = helper.convert_url_to_string(data, 'url')
# print("URL length: 1")
# display(helper.check_text_length(data, 'url', 1).head())

In [11]:
# helper.check_text_length(pipeline.clean_data(data), 'course_title', 2).head()

__price feature:__
* Found that the price feature should be of integer type, but it is stored as object.
* Found that the value 'Free' appears in the price feature. It should be set to 0.

In [32]:
# # Pre-data
# print("Pre-data:")
# display(np.unique(data['price']))

# # Process
# data = helper.convert_free_price(data, 'price')

# # Post-data
# print("Post-data:")
# display(np.unique(data['price']))

__level feature:__
* Found that the level feature contains 4 unique values. These values can be converted to dummy (one-hot) variables for further analysis.

In [42]:
# display(np.unique(data['level']))
# data = pd.get_dummies(data, columns=['level'])

array(['All Levels', 'Beginner Level', 'Expert Level',
       'Intermediate Level'], dtype=object)

__content_duration feature:__
* Found that there are 4 different categories of values in this feature.
    * '0' indicates that there is no course duration. These rows need to be removed.
    * 'hour', 'hours' or 'mins' indicate a course duration measured in time.
    * 'questions' indicates a course duration measured in questions; it is converted to time by assuming that each question can be solved in 2.5 minutes.
* To give the feature a consistent unit, all observations are converted into hour(s).

In [78]:
# display(np.unique(data['content_duration']))
# display(data[data['content_duration'].isin(['218 questions', '24 questions', '82 questions', '0'])])
# data['content_duration'] = data['content_duration'].apply(lambda duration: helper.convert_duration(duration))
# data['content_duration'] = data['content_duration'].astype(np.float32)
# data.dropna(subset=['content_duration'], inplace=True)

__published_timestamp feature:__
* Found that all of the courses in this dataset were published from 2011 to 2017. This information can be used for further analysis when trying to match the trending topics in each year. 

In [98]:
# display(np.unique(data['published_timestamp'].apply(lambda time: time.split("-")[0])))
# data = helper.convert_published_time(data, 'published_timestamp')

__subject feature:__
* Found that the subject feature contains 4 unique values. These values can be converted to dummy (one-hot) variables for further analysis.

In [104]:
# display(np.unique(data['subject']))
# data = pd.get_dummies(data, columns=['subject'])

# Finding the definition of popularity

## Price, subscribers, reviews and lectures

On average, most of the courses in the dataset are low-priced. In addition, most of these courses have a low number of subscribers, a low number of lectures, and a low number of reviews. In order to define the meaning of popularity in this scenario, we need to dig into the courses that have both a high number of subscribers and a high number of reviews.

In [171]:
# NOTE(review): the output below (count 3675, numeric `price`) does not match the
# raw frame shown by data.info() (3683 rows, `price` as object), so `data` was
# presumably cleaned in a cell that is no longer active (pipeline.clean_data?) —
# confirm and restore that step so the notebook survives Restart & Run All.
data[['price', 'num_subscribers', 'num_reviews', 'num_lectures']].describe()

Unnamed: 0,price,num_subscribers,num_reviews,num_lectures
count,3675.0,3675.0,3675.0,3675.0
mean,66.07619,3188.536327,156.247619,40.107483
std,61.017878,9484.511154,935.806395,50.409554
min,0.0,0.0,0.0,0.0
25%,20.0,111.5,4.0,15.0
50%,45.0,912.0,18.0,25.0
75%,95.0,2545.0,67.0,46.0
max,200.0,268923.0,27445.0,779.0


In [174]:
# Show the helper.describe_crosstab table of each count feature against price.
for count_feature in ('num_subscribers', 'num_lectures', 'num_reviews'):
    display(helper.describe_crosstab(data, count_feature, 'price'))

Unnamed: 0,low_price,high_price
low_num_subscribers,2043,862
high_num_subscribers,439,331


Unnamed: 0,low_price,high_price
low_num_lectures,1966,639
high_num_lectures,516,554


Unnamed: 0,low_price,high_price
low_num_reviews,2217,952
high_num_reviews,265,241
