# Libraries

In [3]:
import pandas as pd
import numpy as np
import math

from textblob import TextBlob

import importlib as imp

# Read data

In [24]:
# Load the raw Udemy course catalogue from the project's local data folder.
data = pd.read_csv(
    './data/udemy_courses.csv',
)

# Preview the first five rows to sanity-check how the columns were parsed.
display(data.head(n=5))

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,2017-01-18T20:58:58Z,Business Finance
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,2017-03-09T16:34:20Z,Business Finance
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5 hours,2016-12-19T19:26:30Z,Business Finance
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3 hours,2017-05-30T20:07:24Z,Business Finance
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2 hours,2016-12-13T14:57:18Z,Business Finance


# Exploring data

In [None]:
# Project-local helper module used by the exploration cells in this notebook.
from source import helper
# Reload the module (importlib is aliased as `imp` in the imports cell) so that
# edits to the helper are picked up without restarting the kernel; the trailing
# `;` suppresses the module repr that would otherwise be echoed as cell output.
imp.reload(helper);

__Findings__:
* The `course_id` field is of type int64. However, its maximum value is only 1282064, so we can downcast it to int32 to save memory.
* The `is_paid` field appears to be a Boolean field containing only True and False, although pandas currently reads it as object, so we should first check it for unexpected values. Once that is confirmed, we can convert it to int8 to save memory.
* Normally, a `price` field would be of type int or float. Here, however, it is of type object, so we need to investigate which values prevent it from being parsed as a number.
* The three count fields `num_subscribers`, `num_reviews`, and `num_lectures` are all of type int64. However, their maximum values differ, so each can be downcast to the smallest integer type that still holds its range:
    * `num_subscribers`: int64 -> int32
    * `num_reviews`: int64 -> int16
    * `num_lectures`: int64 -> int16
* The `level` field appears to be a categorical variable. We need to check whether it contains unusual values such as empty strings, unreadable strings, etc.
* Normally, a `content_duration` field would be of type int or float. Here, however, it is of type object, so we need to investigate its values and work out how to convert them to numbers.
* For now, the most meaningful part of the `published_timestamp` field is probably the year, because we can use it to find out which technical topics were popular in a particular year.
* The `subject` field appears to be a categorical variable. We need to check whether it contains unusual values such as empty strings, unreadable strings, etc. This field can also be used for the evaluation.

In [25]:
# Print each column's dtype and non-null count, plus the frame's memory usage,
# to spot columns whose dtype is wider or looser than the data requires.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   course_id            3683 non-null   int64 
 1   course_title         3683 non-null   object
 2   url                  3683 non-null   object
 3   is_paid              3683 non-null   object
 4   price                3683 non-null   object
 5   num_subscribers      3683 non-null   int64 
 6   num_reviews          3683 non-null   int64 
 7   num_lectures         3683 non-null   int64 
 8   level                3683 non-null   object
 9   content_duration     3683 non-null   object
 10  published_timestamp  3683 non-null   object
 11  subject              3683 non-null   object
dtypes: int64(4), object(8)
memory usage: 345.4+ KB


In [44]:
# Summary statistics for the three count columns; the min/max rows show the
# value range each column actually needs to hold.
count_columns = ['num_subscribers', 'num_reviews', 'num_lectures']
data[count_columns].describe()

Unnamed: 0,num_subscribers,num_reviews,num_lectures
count,3683.0,3683.0,3683.0
mean,3193.371165,156.448004,40.062178
std,9498.231406,935.078241,50.366788
min,0.0,0.0,0.0
25%,110.0,4.0,15.0
50%,911.0,18.0,25.0
75%,2537.5,67.0,45.0
max,268923.0,27445.0,779.0


__Findings__:
* There are no NaN or missing values in the initial data.

In [9]:
# Report the number of missing values per column using the project-local helper.
# NOTE(review): presumably a thin wrapper around `data.isna().sum()` — confirm in source/helper.py.
helper.check_nan(data)

course_id              0
course_title           0
url                    0
is_paid                0
price                  0
num_subscribers        0
num_reviews            0
num_lectures           0
level                  0
content_duration       0
published_timestamp    0
subject                0
dtype: int64