# UDEMY PROJECT

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio
from ydata_profiling import ProfileReport

ModuleNotFoundError: No module named 'ydata_profiling'

In [None]:
# Load the raw Udemy course export into the working DataFrame.
raw_csv_path = 'udemy_courses-raw.csv'
df = pd.read_csv(raw_csv_path)

In [None]:
# Peek at the first two rows.
df.head(n=2)

In [None]:
# Two randomly chosen rows (output changes on every run).
df.sample(n=2)

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

## Data Processing

Categorical_Data

In [None]:
# Number of rows per distinct 'subject' value.
df['subject'].value_counts()

In [None]:
# Number of rows per distinct 'level' value.
df['level'].value_counts()

In [None]:
# Number of rows per distinct 'is_paid' value.
df['is_paid'].value_counts()

Parsing timestamp

In [None]:
# Inspect the timestamp column as loaded, before any parsing.
df['published_timestamp']

`pd.to_datetime` converts date-time strings into pandas datetime objects.

In [None]:
# Preview of the parsed timestamps; the result is only displayed, not stored.
pd.to_datetime(df['published_timestamp'])

In [None]:
# Column dtypes before the parsed timestamps are assigned back to the frame.
df.dtypes

Assign the parsed values back to the column so that the conversion is saved.

In [None]:
# Replace the timestamp column with its parsed datetime version.
parsed_timestamps = pd.to_datetime(df['published_timestamp'])
df = df.assign(published_timestamp=parsed_timestamps)

In [None]:
# Column dtypes after the parsed timestamps were stored.
df.dtypes

Creating 4 new time columns (Year, Month, Day, Quarter) for future use.

In [None]:
# Calendar year taken from the parsed publication timestamp.
published_at = df['published_timestamp']
df['Year'] = published_at.dt.year

In [None]:
# Month name taken from the parsed publication timestamp.
published_at = df['published_timestamp']
df['Month'] = published_at.dt.month_name()

In [None]:
# Weekday name taken from the parsed publication timestamp.
published_at = df['published_timestamp']
df['Day'] = published_at.dt.day_name()

In [None]:
# Quarter number taken from the parsed publication timestamp.
published_at = df['published_timestamp']
df['Quarter'] = published_at.dt.quarter

To view the added columns...

In [None]:
# One random row, to eyeball the newly added time columns.
df.sample()

We will be setting the Index with respect to timestamp for better DATA_VISUALIZATION


In [None]:
# Index the frame by publication timestamp; drop=False keeps the column
# available as a regular column too.
df = df.set_index('published_timestamp', drop=False)

In [None]:
# One random row, to confirm the timestamp is now the index.
df.sample()

We will add a new column 'Profit'

In [None]:
# 'Profit' is defined here as list price multiplied by subscriber count.
df['Profit'] = df['price'].mul(df['num_subscribers'])

In [None]:
# One random row, to eyeball the new 'Profit' column.
df.sample()

Checking for the Duplicated values in the UNIQUE Course ID 

In [None]:
# Count of distinct course IDs, to compare against the total row count.
df['course_id'].nunique()

Comparing the 3672 unique course IDs with the 3678 rows in the data set shows 6 duplicated course IDs,
so the next step is to locate the duplicated rows.

In [None]:
# Rows that repeat an earlier row across all columns.
repeated_rows = df.duplicated()
df.loc[repeated_rows]

To drop the duplicated values 

In [None]:
# Keep only the first occurrence of each fully duplicated row.
df = df.drop_duplicates()

In [None]:
# Number of fully duplicated rows still present (0 after the drop).
duplicate_mask = df.duplicated()
duplicate_mask.sum()

Categorization of numerical data: since 'content_duration' holds numeric values, we can group it into categories for later use in the project dashboard.

In [None]:
# Frequency of each distinct 'content_duration' value, to guide the binning.
df['content_duration'].value_counts()

In [None]:
# Bucket course length with explicit edges so each label states the range its
# bin really covers. Quantile bins (pd.qcut) move with the data, so they would
# not line up with these fixed labels and can fail on repeated quantile edges.
# NOTE(review): content_duration is presumably in hours — confirm.
duration_edges = [0, 1, 3, 7, 12, 20, np.inf]
duration_labels = ['0:1', '1:3', '3:7', '7:12', '12:20', '20+']
df['Duration'] = pd.cut(df['content_duration'],
                        bins=duration_edges,
                        labels=duration_labels,
                        include_lowest=True)  # a duration of 0 falls in the first bin

In [None]:
# One random row, to eyeball the new 'Duration' column.
df.sample()

In [None]:
# Give the binned column a name that says it holds categories.
df = df.rename(columns={'Duration': 'Duration_category'})

In [None]:
# One random row, to confirm the column rename.
df.sample()

In [None]:
# Number of courses that fall into each duration category.
df['Duration_category'].value_counts()

Categorization of numerical data: since 'price' holds numeric values, we can group it into categories for later use in the project dashboard.

In [None]:
# Frequency of each distinct price, to guide the price categories.
df['price'].value_counts()

In [None]:
# Lookup from an individual price to its price-category label.
# Series.replace treats every dictionary key as one literal value, so a range
# object used as a key never matches a single price. Each price band is
# therefore expanded into one scalar key per price it contains.
price_bands = [
    (range(20, 55, 5), '20:50'),
    (range(55, 105, 5), '55:100'),
    (range(105, 155, 5), '105:150'),
    (range(155, 205, 5), '155:200'),
]
price_dict = {0: "Free"}
for band, band_label in price_bands:
    for band_price in band:
        price_dict[band_price] = band_label

In [None]:
# Series.replace matches dictionary keys as literal values, so any range key in
# price_dict is first expanded into one scalar entry per price it contains.
# Scalar keys are passed through unchanged.
price_lookup = {}
for price_key, category_label in price_dict.items():
    member_prices = price_key if isinstance(price_key, range) else (price_key,)
    for member_price in member_prices:
        price_lookup[member_price] = category_label
df['Price_Category'] = df['price'].replace(price_lookup)

In [None]:
# One random row, to eyeball the new 'Price_Category' column.
df.sample()

## Exploratory Data Analysis (EDA)

- Univariate Analysis
- BI-Variate Analysis
- Multi-Variate Analysis

## EDA for feature "Subject"

### 1) Uni-variate Analysis

In [None]:
# The 'subject' column on its own.
df['subject']

#### A. Feature overview

In [None]:
# Summary statistics of the 'subject' column.
df['subject'].describe()

In [None]:
# Number of non-null 'subject' entries.
df['subject'].count()

In [None]:
# Number of distinct subjects.
df['subject'].nunique()

#### B. How many courses in each subject? 

In [None]:
# Number of courses per subject.
df['subject'].value_counts()

In [None]:
# Bar chart of the number of courses per subject.
courses_per_subject = df['subject'].value_counts()
courses_per_subject.plot.bar()

In [None]:
# Pie chart of the number of courses per subject.
courses_per_subject = df['subject'].value_counts()
courses_per_subject.plot.pie()

### 2) Bi-Variate Analysis

#### A. How Many subscribers in each subject?

In [None]:
# Total subscribers per subject. The column is selected before summing so the
# timestamp, categorical and text columns are never passed to sum().
df.groupby('subject')['num_subscribers'].sum()

In [None]:
pip install nbformat==5.1.2

##### VISUALIZATION Using Plotly

In [None]:
# Aggregate once, selecting the column before summing so non-numeric columns
# are never summed. Only the aggregated vectors are handed to plotly; the
# row-level frame is not passed because its length differs from theirs.
subscribers_by_subject = df.groupby('subject')['num_subscribers'].sum()
fig = px.bar(x = subscribers_by_subject.index,
             y = subscribers_by_subject.values)
fig.update_xaxes(title = 'Subject')
fig.update_yaxes(title = '# Subscribers')
fig.show()

#### B. Profit per each subject?

In [None]:
# Total 'Profit' per subject as a plain dict. The column is selected before
# summing so non-numeric columns are never passed to sum().
dict(df.groupby('subject')['Profit'].sum())
                                

##### VISUALIZATION using Seaborn.

In [None]:
# Aggregate once on the 'Profit' column only. The aggregated vectors are passed
# without data=df, because their length differs from the row-level frame.
profit_by_subject = df.groupby('subject')['Profit'].sum()
plt.figure(figsize =(15,8))
plt.xticks(rotation = 45)
sns.barplot(x = profit_by_subject.index,
            y = profit_by_subject.values,
            palette = 'plasma')


#### C. How many paid and non-paid courses in each subject?

In [None]:
# Sum of 'is_paid' per subject. The column is selected before summing so
# non-numeric columns are never passed to sum().
df.groupby('subject')['is_paid'].sum()

In [None]:
# Unpaid courses: number of courses per subject where is_paid equals False.
unpaid_rows = df['is_paid'].eq(False)
df.loc[unpaid_rows, 'subject'].value_counts().plot.bar()

In [None]:
# Paid courses: number of courses per subject where is_paid equals True.
paid_rows = df['is_paid'].eq(True)
df.loc[paid_rows, 'subject'].value_counts().plot.bar()

### 3) Multi-variate Analysis (subject)

#### A. How many courses in each subject according to is_paid?

#### using Plotly

In [None]:
# Courses per subject, coloured by the is_paid flag.
fig = px.bar(df, x='subject', color='is_paid')
fig.update_xaxes(title='Subject')
fig.update_yaxes(title='# Courses')
fig.show()

#### B. How many courses in each subject according to Price_category?

#### using Seaborn

In [None]:
# Course count per subject, split by price category.
plt.figure(figsize=(18, 5))
plt.xticks(rotation=45)
sns.countplot(data=df, x='subject', hue='Price_Category')

#### C. How many courses in each subject according to Duration_category?

#### using Seaborn

In [None]:
# Course count per subject, split by duration category.
plt.figure(figsize=(18, 5))
plt.xticks(rotation=45)
sns.countplot(data=df, x='subject', hue='Duration_category')

#### D. How many courses in each subject according to level?

#### using Seaborn

In [None]:
# Course count per subject, split by level.
plt.figure(figsize=(18, 5))
plt.xticks(rotation=45)
sns.countplot(data=df, x='subject', hue='level')

#### E. In each year what is the total profit made by each subject?

In [None]:
# Total 'Profit' for every (Year, subject) pair, returned as a flat table.
profit_by_year_subject = df.groupby(['Year', 'subject'])['Profit'].sum()
profit_by_year_subject.reset_index()

In [None]:
# Total 'Profit' per subject, split by year.
plt.figure(figsize=(15,8))
# barplot aggregates with the mean by default; the question asks for totals,
# so sum explicitly and drop the error bars, which have no meaning for a total.
sns.barplot(data=df, x='subject', y='Profit', hue='Year',
            estimator='sum', errorbar=None)

In [None]:
# Pie chart of 'Profit' by subject.
# NOTE(review): color='Year' is passed although slices are keyed by subject — confirm this is intended.
profit_pie = px.pie(df, names='subject', values='Profit', color='Year')
profit_pie

##### F. Regarding price category, what is the total profit made by each subject?

In [None]:
# Total 'Profit' per subject, split by price category.
plt.figure(figsize=(15,8))
# barplot aggregates with the mean by default; the question asks for totals,
# so sum explicitly and drop the error bars, which have no meaning for a total.
sns.barplot(data=df, x='subject', y='Profit', hue='Price_Category',
            estimator='sum', errorbar=None)

##### G. Regarding duration category, what is the total profit made by each subject?

In [None]:
# Total 'Profit' per subject, split by duration category.
plt.figure(figsize=(15,8))
# barplot aggregates with the mean by default; the question asks for totals,
# so sum explicitly and drop the error bars, which have no meaning for a total.
sns.barplot(data=df, x='subject', y='Profit', hue='Duration_category',
            estimator='sum', errorbar=None)

##### H. Regarding level, what is the total profit made by each subject?

In [None]:
# Total 'Profit' per subject, split by level.
plt.figure(figsize=(15,8))
# barplot aggregates with the mean by default; the question asks for totals,
# so sum explicitly and drop the error bars, which have no meaning for a total.
sns.barplot(data=df, x='subject', y='Profit', hue='level',
            estimator='sum', errorbar=None)

##### I. Regarding each year, how many subscribers in each subject?

In [None]:
# Total subscribers per subject, split by year.
plt.figure(figsize=(15,8))
# barplot aggregates with the mean by default; the question asks "how many",
# so sum explicitly and drop the error bars, which have no meaning for a total.
sns.barplot(data=df, x='subject', y='num_subscribers', hue='Year',
            estimator='sum', errorbar=None)

##### J. Regarding price category, how many subscribers in each subject?

In [None]:
# Total subscribers per subject, split by price category.
plt.figure(figsize=(15,8))
# barplot aggregates with the mean by default; the question asks "how many",
# so sum explicitly and drop the error bars, which have no meaning for a total.
sns.barplot(data=df, x='subject', y='num_subscribers', hue='Price_Category',
            estimator='sum', errorbar=None)

##### K. Regarding duration category, how many subscribers in each subject?

In [None]:
# Total subscribers per subject, split by duration category.
plt.figure(figsize=(15,8))
# barplot aggregates with the mean by default; the question asks "how many",
# so sum explicitly and drop the error bars, which have no meaning for a total.
sns.barplot(data=df, x='subject', y='num_subscribers', hue='Duration_category',
            estimator='sum', errorbar=None)

##### L. Regarding level, how many subscribers in each subject?

In [None]:

# Total subscribers per subject, split by level.
plt.figure(figsize=(15,8))
# barplot aggregates with the mean by default; the question asks "how many",
# so sum explicitly and drop the error bars, which have no meaning for a total.
sns.barplot(data=df, x='subject', y='num_subscribers', hue='level',
            estimator='sum', errorbar=None)

In [None]:
# Summary statistics of the 'Profit' column.
df['Profit'].describe()

## Time Series Analysis

### published_timestamp

In [None]:
# Two random rows of the timestamp-indexed frame.
df.sample(2)

In [None]:
# The frame's index, which was set from the publication timestamps.
df.index

In [None]:
# Select the rows whose index falls in 2017 (partial-string lookup) and show the first two.
df.loc['2017'].head(2)

In [None]:
# Year component of every index timestamp.
df.index.year

In [None]:
# Rows published on 24-25 January 2017. Slicing by date strings needs a
# monotonic DatetimeIndex; on an unsorted index pandas raises KeyError when a
# bound is not an exact index value, so the slice is taken on a sorted view.
df.sort_index().loc['2017-1-24' : '2017-1-25']

### Profit

##### What is the total profit in each year?

In [None]:
# Total 'Profit' per calendar year. The column is selected before resampling
# so timestamp, categorical and text columns are never passed to sum().
df['Profit'].resample("Y").sum()

In [None]:
# Yearly 'Profit' totals, computed once on the 'Profit' column only so
# non-numeric columns are never summed.
yearly_profit = df['Profit'].resample("Y").sum()
plt.figure(figsize=(15,8))
plt.xticks(rotation=45)
sns.barplot(x=yearly_profit.index.year,
            y=yearly_profit.values)

In [None]:
# Month name of every index timestamp.
df.index.month_name()

In [None]:
# Year component of every index timestamp.
df.index.year

In [None]:
# Monthly 'Profit' totals for 2013, computed once on the 'Profit' column only
# so non-numeric columns are never summed.
monthly_profit_2013 = df['Profit'].loc['2013'].resample("MS").sum()
plt.figure(figsize=(15,8))
plt.xticks(rotation=45)
# sort=False keeps the months in calendar order rather than letting the
# month-name strings be reordered along the x axis.
sns.lineplot(x = monthly_profit_2013.index.month_name(),
             y = monthly_profit_2013.values,
             sort = False)

In [None]:
# Persist the cleaned frame. The index repeats the 'published_timestamp' column
# (the column was kept when the index was set), so writing the index as well
# would give the CSV two columns with the same header.
df.to_csv('Udemy_Courses_Cleaned.csv', index=False)