## Step 1: Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the courses CSV into the working DataFrame used by the rest of the notebook
df = pd.read_csv(filepath_or_buffer="data/bw_courses - Sheet1.csv")

# Preview the first five rows
df.head(n=5)

Unnamed: 0,Course No,Course Title,Course Description,Released Languages,Who This Course is For
0,1,Course on Financial Freedom,Achieve financial freedom by creating a budget...,6711202124,"Individuals of all ages, from young adults jus..."
1,2,Mutual Funds Course,Invest in mutual funds easily and reap the ben...,6711202124,Beginner investors looking to learn about mutu...
2,3,Course on Credit Cards,Utilize the benefits of credit cards by obtain...,6711202124,Adults who are new to credit cards and want to...
3,4,Course on Credit Score,"Elevate your credit score, elevate your life w...",6711202124,Individuals seeking to understand their credit...
4,5,Course on Stock Market,Transform yourself into a Savvy Stock Market I...,6711202124,"A beginner's guide to the stock market, this c..."


## Step 2: Dataset Overview

In [4]:
# Report the dataset dimensions: row count first, then column count
print("Rows:", len(df.index))
print("Columns:", len(df.columns))

Rows: 100
Columns: 5


In [5]:
# List the column labels as a plain Python list
print("Columns:", list(df.columns))

Columns: ['Course No', 'Course Title', 'Course Description', 'Released Languages', 'Who This Course is For']


In [6]:
# Summarize the frame: per-column non-null counts and dtypes, plus memory usage.
# A non-null count below the number of entries flags a column with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Course No               100 non-null    int64 
 1   Course Title            100 non-null    object
 2   Course Description      100 non-null    object
 3   Released Languages      100 non-null    object
 4   Who This Course is For  98 non-null     object
dtypes: int64(1), object(4)
memory usage: 4.0+ KB


In [7]:
# Number of missing (NaN/None) entries in each column
df.isna().sum()

Course No                 0
Course Title              0
Course Description        0
Released Languages        0
Who This Course is For    2
dtype: int64

In [9]:
# Keep only the rows whose "Who This Course is For" entry is missing
missing_rows = df.loc[df['Who This Course is For'].isna()]

# Show the affected courses
missing_rows

Unnamed: 0,Course No,Course Title,Course Description,Released Languages,Who This Course is For
67,68,Course on Learning Yoga from Home,"Transform your mind, body, and soul with the b...",24,
83,84,Pomegranate Farming Course,Earn Substantial Income Per Acre With Pomegran...,21,


Only the `Who This Course is For` column has missing values (2 rows: courses 68 and 84). Rather than dropping these rows and losing the course data, we fill the missing entries with the placeholder "Not specified".

In [10]:
# Replace the missing audience entries with the placeholder "Not specified"
# so the affected courses stay in the dataset instead of being dropped.
df['Who This Course is For'] = df['Who This Course is For'].fillna("Not specified")

In [12]:
# Re-display the rows that were missing "Who This Course is For" before the fill.
# They are looked up by the index labels captured in `missing_rows` (taken before
# the fill) rather than by matching the placeholder text, so a course whose source
# data literally contained "Not specified" cannot be mistaken for a filled row.
fixed_rows = df.loc[missing_rows.index]

fixed_rows

Unnamed: 0,Course No,Course Title,Course Description,Released Languages,Who This Course is For
67,68,Course on Learning Yoga from Home,"Transform your mind, body, and soul with the b...",24,Not specified
83,84,Pomegranate Farming Course,Earn Substantial Income Per Acre With Pomegran...,21,Not specified


## Step 3: Uniqueness & Duplicates

In [8]:
# Count rows that exactly repeat an earlier row (all columns compared)
print("Duplicates:", df.duplicated(keep="first").sum())

# Distinct values per column; missing values are not counted
df.nunique(dropna=True)

Duplicates: 0


Course No                 100
Course Title              100
Course Description        100
Released Languages         16
Who This Course is For     98
dtype: int64