# 📊 Dataset Information

**Note:** The dataset for this project is **may not included in this repository** to keep the repository lightweight.  

You can download the dataset from **Kaggle** using the link below:  

[Download the Bi_Intro Dataset](https://www.kaggle.com/datasets/walekhwatlphilip/intro-to-data-cleaning-eda-and-machine-learning)  

Please make sure to place the downloaded CSV file in the same directory as this notebook before running any code.


# Importing

## Import Libraray

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Import CSV And convert to DataFrame

In [2]:
df = pd.read_csv('bi.csv', encoding='latin1')

# PreProcessing

## Details of Dataset

### Frist five row

In [3]:
df.head(20)

Unnamed: 0,fNAME,lNAME,Age,gender,country,residence,entryEXAM,prevEducation,studyHOURS,Python,DB
0,Christina,Binger,44,Female,Norway,Private,72,Masters,158,59.0,55
1,Alex,Walekhwa,60,M,Kenya,Private,79,Diploma,150,60.0,75
2,Philip,Leo,25,Male,Uganda,Sognsvann,55,HighSchool,130,74.0,50
3,Shoni,Hlongwane,24,F,Rsa,Sognsvann,40,High School,120,,44
4,Maria,Kedibone,23,Female,South Africa,Sognsvann,65,High School,122,91.0,80
5,Hannah,Hansen,25,female,Norge,BI Residence,66,High School,130,88.0,59
6,Ole,Johansen,27,Male,Norway,BI-Residence,90,Bachelors,156,80.0,91
7,Lars,Olsen,29,Male,norway,BIResidence,89,Barrrchelors,160,85.0,60
8,Bjørn,Larsen,31,Male,Norway,BI Residence,88,Bachelors,156,80.0,89
9,Sofie,Jensen,33,Female,Denmark,BI_Residence,85,Bachelors,160,83.0,90


### last Five row

In [4]:
df.tail()

Unnamed: 0,fNAME,lNAME,Age,gender,country,residence,entryEXAM,prevEducation,studyHOURS,Python,DB
72,Clara,Bernard,43,Female,France,Private,80,Bachelors,150,75.0,43
73,Julian,Nielsen,31,Male,Denmark,Sognsvann,90,Masters,158,84.0,83
74,Sophie,Brown,33,Female,UK,Sognsvann,96,Masters,158,85.0,90
75,Leon,Bauer,35,Male,Germany,Sognsvann,90,Masters,160,87.0,74
76,Mohammed,Salim,31,Male,Somali,Sognsvann,35,Masters,144,72.0,90


### Shape of our dataset

In [5]:
df.shape

(77, 11)

### List out all columns

In [6]:
df.columns

Index(['fNAME', 'lNAME', 'Age', 'gender', 'country', 'residence', 'entryEXAM',
       'prevEducation', 'studyHOURS', 'Python', 'DB'],
      dtype='object')

### Datatype of each columns

In [7]:
df.dtypes

fNAME             object
lNAME             object
Age                int64
gender            object
country           object
residence         object
entryEXAM          int64
prevEducation     object
studyHOURS         int64
Python           float64
DB                 int64
dtype: object

### Information of all Columns

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   fNAME          77 non-null     object 
 1   lNAME          77 non-null     object 
 2   Age            77 non-null     int64  
 3   gender         77 non-null     object 
 4   country        77 non-null     object 
 5   residence      77 non-null     object 
 6   entryEXAM      77 non-null     int64  
 7   prevEducation  77 non-null     object 
 8   studyHOURS     77 non-null     int64  
 9   Python         75 non-null     float64
 10  DB             77 non-null     int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 6.7+ KB


### Check Null Value

In [9]:
df.isnull().sum()

fNAME            0
lNAME            0
Age              0
gender           0
country          0
residence        0
entryEXAM        0
prevEducation    0
studyHOURS       0
Python           2
DB               0
dtype: int64

### Handle Null value

In [10]:
df['Python'] = df['Python'].fillna(df['Python'].mean())
df.isnull().sum()

fNAME            0
lNAME            0
Age              0
gender           0
country          0
residence        0
entryEXAM        0
prevEducation    0
studyHOURS       0
Python           0
DB               0
dtype: int64

### Check Duplicate Value

In [11]:
df.duplicated().sum()

np.int64(0)

### summary of the dataset

In [12]:
df.describe()

Unnamed: 0,Age,entryEXAM,studyHOURS,Python,DB
count,77.0,77.0,77.0,77.0,77.0
mean,35.233766,76.753247,149.714286,75.853333,69.467532
std,10.310822,16.475784,12.743272,15.206208,17.033701
min,21.0,28.0,114.0,15.0,30.0
25%,27.0,69.0,144.0,72.0,56.0
50%,33.0,80.0,156.0,81.0,71.0
75%,42.0,90.0,158.0,85.0,83.0
max,71.0,98.0,160.0,91.0,100.0


## Data Cleaning

### Clean Column Names

In [13]:
df.columns

Index(['fNAME', 'lNAME', 'Age', 'gender', 'country', 'residence', 'entryEXAM',
       'prevEducation', 'studyHOURS', 'Python', 'DB'],
      dtype='object')

In [14]:
import re

def clean_column(name):
    words = re.findall(r'[A-Z]?[a-z]+|[A-Z]+(?![a-z])', name)
    return ''.join([w.capitalize() for w in words])

df.columns = [clean_column(col) for col in df.columns]

df.columns

Index(['FName', 'LName', 'Age', 'Gender', 'Country', 'Residence', 'EntryExam',
       'PrevEducation', 'StudyHours', 'Python', 'Db'],
      dtype='object')

### Standardize Country Names

In [15]:
df['Country'].unique()

array(['Norway', 'Kenya', 'Uganda', 'Rsa', 'South Africa', 'Norge',
       'norway', 'Denmark', 'Netherlands', 'Italy', 'Spain', 'UK',
       'Somali', 'Nigeria', 'Germany', 'France'], dtype=object)

In [16]:
country_mapping  = {
    'Rsa': 'Russia',
    'Norge': 'Norway',
    'norway': 'Norway',
    'UK': 'United Kingdom',
    'Somali': 'Somalia'
}

df["Country"] = df["Country"].replace(country_mapping)
df['Country'].unique()

array(['Norway', 'Kenya', 'Uganda', 'Russia', 'South Africa', 'Denmark',
       'Netherlands', 'Italy', 'Spain', 'United Kingdom', 'Somalia',
       'Nigeria', 'Germany', 'France'], dtype=object)

### Standardize Education Leavl

In [17]:
df['PrevEducation'].unique()

array(['Masters', 'Diploma', 'HighSchool', 'High School', 'Bachelors',
       'Barrrchelors', 'diploma', 'DIPLOMA', 'Diplomaaa', 'Doctorate'],
      dtype=object)

In [18]:
education_mapping = {
    'HighSchool': 'High School',
    'High School': 'High School',
    'Bachelors': 'Bachelors',
    'Barrrchelors': 'Bachelors',
    'Masters': 'Masters',
    'Diploma': 'Diploma',
    'diploma': 'Diploma',
    'DIPLOMA': 'Diploma',
    'Diplomaaa': 'Diploma',
    'Doctorate': 'Doctorate'
}
df['PrevEducation'] = df['PrevEducation'].replace(education_mapping)
print(df['PrevEducation'].unique())

['Masters' 'Diploma' 'High School' 'Bachelors' 'Doctorate']


### Standardize Gender Values

In [19]:
df['Gender'].unique()

array(['Female', 'M', 'Male', 'F', 'female', 'male'], dtype=object)

In [20]:
gender_mapping = {
    'F': 'Female',
    'Female': 'Female',
    'female': 'Female',
    'M': 'Male',
    'Male': 'Male',
    'male': 'Male'
}

df['Gender'] = df['Gender'].replace(gender_mapping)

df['Gender'].unique()

array(['Female', 'Male'], dtype=object)

### Standardize Residence Values

In [21]:
df['Residence'].unique()

array(['Private', 'Sognsvann', 'BI Residence', 'BI-Residence',
       'BIResidence', 'BI_Residence'], dtype=object)

In [22]:
residence_mapping = {
    'Private': 'Private',
    'Sognsvann': 'Sognsvann',
    'BI Residence': 'BI Residence',
    'BI-Residence': 'BI Residence',
    'BIResidence': 'BI Residence',
    'BI_Residence': 'BI Residence'
}
df['Residence'] = df['Residence'].replace(residence_mapping)
print(df['Residence'].unique())

['Private' 'Sognsvann' 'BI Residence']


### Full Name Column

In [23]:
df.insert(0, 'Name', df['FName'] + ' ' + df['LName'])

df = df.drop(columns=['FName', 'LName'])

df['Name']

0     Christina Binger
1        Alex Walekhwa
2           Philip Leo
3      Shoni Hlongwane
4       Maria Kedibone
            ...       
72       Clara Bernard
73      Julian Nielsen
74        Sophie Brown
75          Leon Bauer
76      Mohammed Salim
Name: Name, Length: 77, dtype: object

### Needed Columns formatted to two decimal places

In [24]:
df['EntryExam'] = df['EntryExam'].map('{:.2f}'.format)
df['StudyHours'] = df['StudyHours'].map('{:.2f}'.format)
df['Python'] = df['Python'].map('{:.2f}'.format)
df['Db'] = df['Db'].map('{:.2f}'.format)

# Final Dataset

## First 5 row

In [25]:
df.head()

Unnamed: 0,Name,Age,Gender,Country,Residence,EntryExam,PrevEducation,StudyHours,Python,Db
0,Christina Binger,44,Female,Norway,Private,72.0,Masters,158.0,59.0,55.0
1,Alex Walekhwa,60,Male,Kenya,Private,79.0,Diploma,150.0,60.0,75.0
2,Philip Leo,25,Male,Uganda,Sognsvann,55.0,High School,130.0,74.0,50.0
3,Shoni Hlongwane,24,Female,Russia,Sognsvann,40.0,High School,120.0,75.85,44.0
4,Maria Kedibone,23,Female,South Africa,Sognsvann,65.0,High School,122.0,91.0,80.0


## Last 5 row

In [26]:
df.tail()

Unnamed: 0,Name,Age,Gender,Country,Residence,EntryExam,PrevEducation,StudyHours,Python,Db
72,Clara Bernard,43,Female,France,Private,80.0,Bachelors,150.0,75.0,43.0
73,Julian Nielsen,31,Male,Denmark,Sognsvann,90.0,Masters,158.0,84.0,83.0
74,Sophie Brown,33,Female,United Kingdom,Sognsvann,96.0,Masters,158.0,85.0,90.0
75,Leon Bauer,35,Male,Germany,Sognsvann,90.0,Masters,160.0,87.0,74.0
76,Mohammed Salim,31,Male,Somalia,Sognsvann,35.0,Masters,144.0,72.0,90.0
