In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.insert(0, '../scripts/')

Import Packages

In [3]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from data_cleaner import DataCleaner
%matplotlib inline

Initalize configs

In [4]:
pwd = Path().cwd()
root_dir = pwd.parent
data_dir = root_dir / "data"
file_path = data_dir / "Week1_challenge_data_source(CSV).csv"

In [5]:
df = pd.read_csv(file_path, na_values=["undefined"], keep_default_na=True, parse_dates=["Start", "End"])

In [6]:
df.head()

Unnamed: 0,Bearer Id,Start,Start ms,End,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Last Location Name,...,Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
0,1.311448e+19,2019-04-04 12:01:00,770.0,2019-04-25 14:35:00,662.0,1823652.0,208201400000000.0,33664960000.0,35521210000000.0,9.16456699548519E+015,...,15854611.0,2501332.0,8198936.0,9656251.0,278082303.0,14344150.0,171744450.0,8814393.0,36749741.0,308879636.0
1,1.311448e+19,2019-04-09 13:04:00,235.0,2019-04-25 08:15:00,606.0,1365104.0,208201900000000.0,33681850000.0,35794010000000.0,L77566A,...,20247395.0,19111729.0,18338413.0,17227132.0,608750074.0,1170709.0,526904238.0,15055145.0,53800391.0,653384965.0
2,1.311448e+19,2019-04-09 17:42:00,1.0,2019-04-25 11:58:00,652.0,1361762.0,208200300000000.0,33760630000.0,35281510000000.0,D42335A,...,19725661.0,14699576.0,17587794.0,6163408.0,229584621.0,395630.0,410692588.0,4215763.0,27883638.0,279807335.0
3,1.311448e+19,2019-04-10 00:31:00,486.0,2019-04-25 07:36:00,171.0,1321509.0,208201400000000.0,33750340000.0,35356610000000.0,T21824A,...,21388122.0,15146643.0,13994646.0,1097942.0,799538153.0,10849722.0,749039933.0,12797283.0,43324218.0,846028530.0
4,1.311448e+19,2019-04-12 20:10:00,565.0,2019-04-25 10:40:00,954.0,1089009.0,208201400000000.0,33699800000.0,35407010000000.0,D88865A,...,15259380.0,18962873.0,17124581.0,415218.0,527707248.0,3529801.0,550709500.0,13910322.0,38542814.0,569138589.0


In [8]:
cleaner = DataCleaner()
cleaner.missing_percentage(df)

Unnamed: 0,column_name,percent_missing
0,Bearer Id,0.660662
1,Start,0.000667
2,Start ms,0.000667
3,End,0.000667
4,End ms,0.000667
5,Dur. (ms),0.000667
6,IMSI,0.379997
7,MSISDN/Number,0.710662
8,IMEI,0.381331
9,Last Location Name,0.768662


## Cleaning 
---

- remove duplicates
- fill missing values
- remove irrelevant missing values
- change to appropriate datatypes

In [9]:
clean_data = df.copy()

In [10]:
clean_data.shape

(150001, 55)

In [11]:
clean_data.dropna(subset=["Bearer Id"], inplace=True)

In [12]:
clean_data.dropna(subset=["MSISDN/Number"], inplace=True)

In [13]:
clean_data["Handset Type"] = clean_data["Handset Type"].fillna("Unknown")
clean_data["Handset Manufacturer"] = clean_data["Handset Manufacturer"].fillna("Unknown")
clean_data["Last Location Name"] = clean_data["Last Location Name"].fillna("Unknown")

In [14]:
cleaner.missing_percentage(clean_data)

Unnamed: 0,column_name,percent_missing
0,Bearer Id,0.0
1,Start,0.0
2,Start ms,0.0
3,End,0.0
4,End ms,0.0
5,Dur. (ms),0.0
6,IMSI,0.0
7,MSISDN/Number,0.0
8,IMEI,0.0
9,Last Location Name,0.0


In [19]:
clean_data.fillna(clean_data.mean(numeric_only=True).round(), inplace=True)

In [20]:
cleaner.missing_percentage(clean_data)

Unnamed: 0,column_name,percent_missing
0,Bearer Id,0.0
1,Start,0.0
2,Start ms,0.0
3,End,0.0
4,End ms,0.0
5,Dur. (ms),0.0
6,IMSI,0.0
7,MSISDN/Number,0.0
8,IMEI,0.0
9,Last Location Name,0.0


In [21]:
# change columns to string to avoid numerical calculations on them
clean_data[["IMSI", "MSISDN/Number", "IMEI", "Last Location Name"]] = clean_data[["IMSI", "MSISDN/Number", "IMEI", "Last Location Name"]].astype(str)

In [22]:
clean_data.shape

(148506, 55)

In [23]:
# fix column naming issue on dataset
clean_data.rename(columns={"Dur. (ms)": "Dur. (s)", "Dur. (ms).1": "Dur. (ms)"}, inplace=True)

In [24]:
clean_data.head()

Unnamed: 0,Bearer Id,Start,Start ms,End,End ms,Dur. (s),IMSI,MSISDN/Number,IMEI,Last Location Name,...,Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
0,1.311448e+19,2019-04-04 12:01:00,770.0,2019-04-25 14:35:00,662.0,1823652.0,208201448079117.0,33664962239.0,35521209507511.0,9.16456699548519E+015,...,15854611.0,2501332.0,8198936.0,9656251.0,278082303.0,14344150.0,171744450.0,8814393.0,36749741.0,308879636.0
1,1.311448e+19,2019-04-09 13:04:00,235.0,2019-04-25 08:15:00,606.0,1365104.0,208201909211140.0,33681854413.0,35794009006359.0,L77566A,...,20247395.0,19111729.0,18338413.0,17227132.0,608750074.0,1170709.0,526904238.0,15055145.0,53800391.0,653384965.0
2,1.311448e+19,2019-04-09 17:42:00,1.0,2019-04-25 11:58:00,652.0,1361762.0,208200314458056.0,33760627129.0,35281510359387.0,D42335A,...,19725661.0,14699576.0,17587794.0,6163408.0,229584621.0,395630.0,410692588.0,4215763.0,27883638.0,279807335.0
3,1.311448e+19,2019-04-10 00:31:00,486.0,2019-04-25 07:36:00,171.0,1321509.0,208201402342131.0,33750343200.0,35356610164913.0,T21824A,...,21388122.0,15146643.0,13994646.0,1097942.0,799538153.0,10849722.0,749039933.0,12797283.0,43324218.0,846028530.0
4,1.311448e+19,2019-04-12 20:10:00,565.0,2019-04-25 10:40:00,954.0,1089009.0,208201401415120.0,33699795932.0,35407009745539.0,D88865A,...,15259380.0,18962873.0,17124581.0,415218.0,527707248.0,3529801.0,550709500.0,13910322.0,38542814.0,569138589.0


In [25]:
# checking for entire row of duplicates
clean_data.duplicated().sum()

0

In [26]:
clean_data.to_csv(data_dir / "clean_data.csv", index=False)