# Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import matplotlib

# Notebook-wide display / plotting configuration.
pd.set_option('display.max_columns', None)  # show every column when a frame is displayed
sns.set_style('whitegrid')
# NOTE(review): this silences every warning, including library deprecation warnings.
warnings.filterwarnings('ignore')

# Colour palette as hex codes.
pal = [
    '#FEEFE5',
    '#F8C8B0',
    '#E45011',
    '#87280E',
    '#0F0608',
]

# Read the clicked-ads dataset straight from its GitHub raw URL.
DATA_URL = 'https://raw.githubusercontent.com/mchosasih99/dataset/main/Clicked%20Ads%20Dataset.csv'
df = pd.read_csv(DATA_URL)

In [2]:
# Print each column's dtype and non-null count to see which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                1000 non-null   int64  
 1   Daily Time Spent on Site  987 non-null    float64
 2   Age                       1000 non-null   int64  
 3   Area Income               987 non-null    float64
 4   Daily Internet Usage      989 non-null    float64
 5   Male                      997 non-null    object 
 6   Timestamp                 1000 non-null   object 
 7   Clicked on Ad             1000 non-null   object 
 8   city                      1000 non-null   object 
 9   province                  1000 non-null   object 
 10  category                  1000 non-null   object 
dtypes: float64(3), int64(2), object(6)
memory usage: 86.1+ KB


In [3]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0.1,Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage
count,1000.0,987.0,1000.0,987.0,989.0
mean,499.5,64.929524,36.009,384864700.0,179.86362
std,288.819436,15.844699,8.785562,94079990.0,43.870142
min,0.0,32.6,19.0,97975500.0,104.78
25%,249.75,51.27,29.0,328633000.0,138.71
50%,499.5,68.11,35.0,399068300.0,182.65
75%,749.25,78.46,42.0,458355400.0,218.79
max,999.0,91.43,61.0,556393600.0,267.01


In [4]:
# Summary (count, unique values, most frequent value and its frequency) for the object-dtype columns.
df.describe(include='object')

Unnamed: 0,Male,Timestamp,Clicked on Ad,city,province,category
count,997,1000,1000,1000,1000,1000
unique,2,997,2,30,16,10
top,Perempuan,5/26/2016 15:40,No,Surabaya,Daerah Khusus Ibukota Jakarta,Otomotif
freq,518,2,500,64,253,112


In [5]:
# Preview the first five rows.
df.head()

Unnamed: 0.1,Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Timestamp,Clicked on Ad,city,province,category
0,0,68.95,35,432837300.0,256.09,Perempuan,3/27/2016 0:53,No,Jakarta Timur,Daerah Khusus Ibukota Jakarta,Furniture
1,1,80.23,31,479092950.0,193.77,Laki-Laki,4/4/2016 1:39,No,Denpasar,Bali,Food
2,2,69.47,26,418501580.0,236.5,Perempuan,3/13/2016 20:35,No,Surabaya,Jawa Timur,Electronic
3,3,74.15,29,383643260.0,245.89,Laki-Laki,1/10/2016 2:31,No,Batam,Kepulauan Riau,House
4,4,68.37,35,517229930.0,225.58,Perempuan,6/3/2016 3:36,No,Medan,Sumatra Utara,Finance


In [6]:
# Drop 'Unnamed: 0', which only repeats the row number (0-999 in the summary above).
# The column is named with `columns=` because passing `axis` positionally
# (`df.drop(label, 1)`) is no longer accepted by pandas 2.0+.
df = df.drop(columns='Unnamed: 0')

## Modifikasi kolom datetime

In [7]:
# Parse the raw timestamp strings once and derive every calendar feature from the parsed values.
timestamp = pd.to_datetime(df['Timestamp'])

df['Time'] = timestamp.dt.time                      # time of day, split into parts in the next cell
df['hari'] = timestamp.dt.day                       # day of month
df['hari_dalam_minggu'] = timestamp.dt.day_of_week  # Monday=0 ... Sunday=6
# `Series.dt.week` no longer exists in pandas 2.0+; `isocalendar().week` is its replacement.
# isocalendar() returns the nullable UInt32 dtype, so cast back to a plain integer column.
df['minggu'] = timestamp.dt.isocalendar().week.astype('int64')
df['bulan'] = timestamp.dt.month                    # month number, 1-12

# The raw timestamp is not needed once its parts have been extracted.
df = df.drop(columns=['Timestamp'])

In [8]:
# Break the text form of each time value on ':' into hour / minute / second columns.
time_parts = df['Time'].astype(str).str.split(':', expand=True)
time_parts.columns = ['jam', 'menit', 'detik']

# Only the hour is used numerically; minute and second stay as strings.
time_parts['jam'] = time_parts['jam'].astype(int)

df[['jam', 'menit', 'detik']] = time_parts

In [9]:
# Label each row with the part of day in which the ad was shown, based on the hour ('jam').
# Each entry is (label, first hour, last hour exclusive); together they cover hours 0-23
# exactly once, and 'malam' (night) wraps around midnight so it appears twice.
day_parts = [
    ('malam', 0, 4),
    ('pagi', 4, 11),
    ('siang', 11, 15),
    ('sore', 15, 19),
    ('malam', 19, 24),
]

# Expand the ranges into a direct hour -> label lookup.
hour_to_label = {
    hour: label
    for label, start, stop in day_parts
    for hour in range(start, stop)
}

# A plain lookup avoids np.select's implicit integer default (0), which cannot be
# combined with string choices on recent NumPy versions. An hour outside 0-23 would
# show up as NaN instead of being silently labelled.
df['waktu_iklan'] = df['jam'].map(hour_to_label)


In [10]:
# Drop the intermediate time columns; they are not used as features themselves.
# `columns=` replaces the positional `axis` argument, which pandas 2.0+ rejects.
df = df.drop(columns=['jam', 'detik', 'menit', 'Time'])

## Mengisi Null Value

In [11]:
# Impute missing values by assigning the filled column back to the frame.
# `df[col].fillna(..., inplace=True)` works on a temporary Series and is not guaranteed
# to update `df` (it is a no-op under pandas copy-on-write).

# Numeric columns: fill with the column mean.
# NOTE(review): the mean is computed over all rows, before the train/test split.
for numeric_col in ['Daily Time Spent on Site', 'Daily Internet Usage']:
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].mean())

# Gender: fill with 'Perempuan', the most frequent value in the summary table above.
df['Male'] = df['Male'].fillna('Perempuan')

In [12]:
# Fill missing 'Area Income' with the median income of the row's city.

# transform('median') returns one value per row (that row's city median), indexed like
# `df`, so fillna can align on the index without any merge / rename / append round trip.
# (`DataFrame.append`, used previously, no longer exists in pandas 2.0+.)
city_income_median = df.groupby('city')['Area Income'].transform('median')

# Build the cleaned frame without touching `df`; rows keep their original order.
df_clean = df.copy()
df_clean['Area Income'] = df_clean['Area Income'].fillna(city_income_median)

# Remove any row that still has a null (e.g. a city with no known income at all),
# then renumber the rows from 0.
df_clean = df_clean.dropna().reset_index(drop=True)

In [13]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
# The result is assigned back because an in-place replace on `df_clean[col]` acts on a
# temporary Series. astype('int64') fails loudly if a value other than 'Yes'/'No' is
# present (map turns unknown values into NaN).
df_clean['Clicked on Ad'] = (
    df_clean['Clicked on Ad']
    .map({'Yes': 1, 'No': 0})
    .astype('int64')
)

In [14]:
# Rename the gender column from 'Male' to 'jenis_kelamin'.
df_clean = df_clean.rename(columns={'Male': 'jenis_kelamin'})

# Data Preprocessing

## Feature Encoding

In [15]:
# Work on a copy so the encoding steps do not modify df_clean.
df_encode = df_clean.copy()

In [16]:
# Inspect the cardinality (unique count) of each object-dtype column before choosing an encoding.
df_encode.describe(include='object')

Unnamed: 0,jenis_kelamin,city,province,category,waktu_iklan
count,1000,1000,1000,1000,1000
unique,2,30,16,10,4
top,Perempuan,Surabaya,Daerah Khusus Ibukota Jakarta,Otomotif,malam
freq,521,64,253,112,379


Kolom `city` memiliki 30 unique value (lihat tabel di atas). One-hot encoding pada kolom ini akan menambah 30 kolom baru untuk hanya 1.000 baris data, sehingga fitur tersebut dihapus untuk mencegah overfitting.

In [17]:
# Drop the high-cardinality 'city' column.
# `columns=` replaces the positional `axis` argument, which pandas 2.0+ rejects.
df_encode = df_encode.drop(columns='city')

In [18]:
# Label-encode gender: 'Perempuan' -> 0, 'Laki-Laki' -> 1.
# The result is assigned back because an in-place replace through attribute access
# (`df_encode.jenis_kelamin.replace(..., inplace=True)`) acts on a temporary Series.
# astype('int64') fails loudly if an unexpected category is present (map yields NaN for it).
df_encode['jenis_kelamin'] = (
    df_encode['jenis_kelamin']
    .map({'Perempuan': 0, 'Laki-Laki': 1})
    .astype('int64')
)

In [19]:
# One-hot encode the remaining categorical columns.
onehot = ['province', 'category', 'waktu_iklan']

# get_dummies(columns=...) removes each listed column and appends its indicator
# columns, named '<column>_<value>', at the end of the frame in the order listed.
df_encode = pd.get_dummies(df_encode, columns=onehot)

## Train Test Split

In [20]:
# Separate the target column from the feature matrix.
target_col = 'Clicked on Ad'
y = df_encode[target_col]
X = df_encode.drop(columns=[target_col])

# Report both shapes as a sanity check.
for part in (X, y):
    print(part.shape)

(1000, 39)
(1000,)


In [21]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Feature Transformation

In [22]:
# Log-transform the continuous features.
transform = ['Daily Time Spent on Site', 'Age',
             'Area Income', 'Daily Internet Usage']

# Take explicit copies so the assignments below act on independent frames and do not
# raise SettingWithCopy warnings.
X_train = X_train.copy()
X_test = X_test.copy()

# Assign whole columns (`frame[cols] = ...`) instead of `frame.loc[:, cols] = ...`.
# The .loc form tries to write the float results into the existing columns in place,
# which is deprecated for the integer 'Age' column; column assignment simply replaces
# the columns with new float ones.
X_train[transform] = np.log(X_train[transform])
X_test[transform] = np.log(X_test[transform])

## Feature Scaling (Min-Max Normalization)

In [23]:
# Rescale the features listed in `transform` to the [0, 1] range.
from sklearn.preprocessing import MinMaxScaler
ss = MinMaxScaler()

# Learn each feature's min and max from the training set only.
X_train[transform] = ss.fit_transform(X_train[transform])

# Apply the training min/max to the test set with transform(), not fit_transform().
# Re-fitting on the test data would scale train and test differently and leak test-set
# statistics into preprocessing. Test values outside the training range may therefore
# fall slightly outside [0, 1].
X_test[transform] = ss.transform(X_test[transform])

In [24]:
# Confirm the training set's dimensions as (rows, features).
X_train.shape

(670, 39)