## Train Sample Dataset

In [None]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer

In [None]:
# Read train_sample.csv (path relative to the working directory) into a DataFrame
# and preview its first five rows
df = pd.read_csv(filepath_or_buffer='train_sample.csv')
df.head(n=5)

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,89489,3,1,13,379,2017-11-06 15:13:23,,0
1,204158,35,1,13,21,2017-11-06 15:41:07,2017-11-07 08:17:19,1
2,3437,6,1,13,459,2017-11-06 15:42:32,,0
3,167543,3,1,13,379,2017-11-06 15:56:17,,0
4,147509,3,1,13,379,2017-11-06 15:57:01,,0


In [None]:
## Summarize the frame: index range, column names, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2300561 entries, 0 to 2300560
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   ip               int64 
 1   app              int64 
 2   device           int64 
 3   os               int64 
 4   channel          int64 
 5   click_time       object
 6   attributed_time  object
 7   is_attributed    int64 
dtypes: int64(6), object(2)
memory usage: 140.4+ MB


In [8]:
## Summary statistics of the numeric columns, rounded to two decimals
df.describe().round(2)

Unnamed: 0,ip,app,device,os,channel,is_attributed
count,2300561.0,2300561.0,2300561.0,2300561.0,2300561.0,2300561.0
mean,105605.22,15.51,23.48,22.98,256.24,0.2
std,83393.42,21.09,250.19,54.84,129.11,0.4
min,1.0,0.0,0.0,0.0,0.0,0.0
25%,43837.0,3.0,1.0,13.0,137.0,0.0
50%,87498.0,12.0,1.0,18.0,245.0,0.0
75%,138745.0,18.0,1.0,20.0,349.0,0.0
max,364778.0,768.0,4223.0,911.0,498.0,1.0


In [None]:
# Count the missing values in each column
df.isna().sum()

## The output shows that attributed_time is the only column with missing values

ip                       0
app                      0
device                   0
os                       0
channel                  0
click_time               0
attributed_time    1843715
is_attributed            0
dtype: int64

In [None]:
## Per-column non-null counts over the rows where attributed_time is present
df[df['attributed_time'].notna()].count()


## Out of 2300561 rows, 456846 have attributed_time filled; the remaining 1843715 are null.

ip                 456846
app                456846
device             456846
os                 456846
channel            456846
click_time         456846
attributed_time    456846
is_attributed      456846
dtype: int64

In [16]:
# Inspect the raw attributed_time column before imputing it
df.loc[:, 'attributed_time']

0                          NaN
1          2017-11-07 08:17:19
2                          NaN
3                          NaN
4                          NaN
                  ...         
2300556                    NaN
2300557                    NaN
2300558                    NaN
2300559                    NaN
2300560                    NaN
Name: attributed_time, Length: 2300561, dtype: object

In [18]:
# Imputer that replaces missing entries with the most frequent value of each column.
# NOTE(review): in the rows previewed earlier, attributed_time is NaN where
# is_attributed is 0, so filling it with a real timestamp may invent attribution
# times for those rows — confirm this is intended.
si = SimpleImputer(strategy= 'most_frequent')

In [24]:
# Fit the imputer on attributed_time and fill its missing values in one step.
# The double brackets select a one-column DataFrame (2-D input) rather than a Series.
df_new_si = si.fit_transform(df[['attributed_time']])

In [34]:
# Wrap the imputed values in a one-column DataFrame.
# Reusing df's index keeps the rows aligned with df when the two frames are
# concatenated side by side, even if df no longer has the default 0..n-1 index.
df_one = pd.DataFrame(df_new_si, columns= ['new_attributed_time'], index= df.index)

In [35]:
# Display the imputed column (pandas truncates the middle rows of a long frame)
df_one

Unnamed: 0,new_attributed_time
0,2017-11-08 14:16:29
1,2017-11-07 08:17:19
2,2017-11-08 14:16:29
3,2017-11-08 14:16:29
4,2017-11-08 14:16:29
...,...
2300556,2017-11-08 14:16:29
2300557,2017-11-08 14:16:29
2300558,2017-11-08 14:16:29
2300559,2017-11-08 14:16:29


In [36]:
# Append the imputed column to the original frame column-wise, then preview the result
df_new = pd.concat([df, df_one], axis='columns')
df_new.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,new_attributed_time
0,89489,3,1,13,379,2017-11-06 15:13:23,,0,2017-11-08 14:16:29
1,204158,35,1,13,21,2017-11-06 15:41:07,2017-11-07 08:17:19,1,2017-11-07 08:17:19
2,3437,6,1,13,459,2017-11-06 15:42:32,,0,2017-11-08 14:16:29
3,167543,3,1,13,379,2017-11-06 15:56:17,,0,2017-11-08 14:16:29
4,147509,3,1,13,379,2017-11-06 15:57:01,,0,2017-11-08 14:16:29


In [37]:
# Drop the original column now that new_attributed_time replaces it.
# errors='ignore' keeps this cell safe to re-run: df_new is reassigned from itself,
# so on a second run the column is already gone and a plain drop would raise KeyError.
df_new = df_new.drop(columns= ['attributed_time'], errors= 'ignore')

In [38]:
# Confirm that attributed_time is gone and new_attributed_time remains
df_new.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,new_attributed_time
0,89489,3,1,13,379,2017-11-06 15:13:23,0,2017-11-08 14:16:29
1,204158,35,1,13,21,2017-11-06 15:41:07,1,2017-11-07 08:17:19
2,3437,6,1,13,459,2017-11-06 15:42:32,0,2017-11-08 14:16:29
3,167543,3,1,13,379,2017-11-06 15:56:17,0,2017-11-08 14:16:29
4,147509,3,1,13,379,2017-11-06 15:57:01,0,2017-11-08 14:16:29
