# Preprocessing sessions

In [1]:
import pandas as pd
import numpy as np
import os
from IPython.display import Image
from IPython.core.display import HTML 
import matplotlib.pyplot as plt  
% matplotlib inline
import random
from datetime import datetime
import seaborn as sns

In [79]:
# Read the raw session log into memory; `df` stays the untouched reference
# frame that later cells compare their row counts against.
sessions_csv = "sessions.csv"
df = pd.read_csv(sessions_csv)

# Preview the first five rows
df.head(5)

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0


# Is there any missing data?
As the check below shows, some rows have a missing *user_id*. Without a *user_id*, it is impossible to link a session row with the file *train_user.csv*. We will delete these rows, as we cannot do anything with them.

In [94]:
# Per column: True when at least one value is missing
df.isna().any(axis=0)

user_id           True
action            True
action_type       True
action_detail     True
device_type      False
secs_elapsed      True
dtype: bool

In [98]:
# Discard the session rows that carry no user_id; the raw frame `df` is kept
# as-is so the removed share can be measured against it.
df2 = df.dropna(axis=0, subset=['user_id'])

# Share of rows dropped, in percent of the raw frame
kept_pct = len(df2) / len(df) * 100
removed = round(100 - kept_pct, 2)
print(removed, '% have been removed from the original dataframe')

# Which columns still contain missing values after the drop?
df2.isna().any(axis=0)

0.33 % have been removed from the original dataframe


user_id          False
action            True
action_type       True
action_detail     True
device_type      False
secs_elapsed      True
dtype: bool

## Delete users with invalid age from file *train_user.csv*
In the file *train_user.csv*, we found that some users had an invalid age. We exported their ids so that we could remove their entries from the file *sessions.csv*.

In [105]:
# Read the exported ids of the users with an invalid age and, in the same
# step, rename the key column `id` to `user_id` so that it carries the same
# name as the key column of the sessions dataframe.
df_invalid_age_user_id = (
    pd.read_csv("invalid_age_user_id.csv")
    .rename(columns={'id': 'user_id'})
)

# Report how many ids were loaded, then preview the first rows
n_invalid = len(df_invalid_age_user_id)
print('There are ', n_invalid, 'invalid user_id')
df_invalid_age_user_id.head(5)

There are  2619 invalid user_id


Unnamed: 0,user_id
0,3qsa4lo7eg
1,v2x0ms9c62
2,9ouah6tc30
3,rzhouzy2ok
4,dc3udjfdij


In [111]:
# Ids of the users flagged with an invalid age. The filter below only needs
# this key column, so no join against the (multi-million-row) sessions frame
# is required: a right merge keeps every key of the right-hand frame anyway,
# which makes the merged user_id column the same set of values as this one.
invalid_user_ids = df_invalid_age_user_id['user_id']

# Remove every session row that belongs to one of those users.
# Re-running this cell is safe: filtering an already filtered frame is a no-op.
df2 = df2[~df2['user_id'].isin(invalid_user_ids)]

# Cumulative share of rows removed so far, in percent of the raw frame `df`
# (includes the rows dropped earlier for lacking a user_id).
removed = round(100-len(df2)/len(df)*100,2)
print('In total, ',removed, '% have been removed from the original dataframe')

In total,  0.92 % have been removed from the original dataframe


Unnamed: 0,secs_elapsed
count,10431710.0
mean,19405.81
std,88884.24
min,0.0
25%,229.0
50%,1147.0
75%,8444.0
max,1799977.0
