###### Cleaning data for Tableau

In [1]:
# import libraries
import pandas as pd

In [2]:
# load the raw event log directly from the zip archive;
# compression='zip' tells pandas to unpack the csv while reading
df = pd.read_csv(
    filepath_or_buffer='data_for_tableau.zip',
    compression='zip',
)

df

Unnamed: 0,date,event,purchase_sum,os_name,device_id,gender,city,utm_source
0,2020-01-01,app_start,,android,669460,female,Moscow,-
1,2020-01-01,app_start,,ios,833621,male,Moscow,vk_ads
2,2020-01-01,app_start,,android,1579237,male,Saint-Petersburg,referal
3,2020-01-01,app_start,,android,1737182,female,Moscow,facebook_ads
4,2020-01-01,app_start,,ios,4029024,female,Moscow,facebook_ads
...,...,...,...,...,...,...,...,...
2747963,2020-03-31,register,,android,2984778,male,Saint-Petersburg,facebook_ads
2747964,2020-03-31,register,,ios,27301864,male,Moscow,-
2747965,2020-03-31,register,,ios,1294285,female,Saint-Petersburg,google_ads
2747966,2020-03-31,register,,android,3010574,female,Saint-Petersburg,google_ads


In [3]:
# earliest value in the 'date' column (start of the observed period)
df.loc[:, 'date'].min()

'2020-01-01'

In [4]:
# latest value in the 'date' column (end of the observed period)
df.loc[:, 'date'].max()

'2020-03-31'

In [5]:
# build the cohort table: order events by date, then keep only the first
# row seen for every device_id, i.e. a row from that device's earliest date
mindate = (
    df
    .sort_values(by='date')
    .drop_duplicates(subset='device_id', keep='first')
)

mindate

Unnamed: 0,date,event,purchase_sum,os_name,device_id,gender,city,utm_source
0,2020-01-01,app_start,,android,669460,female,Moscow,-
13192,2020-01-01,tap_basket,,android,17289661,male,Saint-Petersburg,vk_ads
13191,2020-01-01,tap_basket,,android,12215118,male,Saint-Petersburg,facebook_ads
13190,2020-01-01,tap_basket,,ios,9163079,male,Moscow,vk_ads
13189,2020-01-01,tap_basket,,android,1948894,female,Saint-Petersburg,google_ads
...,...,...,...,...,...,...,...,...
2497656,2020-03-31,app_start,,ios,5682834,female,Moscow,-
2499467,2020-03-31,app_start,,android,9339041,male,Saint-Petersburg,-
2499780,2020-03-31,app_start,,ios,32346202,female,Moscow,yandex-direct
2498997,2020-03-31,app_start,,ios,23772077,male,Moscow,vk_ads


In [6]:
# checking that we haven't lost any data:
# the cohort table must contain exactly one row per distinct device_id,
# so its length has to match the number of unique devices in the event log
n_devices = df['device_id'].nunique()
assert len(mindate) == n_devices, 'cohort table lost or duplicated devices'

# still display the device count as the cell output
n_devices

190884

In [7]:
# keep only the device key and its date; the other event columns
# are not needed in the cohort lookup table
mindate = mindate.loc[:, ['device_id', 'date']]

In [8]:
# give the date column a cohort-specific name so it does not clash
# with the event-level 'date' column when joined back onto df
mindate = mindate.rename({'date': 'cohort_first_session'}, axis='columns')

mindate

Unnamed: 0,device_id,cohort_first_session
0,669460,2020-01-01
13192,17289661,2020-01-01
13191,12215118,2020-01-01
13190,9163079,2020-01-01
13189,1948894,2020-01-01
...,...,...
2497656,5682834,2020-03-31
2499467,9339041,2020-03-31
2499780,32346202,2020-03-31
2498997,23772077,2020-03-31


In [9]:
# merging cohorts data with the main dataframe
#
# The cell reassigns df, so it must be safe to run more than once: any
# cohort column left over from a previous run is dropped first, otherwise
# a second merge would create cohort_first_session_x / _y columns.
# validate='many_to_one' makes pandas raise if device_id is not unique in
# mindate, which would otherwise silently multiply event rows.
df = (
    df
    .drop(columns='cohort_first_session', errors='ignore')
    .merge(mindate, how='left', on='device_id', validate='many_to_one')
)
df

Unnamed: 0,date,event,purchase_sum,os_name,device_id,gender,city,utm_source,cohort_first_session
0,2020-01-01,app_start,,android,669460,female,Moscow,-,2020-01-01
1,2020-01-01,app_start,,ios,833621,male,Moscow,vk_ads,2020-01-01
2,2020-01-01,app_start,,android,1579237,male,Saint-Petersburg,referal,2020-01-01
3,2020-01-01,app_start,,android,1737182,female,Moscow,facebook_ads,2020-01-01
4,2020-01-01,app_start,,ios,4029024,female,Moscow,facebook_ads,2020-01-01
...,...,...,...,...,...,...,...,...,...
2747963,2020-03-31,register,,android,2984778,male,Saint-Petersburg,facebook_ads,2020-03-28
2747964,2020-03-31,register,,ios,27301864,male,Moscow,-,2020-03-31
2747965,2020-03-31,register,,ios,1294285,female,Saint-Petersburg,google_ads,2020-03-31
2747966,2020-03-31,register,,android,3010574,female,Saint-Petersburg,google_ads,2020-03-06


In [11]:
# write the enriched event log to csv; index=False keeps the
# row index out of the exported file
df.to_csv(path_or_buf='data_for_analysis.csv', index=False)

print('Data export is successfully finished!')

Data export is successfully finished!
