# Import relevant modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import data

In [2]:
# Read the raw marketing export into a DataFrame and preview its first rows.
data_path = 'marketing_data-28347077.csv'
df = pd.read_csv(data_path)
df.head(5)

Unnamed: 0,registration_date,traffic_type,id_partner,age_group,device,country_tier,registrations,users_acqusition_costs,revenue_7_days,logins_1_lt,logins_3_lt,logins_7_lt
0,2021-06-24,Google Search,1,35-,iOS,Tier_1,60,220.0,2.99,5,3,1
1,2021-09-13,Google Search,1,35-,MacOS,Tier_1,8,0.0,0.0,0,0,0
2,2021-07-12,Google Search,1,35-55,Android,Tier_1,121,2550.0,225.97,28,19,11
3,2021-03-26,Google Search,1,35-55,iOS,Tier_1,95,1900.0,351.92,23,9,2
4,2021-04-26,Google Search,1,35-,Android,Tier_1,51,915.0,256.09,7,2,2


# Data preprocessing

In [10]:
# Summarise the frame: row count, per-column non-null counts and dtypes.
# NOTE(review): the saved output below looks like it came from an out-of-order
# run (this cell's prompt number is 10, and it reports age_group as fully
# non-null even though the missing-value check further down finds gaps there).
# Re-run the notebook top to bottom to refresh it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48248 entries, 0 to 48247
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   registration_date       48248 non-null  object 
 1   traffic_type            48248 non-null  object 
 2   id_partner              48248 non-null  int64  
 3   age_group               48248 non-null  object 
 4   device                  48248 non-null  object 
 5   country_tier            48248 non-null  object 
 6   registrations           48248 non-null  int64  
 7   users_acqusition_costs  48248 non-null  float64
 8   revenue_7_days          48248 non-null  float64
 9   logins_1_lt             48248 non-null  int64  
 10  logins_3_lt             48248 non-null  int64  
 11  logins_7_lt             48248 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 4.4+ MB


## Data cleaning

### Check missing values

In [3]:
# True as soon as at least one cell anywhere in the frame is NaN.
df.isna().to_numpy().any()

True

In [5]:
# Tally NaNs column by column, then collapse the tallies into one grand total.
missing_values_count = df.isna().sum(axis=0)
total_missing = missing_values_count.to_numpy().sum()

print(f'Missing : {total_missing}')

Missing : 2


Let's find out where the missing values are.

In [6]:
# One boolean per column: True where the column holds at least one NaN.
columns_with_missing = df.isnull().sum().gt(0)

columns_with_missing

registration_date         False
traffic_type              False
id_partner                False
age_group                  True
device                    False
country_tier              False
registrations             False
users_acqusition_costs    False
revenue_7_days            False
logins_1_lt               False
logins_3_lt               False
logins_7_lt               False
dtype: bool


### Fill missing values
Since age_group is basically numeric (not exactly, but each label is a range of numeric values), we can fill the missing values with zeros.

In [7]:
# Only `age_group` has gaps (see the per-column check above), so fill that
# column alone; a blanket df.fillna(0) would also silently zero-fill any other
# column that ever gains a NaN (costs, revenue, dates, ...).
# NOTE(review): the other age_group values shown in the previews are string
# labels such as '35-' and '55+', so the integer 0 leaves the column with
# mixed types. A string placeholder may be safer -- confirm against the
# downstream cells before changing the fill value.
df = df.fillna({'age_group': 0})

# Sanity check: no NaN should remain anywhere in the frame.
df.isnull().values.any()

False

## Data transformation

### Dates

In [11]:
# Convert the registration dates from plain strings to pandas datetimes.
df['registration_date'] = df['registration_date'].pipe(pd.to_datetime)

### traffic_type transformation
Since we know that this column has only 3 distinct values, we can safely map each value to a numeric key.

In [14]:
# Distinct traffic sources, in order of first appearance.
unique_values = pd.unique(df['traffic_type'])
unique_values

array(['Google Search', 'Facebook', 'Google Media'], dtype=object)

Replace values.

In [16]:
# Encode every traffic source as its integer key in a single replace call.
traffic_type_codes = {'Google Search': 0, 'Facebook': 1, 'Google Media': 2}
df['traffic_type'] = df['traffic_type'].replace(traffic_type_codes)
df

Unnamed: 0,registration_date,traffic_type,id_partner,age_group,device,country_tier,registrations,users_acqusition_costs,revenue_7_days,logins_1_lt,logins_3_lt,logins_7_lt
0,2021-06-24,0,1,35-,iOS,Tier_1,60,220.0,2.99,5,3,1
1,2021-09-13,0,1,35-,MacOS,Tier_1,8,0.0,0.00,0,0,0
2,2021-07-12,0,1,35-55,Android,Tier_1,121,2550.0,225.97,28,19,11
3,2021-03-26,0,1,35-55,iOS,Tier_1,95,1900.0,351.92,23,9,2
4,2021-04-26,0,1,35-,Android,Tier_1,51,915.0,256.09,7,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...
48243,2021-01-09,2,6,35-55,Other,Tier_1,15,90.0,0.00,1,1,1
48244,2021-07-19,2,6,55+,Other,Tier_1,23,65.5,0.00,5,0,2
48245,2021-02-26,2,6,55+,Other,Tier_1,29,194.0,0.00,2,3,2
48246,2021-09-22,2,6,35-,Android,Tier_3,6,0.0,0.00,0,0,0


# Data visualization