### Importando bibliotecas e dados

In [366]:
import pandas as pd
import plotly.express as px

In [367]:
costs_us = pd.read_csv('Files\costs_us.csv',parse_dates=['dt'])
orders_log = pd.read_csv('Files\orders_log_us.csv',parse_dates = ['Buy Ts'])
visits_log_us = pd.read_csv('Files\\visits_log_us.csv',parse_dates = ['End Ts','Start Ts'],dtype={'Device': 'category','Source Id': 'category'})

### Checking for null and duplicated values

In [368]:
costs_us.info()
# Sem valores ausentes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2542 entries, 0 to 2541
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   source_id  2542 non-null   int64         
 1   dt         2542 non-null   datetime64[ns]
 2   costs      2542 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 59.7 KB


In [369]:
orders_log.info()
# Sem valores ausentes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50415 entries, 0 to 50414
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Buy Ts   50415 non-null  datetime64[ns]
 1   Revenue  50415 non-null  float64       
 2   Uid      50415 non-null  uint64        
dtypes: datetime64[ns](1), float64(1), uint64(1)
memory usage: 1.2 MB


In [370]:
visits_log_us.info()
#Sem valores ausentes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 359400 entries, 0 to 359399
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   Device     359400 non-null  category      
 1   End Ts     359400 non-null  datetime64[ns]
 2   Source Id  359400 non-null  category      
 3   Start Ts   359400 non-null  datetime64[ns]
 4   Uid        359400 non-null  uint64        
dtypes: category(2), datetime64[ns](2), uint64(1)
memory usage: 8.9 MB


In [371]:
print('Duplicated values - cost_us: ', costs_us.duplicated().sum())
print('Duplicated values - orders_log: ', orders_log.duplicated().sum())
print('Duplicated values - visits_log_us: ', visits_log_us.duplicated().sum())

Duplicated values - cost_us:  0
Duplicated values - orders_log:  0


Duplicated values - visits_log_us:  0


### 1st step

In [372]:
print(costs_us.sample())
print('\n')
print(orders_log.sample())
print('\n')
print(visits_log_us.sample())

      source_id         dt  costs
2213         10 2017-07-05  13.47


                   Buy Ts  Revenue                  Uid
25009 2017-12-20 09:17:00     4.58  7143013043156307954


       Device              End Ts Source Id            Start Ts  \
143496  touch 2018-01-17 00:23:00         5 2018-01-17 00:21:00   

                         Uid  
143496  10062804366910879619  


In [373]:
# Creating a date column without the time

visits_log_us['duration (s)'] = (visits_log_us['End Ts'] - visits_log_us['Start Ts']).dt.total_seconds().round(2)
visits_log_us['Day'] = visits_log_us['Start Ts'].dt.date

visits_log_us.sample()

Unnamed: 0,Device,End Ts,Source Id,Start Ts,Uid,duration (s),Day
314537,desktop,2018-04-26 09:56:00,3,2018-04-26 09:39:00,6831809367782142444,1020.0,2018-04-26


In [374]:
# Creating a new DataFrame with 'Day' columns and all the unique user logs for each day.
# Followed by a change in the type of 'Day' column to datetime

visit_users_by = visits_log_us.groupby(['Day']).agg({'Uid':'nunique'}).reset_index()
visit_users_by['Day'] = visit_users_by['Day'].astype('datetime64[D]')
#visit_users_by.info()

In [375]:
#Creation of columns 'week'and 'month

visit_users_by['Week'] = visit_users_by['Day'].dt.isocalendar().week
visit_users_by['Month'] = visit_users_by['Day'].dt.month
#visit_users_by.sample()


In [376]:
visit_users_by_day = visit_users_by.groupby(['Day']).agg({'Uid':'sum'}).reset_index()
visit_users_by_day.columns = ['Start_day','Total_users']
#visit_users_by_day.head()

In [377]:
visit_users_by_week = visit_users_by.groupby(['Week']).agg({'Uid':'sum'}).reset_index()
visit_users_by_week.columns = ['Start_week','Total_users']

#visit_users_by_week.head()

In [378]:
visit_users_by_month = visit_users_by.groupby(['Month']).agg({'Uid':'sum'}).reset_index()
visit_users_by_month.columns = ['Start_month','Total_users']
#visit_users_by_month.head()

In [379]:
month_dict = pd.DataFrame({
    'number':[1,2,3,4,5,6,7,8,9,10,11,12],
    'month': ['January', 'February', 'March', 'April', 'May', 'June','July', 'August', 'September', 'October', 'November', 'December']
    }
)

In [380]:
visit_users_by_month = visit_users_by_month.merge(month_dict, left_on = 'Start_month',right_on= 'number').drop('number', axis=1)
visit_users_by_month.head()

Unnamed: 0,Start_month,Total_users,month
0,1,34002,January
1,2,34080,February
2,3,32633,March
3,4,24524,April
4,5,24380,May


#### Checking data output

In [381]:
print('total users by month: ',visit_users_by_month['Total_users'].sum())
print('total users by week: ',visit_users_by_week['Total_users'].sum())
print('total users by day: ',visit_users_by_day['Total_users'].sum())
print('total user: ',visits_log_us['Uid'].nunique())

total users by month:  330509
total users by week:  330509
total users by day:  330509
total user:  228169


### Graphic analysis - Count of unique users per day, week and month

In [382]:
visit_users_by_month_fig = px.bar(visit_users_by_month, x = 'month',y= 'Total_users')
visit_users_by_month_fig.update_layout(title = 'Total unique users per month',xaxis_title = 'Month',yaxis_title = 'Total unique users')
mean_value = visit_users_by_month['Total_users'].mean()
visit_users_by_month_fig.add_hline(y=mean_value,line_dash='dash', line_color='black')
visit_users_by_month_fig.update_traces(texttemplate= '%{y}', textposition='outside')


The information above shows us the total of unique users that logged at the platform. We can see a high decrease during the months of June, July and August. It is intereting to look further on reasons why this could be happening, I would be nice to compare this data with the costs dataset, where we can see all our expenses with marketing.

In [383]:
visit_users_by_day_fig = px.histogram(visit_users_by_day, x = 'Total_users')
visit_users_by_day_fig.update_layout(title = 'Total frequency of unique users per day',xaxis_title = 'Total users range',yaxis_title = 'Frequency')
visit_users_by_day_fig.update_traces(texttemplate= '%{y}', textposition='outside')


In [384]:
visit_users_by_day_box = px.box(visit_users_by_day, x = 'Total_users')
visit_users_by_day_box.update_layout(title = 'Unique users per day - Boxplot', xaxis_title = 'Total users')
visit_users_by_day_box.update_traces(boxmean=True)
visit_users_by_day_box

In the graphs above we can identify the following:

- The lowest amount of user at the platform was 1;
- Our median is at 920 users;
- Having more than 1200 users a day is uncommon;
- Our highest amount of users at the platform was 3319 (which it make us wonder what happened that day);
- Our mean and median values are very similar (907.99 and 921)

In [385]:
visit_users_by_week_fig = px.bar(visit_users_by_week, x = 'Start_week',y = 'Total_users')
visit_users_by_week_fig.update_layout(title = 'Total unique user by week', xaxis_title = 'Week', yaxis_title = 'Total')
mean_value = visit_users_by_week['Total_users'].mean()
visit_users_by_week_fig.add_hline(y=mean_value,line_dash='dash', line_color='black')
visit_users_by_week_fig

Based on the graph above we can infer the following conclusions:
- From the week 23-38, it was the longest period of time where the platform reached numbers below the average;
- Weeks between 23-38 are related to the period between June and September, confirming the information idetified before, but we can still identify reasons for the decrease in user logs comparing to the dataset of marketing costs.

### Analysis of all log sessions per day, followed by a quick comparison with the unique log sessions per day

In [386]:
visits_log_us.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 359400 entries, 0 to 359399
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Device        359400 non-null  category      
 1   End Ts        359400 non-null  datetime64[ns]
 2   Source Id     359400 non-null  category      
 3   Start Ts      359400 non-null  datetime64[ns]
 4   Uid           359400 non-null  uint64        
 5   duration (s)  359400 non-null  float64       
 6   Day           359400 non-null  object        
dtypes: category(2), datetime64[ns](2), float64(1), object(1), uint64(1)
memory usage: 14.4+ MB


In [387]:
total_visits_day = visits_log_us.groupby('Day')['Uid'].count().reset_index(name = 'Count')
total_visits_day.head()

Unnamed: 0,Day,Count
0,2017-06-01,664
1,2017-06-02,658
2,2017-06-03,477
3,2017-06-04,510
4,2017-06-05,893


In [388]:
total_visits_day_fig = px.box(total_visits_day, x = 'Count')
total_visits_day_fig.update_layout(title = 'Total users per day - Boxplot',xaxis_title = 'Total users')
total_visits_day_fig.update_traces(boxmean=True)
total_visits_day_fig.show()
visit_users_by_day_box

In [389]:
print('Total user logs: ',total_visits_day['Count'].sum())
print('\n')
print('Total unique user logs: ',visit_users_by_day['Total_users'].sum())

Total user logs:  359400


Total unique user logs:  330509


By comparing the two boxplots above, the first with all the user logs (including users that logged more than once a day) and the second with all the unique user logs, we identify that the median and the mean values are very similar.

### Logs sessions duration

In [390]:
visits_log_us.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 359400 entries, 0 to 359399
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Device        359400 non-null  category      
 1   End Ts        359400 non-null  datetime64[ns]
 2   Source Id     359400 non-null  category      
 3   Start Ts      359400 non-null  datetime64[ns]
 4   Uid           359400 non-null  uint64        
 5   duration (s)  359400 non-null  float64       
 6   Day           359400 non-null  object        
dtypes: category(2), datetime64[ns](2), float64(1), object(1), uint64(1)
memory usage: 14.4+ MB


In [391]:
#Logs duration data separation and count of negative numbers, positive numbers and equal to zero.
logs_duration_equals_0_total = visits_log_us[visits_log_us['duration (s)']==0]['Uid'].count()
negative_logs_durations_total = visits_log_us[visits_log_us['duration (s)']<0]['Uid'].count()
positive_logs_durations_total = visits_log_us[visits_log_us['duration (s)']>0]['Uid'].count()

#Positive logs duration dataset
positive_logs_durations_df = visits_log_us[visits_log_us['duration (s)']>0]

In [392]:
# Counting of logs based on duration (equal to 0, positive and negative)

print (f"Quantidade de logs com duração de 0 segundos: {logs_duration_equals_0_total}")
print('\n')
print (f"Quantidade de logs com valores negativos: {negative_logs_durations_total}")
print('\n')
print (f"Quantidade de logs com valores positivos: {positive_logs_durations_total}")

Quantidade de logs com duração de 0 segundos: 35794


Quantidade de logs com valores negativos: 2


Quantidade de logs com valores positivos: 323604


In [393]:
#Grouped of positive logs and their user count
pos_logs_duration_analysis = positive_logs_durations_df.groupby(['duration (s)','Day']).agg({'Uid': 'count'}).reset_index()
positive_logs_durations_df.sample()


Unnamed: 0,Device,End Ts,Source Id,Start Ts,Uid,duration (s),Day
101463,desktop,2018-01-25 11:26:00,4,2018-01-25 10:58:00,16119213273962636123,1680.0,2018-01-25


In [411]:
#Mean value for positive logs duration
mean_log_value = pos_logs_duration_analysis['duration (s)'].mean()

#Histogram for analysis of positive logs duration
pos_logs_duration_analysis_fig = px.histogram(pos_logs_duration_analysis,x = 'duration (s)')
pos_logs_duration_analysis_fig.add_vline(x=mean_log_value, line_dash='dash', line_color='black')
pos_logs_duration_analysis_fig.update_layout(title = 'Frequency of duration in seconds for users', xaxis_title = 'Duration (s)', yaxis_title = 'Frequency')
pos_logs_duration_analysis_fig


In [442]:
#Boxplot for analysis of positive logs duration
pos_logs_duration_analysis_fig = px.box(pos_logs_duration_analysis,x = 'duration (s)')

#Added a line representing the mean value
pos_logs_duration_analysis_fig.add_vline(x=mean_log_value, line_dash='dash', line_color='black', line_width=0.8)


Above we can identify that most part of our duration values are below the average and that we have outliers that showes an high increase in the log duration.

In [443]:
# Code to identify how frequently users come back to the platform

visits_log_us.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 359400 entries, 0 to 359399
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Device        359400 non-null  category      
 1   End Ts        359400 non-null  datetime64[ns]
 2   Source Id     359400 non-null  category      
 3   Start Ts      359400 non-null  datetime64[ns]
 4   Uid           359400 non-null  uint64        
 5   duration (s)  359400 non-null  float64       
 6   Day           359400 non-null  object        
dtypes: category(2), datetime64[ns](2), float64(1), object(1), uint64(1)
memory usage: 14.4+ MB


In [476]:
#Creation of users frequency dataframe and median data for line creation on histogram
users_frequency = visits_log_us.groupby('Uid').agg({'Day': 'nunique'}).reset_index().sort_values(by = 'Uid', ascending = True)
median_us_freq = users_frequency['Day'].median()

#Users log frequency - Histogram
users_frequency_fig = px.histogram(users_frequency, x = 'Day',nbins = 250,text_auto=True)
users_frequency_fig.add_vline(x = median_us_freq, line_dash = 'dash',line_color = 'black')


In [478]:
#Users log frequency - Histogram
users_frequency_box_fig = px.box(users_frequency, x = 'Day')
users_frequency_box_fig.add_vline(x = median_us_freq, line_dash = 'dash',line_color = 'black')


In [450]:
users_frequency.head()

Unnamed: 0,Uid,Day
0,11863502262781,1
1,49537067089222,1
2,297729379853735,1
3,313578113262317,3
4,325320750514679,2
