In [None]:
# Disabled draft: time elapsed between consecutive rows of df_web_data.
# Every statement in this cell is commented out, so running it has no effect.
# NOTE(review): this cell sits before the import and data-loading cells; if it is
# re-enabled it has to move below them, otherwise df_web_data is undefined on a fresh kernel.

# Shift the 'date_time' column to compare each row with the next row
# NOTE(review): shift(-1) pairs each row with whatever row follows it; presumably the frame
# should be sorted/grouped per client first so pairs never span two clients — confirm.
#df_web_data['next_time'] = df_web_data['date_time'].shift(-1)

# Calculate the difference between the current and next time
# NOTE(review): df_web_data is loaded with a plain pd.read_csv (no date parsing), so
# 'date_time' likely needs pd.to_datetime before this subtraction and .dt work — confirm.
#df_web_data['time_diff'] = df_web_data['next_time'] - df_web_data['date_time']

# Convert time difference to minutes
#df_web_data['time_diff_minutes'] = df_web_data['time_diff'].dt.total_seconds() / 60

# Only keep the relevant columns for this calculation (.loc[:, [...]] selects columns)
# NOTE(review): `df` is not defined in the visible cells; this probably means df_web_data.
#df_final = df.loc[:, ['date_time', 'time_diff_minutes']]
#df_web_data.head()

In [None]:
import pandas as pd
import os
import glob
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis

In [None]:
# Load the three cleaned CSV inputs in one pass; the order of the paths
# matches the order of the names on the left-hand side.
df_client_demo, df_experiment, df_web_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data_files/clean/df_client_demo.csv",
        "../data_files/clean/df_experiment.csv",
        "../data_files/clean/df_web_data.csv",
    )
)

In [None]:
# Preview the first rows of df_client_demo (head() shows five by default) to sanity-check the load.
df_client_demo.head()

In [None]:
# Print column names, dtypes and non-null counts for df_client_demo.
df_client_demo.info()

In [None]:
# Preview the first rows of df_experiment (head() shows five by default).
df_experiment.head()

In [None]:
# Preview the first rows of df_web_data (head() shows five by default).
df_web_data.head()

In [None]:
# Descriptive statistics for the 'client_tenure_month' column, as returned by Series.describe().
client_tenure_month_summary = df_client_demo['client_tenure_month'].describe()
# Bare last expression so the notebook renders the summary.
client_tenure_month_summary

In [None]:
# Descriptive statistics for the 'client_age' column, as returned by Series.describe().
client_age_summary = df_client_demo['client_age'].describe()
# Bare last expression so the notebook renders the summary.
client_age_summary

In [None]:
# Box plot of the 'client_age' column on a 9x6 inch figure.
plt.figure(figsize=(9, 6))
sns.boxplot(data=df_client_demo, y='client_age', color='salmon')
# Title and y-label are set together on the current axes.
plt.gca().set(title='Box Plot of Client Ages', ylabel='Age')
plt.show()

In [None]:
# Box plot of the 'client_tenure_month' column on a 9x6 inch figure.
plt.figure(figsize=(9, 6))
sns.boxplot(data=df_client_demo, y='client_tenure_month', color='cornflowerblue')
# Title and y-label are set together on the current axes.
plt.gca().set(title='Box Plot of Client Tenure Months', ylabel='Months')
plt.show()

In [None]:
# Sample skewness of client age; scipy's defaults (axis=0, biased estimator) are spelled out.
# Author's reading of the result: the age distribution is nearly symmetrical, with no
# significant skew to the left or right — the data sits roughly evenly around the mean,
# close in shape to a normal distribution.
age_skewness_value = skew(df_client_demo['client_age'], axis=0, bias=True)
age_skewness_value

In [None]:
# Excess (Fisher) kurtosis of client age; scipy's defaults are spelled out, so a normal
# distribution would score 0.
# Author's reading of the result: lighter tails and a flatter peak than a normal
# distribution, i.e. fewer extreme values or outliers than a normal distribution would give.
age_kurtosis_value = kurtosis(df_client_demo['client_age'], axis=0, fisher=True, bias=True)
age_kurtosis_value

In [None]:
# Sample skewness of client tenure (months); scipy's defaults are spelled out.
# Author's reading of the result: positive skew — a longer tail towards higher values,
# with a few high values (or outliers) pulling the mean to the right.
tenure_month_skewness_value = skew(df_client_demo['client_tenure_month'], axis=0, bias=True)
tenure_month_skewness_value

In [None]:
# Excess (Fisher) kurtosis of client tenure (months); scipy's defaults are spelled out.
# Author's reading of the result: slightly heavier tails and a somewhat sharper peak than
# a normal distribution, indicating a moderate presence of outliers. Values cluster around
# the mean more tightly than under normality, with a few larger-than-usual deviations.
tenure_month_kurtosis_value = kurtosis(
    df_client_demo['client_tenure_month'], axis=0, fisher=True, bias=True
)
tenure_month_kurtosis_value

In [None]:
# Pearson correlation between client age and tenure in months (method named explicitly).
correlation = df_client_demo['client_age'].corr(
    other=df_client_demo['client_tenure_month'],
    method='pearson',
)
print(f"Correlation between client_age and client_tenure_month: {correlation:.2f}")
# Author's reading: a coefficient of about 0.31 is positive, but it does not indicate a
# strong correlation.

In [None]:
# Scatter plot of tenure against age, with a fitted trend line drawn on top.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df_client_demo, x='client_age', y='client_tenure_month', color='blue')
# Trend line only: scatter=False skips the points, ci=None drops the confidence band.
sns.regplot(
    data=df_client_demo,
    x='client_age',
    y='client_tenure_month',
    scatter=False,
    color='red',
    ci=None,
)
# Title and axis labels are set together on the current axes.
plt.gca().set(
    title='Scatter Plot of Client Age vs. Client Tenure Month',
    xlabel='Client Age',
    ylabel='Client Tenure (Months)',
)
plt.show()

In [None]:
# Descriptive statistics for the 'balance' column, as returned by Series.describe().
client_balance_summary = df_client_demo['balance'].describe()
# Bare last expression so the notebook renders the summary.
client_balance_summary

We merge the web data with the experiment data so that, within a single DataFrame, each client ID is linked to its variation (Control or Test).

In [None]:
# Attach the 'variation' label to every web-data row via client_id.
# The left join keeps all web-data rows; rows whose client_id has no match in
# df_experiment get a null 'variation'.
df_web_data_merged = df_web_data.merge(
    df_experiment[['client_id', 'variation']],
    how='left',
    on='client_id',
)
df_web_data_merged.head()

In [None]:
# Print column names, dtypes and non-null counts for the merged frame.
df_web_data_merged.info()

In [None]:
# Rows with a null 'variation' are treated as not part of the experiment,
# so only rows with a known variation are kept.
df_web_data_merged_cleaned = df_web_data_merged[df_web_data_merged['variation'].notna()]
df_web_data_merged_cleaned.info()

We split the cleaned DataFrame into two DataFrames: one for "Control" users and one for "Test" users.

In [None]:
# Subset of the cleaned rows whose variation is exactly 'Control'.
df_web_data_merged_control = df_web_data_merged_cleaned.loc[
    df_web_data_merged_cleaned['variation'].eq('Control')
]
df_web_data_merged_control.info()

In [None]:
# Subset of the cleaned rows whose variation is exactly 'Test'.
df_web_data_merged_test = df_web_data_merged_cleaned.loc[
    df_web_data_merged_cleaned['variation'].eq('Test')
]
df_web_data_merged_test.info()

In [None]:
# Sanity check: the Control and Test subsets together should account for
# every row of the cleaned frame. The result is only reported, not enforced.
total_cleaned = len(df_web_data_merged_cleaned)
total_control = len(df_web_data_merged_control)
total_test = len(df_web_data_merged_test)

if total_cleaned != total_control + total_test:
    print("Row counts do not match.")
    print(f"Total Cleaned: {total_cleaned}, Control + Test: {total_control + total_test}")
else:
    print("Row counts match: ", total_cleaned)

In [155]:
# Write the Control and Test subsets to CSV without the index column.
# NOTE(review): the relative file names resolve against the notebook's working directory,
# whereas the inputs were read from ../data_files/clean — confirm this is intended.
df_web_data_merged_control.to_csv(path_or_buf='web_data_merged_control.csv', index=False)
df_web_data_merged_test.to_csv(path_or_buf='web_data_merged_test.csv', index=False)