In [None]:
import pandas as pd

# loading all datasets
# NOTE: despite the `url_` prefix these are relative local file paths.
url_client_profiles = 'data/df_final_demo.txt'
url_digital_footprints1 = 'data/df_final_web_data_pt_1.txt'
url_digital_footprints2 = 'data/df_final_web_data_pt_2.txt'
url_experiment_roster = 'data/df_final_experiment_clients.txt'

# The digital-footprint log comes in two files; they are stacked below,
# which presumes both parts share the same columns.
df1 = pd.read_csv(url_digital_footprints1)
df2 = pd.read_csv(url_digital_footprints2)

# imported dataframes to work with
df_client_profiles = pd.read_csv(url_client_profiles)
df_exp_roster = pd.read_csv(url_experiment_roster)
# merged footprint files
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both parts keep their own row labels, so every label appears twice and any
# label-based lookup or alignment on the merged frame becomes ambiguous.
df_footprints = pd.concat([df1, df2], ignore_index=True)

In [None]:
# cleaning the datasets
# Give the client-profile columns readable names. The name is rebound (instead
# of renaming in place) but later cells still see the renamed columns on
# `df_client_profiles`, exactly as before.
df_client_profiles = df_client_profiles.rename(columns={
    'clnt_tenure_yr': 'client_tenure_years',
    'clnt_tenure_mnth': 'client_tenure_months',
    'clnt_age': 'client_age',
    'gendr': 'gender',
    'num_accts': 'num_accounts',
    'bal': 'balance',
    'calls_6_mnth': 'calls_6months',
    'logons_6_mnth': 'logins_6months',
})

# Drop only rows where *every* listed column is missing (how="all"); rows with
# some values present are kept.
# Gender is normalised in the same chain: 'X' and missing values are both
# folded into 'U'. Previously the fillna result was discarded, so missing
# genders were never actually filled.
df_client_profiles_cleaned = (
    df_client_profiles
    .dropna(
        subset=["client_tenure_years", "client_tenure_months", "client_age", "gender",
                "num_accounts", "balance", "calls_6months", "logins_6months"],
        how="all",
    )
    .assign(gender=lambda profiles: profiles['gender'].replace('X', 'U').fillna('U'))
)

df_exp_roster_cleaned = df_exp_roster.dropna(subset=["Variation"], how="all")

# Previously the drop_duplicates result was discarded, so repeated events
# stayed in the data; it is now part of the assignment.
# NOTE(review): 'process_step' is not part of the duplicate key, so two
# different steps logged at the same timestamp for one visit collapse into a
# single row - confirm this is intended.
df_footprints_cleaned = (
    df_footprints
    .dropna(subset=["client_id", "visitor_id", "visit_id", "process_step", "date_time"], how="all")
    .drop_duplicates(subset=['client_id', 'visit_id', 'date_time'])
)

In [None]:
# joining footprints and experiment roster datasets
# Index *copies* of the cleaned frames by client_id rather than calling
# set_index(..., inplace=True): the in-place version removed the 'client_id'
# column from the cleaned frames, so running this cell a second time raised a
# KeyError. The cleaned frames now keep 'client_id' as a regular column.
footprints_by_client = df_footprints_cleaned.set_index('client_id')
roster_by_client = df_exp_roster_cleaned.set_index('client_id')

# Inner join keeps only footprint rows whose client appears in the roster.
joined_df = footprints_by_client.join(roster_by_client, how='inner')
joined_df['date_time'] = pd.to_datetime(joined_df['date_time'])

# Order events by experiment arm, then chronologically inside each visit.
df_sorted = joined_df.sort_values(by=['Variation', 'visit_id', 'date_time'])

In [None]:
# Calculating Completion Rate:
# percentage of distinct visits that contain at least one 'confirm' event.
vistor_df = df_footprints_cleaned[df_footprints_cleaned["process_step"] == "confirm"]

# Each count is computed once and named, instead of being re-evaluated in bare
# expressions whose values were never shown.
completed_visits = vistor_df['visit_id'].nunique()
total_visits = df_footprints_cleaned["visit_id"].nunique()

# An empty footprint table would otherwise raise ZeroDivisionError.
completion_rate = (
    round((completed_visits / total_visits) * 100, 2) if total_visits else float("nan")
)

# Last expression of the cell, so the rate is actually displayed.
completion_rate

In [None]:
# Calculating Time Spent On Each Step

# Parse 'date_time' and order events chronologically inside each visit.
# assign/sort_values return a new frame, so no column is written onto a frame
# derived from an earlier filter.
df_footprints_cleaned = (
    df_footprints_cleaned
    .assign(date_time=lambda events: pd.to_datetime(events['date_time']))
    .sort_values(by=['visit_id', 'date_time'])
)

# Time spent on a step = timestamp of the NEXT event in the same visit minus
# this event's timestamp. The previous diff()-based version subtracted the
# PREVIOUS timestamp, which credited each duration to the step the client
# arrived at rather than the step they were actually on.
# The last event of every visit has no successor, so its time_spent is NaN.
next_event_time = df_footprints_cleaned.groupby('visit_id')['date_time'].shift(-1)
df_footprints_cleaned['time_spent'] = (
    next_event_time - df_footprints_cleaned['date_time']
).dt.total_seconds()

# Compute the average time spent per process_step (NaN rows are skipped by mean)
average_time_per_step = df_footprints_cleaned.groupby('process_step')['time_spent'].mean()

# Attach the per-step average to every row, rounded to 2 decimals for readability
df_footprints_cleaned['time_each_step'] = (
    df_footprints_cleaned['process_step'].map(average_time_per_step).round(2)
)

# Preview only; the full frame is too large to be a useful cell output.
df_footprints_cleaned.head()

In [None]:
# error rates calculation
# For every funnel step, the step that is expected to come directly before it.
# The entry step 'start' has no predecessor, hence None.
possible_previous_steps = dict(
    start=None,
    step_1='start',
    step_2='step_1',
    step_3='step_2',
    confirm='step_3',
)

def check_error(df, possible_previous_steps):
    """Count valid and invalid step-to-step transitions across all visits.

    Rows are grouped by 'visit_id' and each pair of consecutive
    'process_step' values inside a visit is classified:

    * valid - the step is repeated (e.g. a page reload), or the earlier step
      is the predecessor listed for the later step in
      ``possible_previous_steps``;
    * error - anything else (skipping ahead, moving backwards, or a step
      label that is not in the mapping at all).

    The first step of a visit has no predecessor and is not counted.
    Rows are read in the order they appear in ``df``; the function does not
    sort, so the caller must pass events already ordered chronologically
    within each visit.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'visit_id' and 'process_step' columns.
    possible_previous_steps : dict
        Maps each step label to the label expected directly before it
        (None for the entry step).

    Returns
    -------
    tuple of (int, int)
        ``(error_count, valid_count)``.
    """
    error_count = 0
    valid_count = 0

    for _, visit_rows in df.groupby('visit_id'):
        steps = visit_rows['process_step'].tolist()

        # Walk over consecutive (earlier, later) step pairs of this visit.
        for previous_step, step in zip(steps, steps[1:]):
            if previous_step is None:
                # Nothing to compare against; skip this pair.
                continue

            # .get() instead of [] so an unexpected label (including NaN) is
            # counted as an error instead of aborting the run with KeyError.
            expected_previous = possible_previous_steps.get(step)

            if previous_step == step or previous_step == expected_previous:
                valid_count += 1
            else:
                error_count += 1

    return error_count, valid_count

# Tally (error_count, valid_count) over the joined, chronologically sorted events.
check_error(
    df=df_sorted,
    possible_previous_steps=possible_previous_steps,
)

In [None]:
# Per-gender summary statistics for age, tenure and balance.
# One multi-column aggregation replaces three separate expressions whose
# results were computed and then thrown away (they were not the last
# expression of the cell, so nothing was shown).
gender_profile_summary = (
    df_client_profiles_cleaned
    .groupby(['gender'])[['client_age', 'client_tenure_years', 'balance']]
    .agg(['mean', 'median', 'min', 'max', 'count'])
)
print(gender_profile_summary)

# activity score calculation
# score = logins + 0.5 * calls + 0.00001 * balance
# Columns are addressed by name instead of by position (iloc 8 / 7 / 6), so
# the score no longer depends on column order.
# NOTE(review): positions 8/7/6 are assumed to have been logins/calls/balance,
# matching the column order listed in the cleaning cell - confirm.
CALLS_WEIGHT = 0.5
BALANCE_WEIGHT = 0.00001
df_client_profiles_cleaned['activity_score'] = (
    df_client_profiles_cleaned['logins_6months']
    + (df_client_profiles_cleaned['calls_6months'] * CALLS_WEIGHT)
    + (df_client_profiles_cleaned['balance'] * BALANCE_WEIGHT)
)

# Show the most active clients; the sorted frame used to be discarded.
print(df_client_profiles_cleaned.sort_values('activity_score', ascending=False).head(10))

#login and call scores
# Cut-offs: at or above the 80th percentile is "Highly Active", at or below
# the 20th percentile is "Inactive", everything in between is "Moderate".
high_login_threshold = df_client_profiles_cleaned['logins_6months'].quantile(0.8)
low_login_threshold = df_client_profiles_cleaned['logins_6months'].quantile(0.2)
high_call_threshold = df_client_profiles_cleaned['calls_6months'].quantile(0.8)
low_call_threshold = df_client_profiles_cleaned['calls_6months'].quantile(0.2)


def segment_by_thresholds(value, low_threshold, high_threshold):
    """Label an activity count relative to a pair of cut-offs.

    Returns "Highly Active" when ``value >= high_threshold``, "Inactive" when
    ``value <= low_threshold`` and "Moderate" otherwise. The high cut-off is
    checked first, so it wins if the two cut-offs coincide. A NaN value fails
    both comparisons and is therefore labelled "Moderate".
    """
    if value >= high_threshold:
        return "Highly Active"
    if value <= low_threshold:
        return "Inactive"
    return "Moderate"


def segment_customer(calls):
    """Label a 6-month call count using the call thresholds.

    There used to be two functions with this name (one for logins, one for
    calls); the second definition replaced the first, so login counts were
    silently segmented with the call thresholds. This single definition keeps
    the behaviour the name resolved to (call thresholds); logins are handled
    by calling ``segment_by_thresholds`` with the login thresholds.
    """
    return segment_by_thresholds(calls, low_call_threshold, high_call_threshold)


# Logins are now compared against the login thresholds.
df_client_profiles_cleaned["login_activity"] = df_client_profiles_cleaned["logins_6months"].apply(
    segment_by_thresholds, args=(low_login_threshold, high_login_threshold)
)
df_client_profiles_cleaned["calls_activity"] = df_client_profiles_cleaned["calls_6months"].apply(segment_customer)

In [None]:
# Number of clients in each login-activity segment.
login_activity_counts = df_client_profiles_cleaned.loc[:, "login_activity"].value_counts()
print(login_activity_counts)

# Average age, balance and tenure for every (login segment, gender) pair.
login_profile_columns = ["client_age", "balance", "client_tenure_years"]
(
    df_client_profiles_cleaned
    .groupby(by=["login_activity", "gender"])[login_profile_columns]
    .mean()
)

In [None]:
# Number of clients in each call-activity segment.
calls_activity_counts = df_client_profiles_cleaned.loc[:, "calls_activity"].value_counts()
print(calls_activity_counts)

# Average age, balance and tenure for every (call segment, gender) pair.
calls_profile_columns = ["client_age", "balance", "client_tenure_years"]
(
    df_client_profiles_cleaned
    .groupby(by=["calls_activity", "gender"])[calls_profile_columns]
    .mean()
)