### Load the required libraries and functions 

In [1]:
import os
import pickle
import random
import re
import sys
import warnings
from datetime import timedelta

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

# Root folder that holds the config/, data/ and models/ sub-folders used by
# this notebook. Set the NSL_PROJECT_PATH environment variable to run the
# notebook on another machine; the original location remains the default.
# os.path.join(..., '') guarantees the trailing separator that every
# `project_path + 'data/...'` style concatenation below relies on.
project_path = os.path.join(
    os.environ.get('NSL_PROJECT_PATH', '/Users/naresh/Downloads/DS/growth/nsl_v2/nsl_v2_final/'),
    '',
)
# Make the project's config module importable.
sys.path.insert(0, project_path+'config')
from config import SQLQuery

# Seeds Python's `random` module only.
random.seed(3)
sns.set_theme(style="ticks", color_codes=True)
color_map = {1: '#00cc96', 0: '#636efa'}
# Avoid warnings
# NOTE(review): the second call silences every warning category, so it already
# covers the FutureWarning filter above it.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

In [2]:
# Application-level train/test flags and the raw Segment page events.
df_train_test = pd.read_pickle(project_path + 'data/nsl_train_test_flag_2023-05-30.pkl')
df_segment = pd.read_pickle(project_path + 'data/segment_raw_dataset_2023-05-30.pkl')

# Keep only the events whose application also appears in the flag table and
# attach its train/test flags.
df = df_segment.merge(
    df_train_test[['application_id', 'train_flag', 'test_flag']],
    on='application_id',
    how='inner',
)

In [3]:
# Feature ideas and their status:
# visits per page for all the common pages - done
# number of unique pages visited till the 13th question - done
# time spent per unique page till the 13th question - the logic is difficult to build for this data
# total time spent till the 13th question - not explored

### Final modules for the modelling

In [4]:
def segment_features_1(df:pd.DataFrame, cols_list:list, app_level_df:pd.DataFrame, training:bool=True):
    """
        Add per-application distinct-value counts and screen-size statistics.

        df : segment raw data (one row per event); must contain 'application_id',
             every column of cols_list, 'screen_height' and 'screen_width'
        cols_list: columns list on which the operations are performed; each one
             yields a '<col>_count' feature = number of distinct values seen for
             the application (a missing value counts as one distinct value)
        app_level_df: application level data (needs an 'application_id' column);
             it is not modified, the features are left-joined onto a new frame
        training: when True the medians of the new features are pickled to
             project_path + 'models/df_impute_segment.pkl' (counts) and
             project_path + 'models/df_impute_segment2.pkl' (screen statistics).
             When False nothing is read or written by this function.

        Returns app_level_df with the '<col>_count' columns followed by
        'screen_<width|height>_<max|min|mean|median>' (float) columns, computed
        over the distinct (height, width) pairs of each application.
        Applications without any row in df get NaN for every new column.
    """

    def _save_medians(features, relative_path):
        # Stored as a two-column frame (feature, impute_value).
        medians = features.median()
        table = medians.rename_axis('feature').rename('impute_value').reset_index()
        table.to_pickle(project_path + relative_path)

    # ---- distinct-value counts -------------------------------------------
    count_cols = []
    for col in cols_list:
        count_col = col + '_count'
        # groupby().size() yields the same numbers as value_counts() on the
        # de-duplicated pairs, but its output column names do not depend on the
        # pandas version (value_counts().reset_index() changed in pandas 2.0,
        # which broke the previous rename-based implementation).
        counts = (
            df[['application_id', col]]
            .drop_duplicates()
            .groupby('application_id')
            .size()
            .rename(count_col)
            .reset_index()
        )
        app_level_df = pd.merge(app_level_df, counts, on='application_id', how='left')
        count_cols.append(count_col)

    # ---- screen height and width features --------------------------------
    stats = ['max', 'min', 'mean', 'median']
    dims = ['screen_width', 'screen_height']
    # Column order kept from the original implementation: width then height
    # for each statistic.
    screen_cols = [dim + '_' + stat for stat in stats for dim in dims]

    screens = df[['application_id', 'screen_height', 'screen_width']].drop_duplicates()
    screen_stats = screens.groupby('application_id')[dims].agg(stats).astype('float')
    # Flatten the (dimension, statistic) column MultiIndex.
    screen_stats.columns = [dim + '_' + stat for dim, stat in screen_stats.columns]
    app_level_df = pd.merge(
        app_level_df,
        screen_stats[screen_cols].reset_index(),
        on='application_id',
        how='left',
    )

    if training:
        _save_medians(app_level_df[count_cols], 'models/df_impute_segment.pkl')
        _save_medians(app_level_df[screen_cols], 'models/df_impute_segment2.pkl')

    return app_level_df


In [5]:
def segment_features_2(df:pd.DataFrame, app_level_df:pd.DataFrame, training:bool=True):
    """
        Add page-visit features built from the Segment page paths.

        df : segment raw data (one row per event); must contain
             'application_id' and 'context_page_path'. It is not modified.
        app_level_df: application level data (needs an 'application_id' column)
        training: when True the medians of the new features are pickled to
             project_path + 'models/df_impute_segment3.pkl'. When False nothing
             is read or written by this function.

        Returns app_level_df left-joined with
          * one column per common page = number of events on that page
          * 'page_count' = number of distinct cleaned pages visited, ignoring
            the technical pages listed in ignored_pages
        Applications without any row in df get NaN for every new column.

        Differences from the previous implementation:
          * missing page paths no longer raise (None) or turn into a fake page
            called '<NA>' / 'nan' that was counted in 'page_count'
          * a common page that never occurs in df yields a 0 column instead of
            a KeyError
    """
    # Characters removed from both ends of every path segment.
    strip_chars = """'|[| |"|,|]"""

    # Cleaning the page paths: trim every '/'-separated segment and drop empty
    # segments as well as segments of 36+ characters.
    # NOTE(review): the length filter presumably removes embedded ids - confirm.
    def clean_string(x):
        if pd.api.types.is_scalar(x) and pd.isna(x):
            return None
        segments = (segment.strip(strip_chars) for segment in str(x).split('/'))
        return '/'.join(segment for segment in segments if segment != '' and len(segment) < 36)

    page_paths = df['context_page_path'].apply(clean_string)
    # One-hot encoding the unique page paths (rows without a path are all zero)
    dummies = page_paths.str.get_dummies()
    page_cols = dummies.columns.to_list()
    dummies['application_id'] = df['application_id']

    # Common pages that every applicant must visit
    common_pages = ['verify-email-otp','welcome','app/applicant/personal-info','app/applicant/phone'
    ,'app/applicant/otp-verify','app/applicant/address','app/applicant/dob-ssn','app/business/business-type'
    ,'app/business/address','app/business/other-info','app/business-questions/about-business'
    ,'app/business-questions/incoming']

    # no.of visits per page; reindex adds an all-zero column for any common
    # page that is absent from this data set.
    visit_counts = (
        dummies.reindex(columns=common_pages + ['application_id'], fill_value=0)
        .groupby('application_id')
        .sum()
        .reset_index()
    )
    features = pd.merge(app_level_df, visit_counts, on='application_id', how='left')

    # no.of unique pages per user
    ignored_pages = {'404','app/applicant','app/business','application-denied','forgot-password','signup','status','undefined'}
    counted_pages = [page for page in page_cols if page not in ignored_pages]
    visited = dummies.groupby('application_id')[counted_pages].sum() >= 1
    page_count = visited.sum(axis=1).rename('page_count').reset_index()
    features = pd.merge(features, page_count, on='application_id', how='left')

    if training:
        medians = features[common_pages + ['page_count']].median()
        df_impute_segment3 = medians.rename_axis('feature').rename('impute_value').reset_index()
        df_impute_segment3.to_pickle(project_path+'models/df_impute_segment3.pkl') # Save the impute values as df

    return features


In [6]:
def segment_features_3(app_level_df:pd.DataFrame):
    """
        Impute the segment features with the stored medians and derive the
        final columns.

        app_level_df: application level data; expected to hold the features
                      listed in the three impute tables plus
                      'screen_height_count' / 'screen_width_count' and the
                      'screen_<height|width>_<max|min|mean|median>' columns.
                      The frame is updated in place and also returned.

        Steps
        1. count features: nulls -> median, add 'sh_sw_ratio_count'
           (screen_height_count / screen_width_count), then turn every count
           feature into a flag (1 when the count equals 1, otherwise 0)
        2. screen statistics: nulls -> median, zeros -> median, then add
           'sh_sw_ratio_<stat>' (height / width) for max, min, mean, median
        3. page features: nulls -> median
    """
    count_medians = pd.read_pickle(project_path+'models/df_impute_segment.pkl')
    screen_medians = pd.read_pickle(project_path+'models/df_impute_segment2.pkl')
    page_medians = pd.read_pickle(project_path+'models/df_impute_segment3.pkl')

    def _apply_medians(median_table, frame, replace):
        # median_table has two columns: feature name and its impute value.
        lookup = dict(median_table.values)
        pending = median_table['feature'].to_list()
        for name in frame.columns.to_list():
            if name not in pending:
                continue
            frame[name] = replace(frame[name], lookup[name])
            pending.remove(name)
        return frame

    def _fill_nulls(values, median):
        return values.fillna(median)

    def _fill_zeros(values, median):
        return np.where(values == 0, median, values)

    # ---------- 1. count features ----------
    app_level_df = _apply_medians(count_medians, app_level_df, _fill_nulls)
    # The ratio uses the raw counts, so it is computed before they become flags.
    count_ratio = app_level_df['screen_height_count'] / app_level_df['screen_width_count']
    app_level_df['sh_sw_ratio_count'] = count_ratio
    app_level_df['sh_sw_ratio_count'] = app_level_df['sh_sw_ratio_count'].astype('float')

    for count_col in count_medians.feature.to_list():
        app_level_df[count_col] = np.where(app_level_df[count_col] == 1, 1, 0)

    # ---------- 2. screen statistics ----------
    app_level_df = _apply_medians(screen_medians, app_level_df, _fill_nulls)
    # A zero screen width/height is replaced by the median as well.
    app_level_df = _apply_medians(screen_medians, app_level_df, _fill_zeros)

    for stat in ['max', 'min', 'mean', 'median']:
        height = app_level_df['screen_height' + '_' + stat]
        width = app_level_df['screen_width' + '_' + stat]
        app_level_df['sh_sw_ratio_' + stat] = height / width

    # ---------- 3. page features ----------
    app_level_df = _apply_medians(page_medians, app_level_df, _fill_nulls)

    return app_level_df

### Train Data

In [7]:
# Build the training feature set. x_train starts as the event-level rows of
# the applications flagged train_flag == 1 and is rebound further down to the
# application-level frame.
# NOTE(review): the same preparation steps are repeated for the test and OOT
# data in later cells; a shared helper would keep the three in sync.
x_train = df[df['train_flag'] == 1].reset_index(drop=True)
# Filter only the existing applications in segment data
x_train = x_train[~x_train.received_at.isnull()]
x_train.reset_index(drop=True, inplace=True)
# Order the events of each application by the time they were received.
x_train = x_train.sort_values(by=['application_id','received_at'])


# Change data type
cols = ['application_id','user_id','owner_id','anonymous_id','context_page_path','timezone']
x_train[cols] = x_train[cols].astype('string')
x_train[['screen_width','screen_height']] = x_train[['screen_width','screen_height']].astype('int')

# Normalise the missing-value markers: in every column except user_id, nulls,
# empty strings and the literal '[]' are all overwritten with None.
# NOTE(review): isna() is an alias of isnull(), so the second lookup repeats
# the first one; set() removes the duplicated labels.
for col in x_train.columns:
    if col != 'user_id':
        idx = x_train.index[x_train[col].isnull()].tolist()
        idx.extend(x_train.index[x_train[col].isna()].tolist())
        idx.extend(x_train.index[x_train[col] == ''].tolist())
        idx.extend(x_train.index[x_train[col] == '[]'].tolist())
        idx = list(set(idx))
        x_train.loc[idx, col] = None    

# Columns for which a distinct-value count per application is computed.
cols_list = ['timezone','user_id','owner_id','anonymous_id','context_ip','screen_width','screen_height']
# Creating df with all apps
app_level_data = pd.DataFrame()
app_level_data['application_id'] = x_train.application_id.unique()

# training=True: both calls also pickle the feature medians under
# project_path + 'models/' for later imputation.
app_level_data = segment_features_1(df=x_train, cols_list=cols_list, app_level_df=app_level_data, training=True)
app_level_data = segment_features_2(df=x_train, app_level_df=app_level_data, training=True)


# Left join onto the flag table so that flagged applications without any
# segment event are kept (their features are NaN until imputed below).
df_tmp = pd.merge(df_train_test[['application_id', 'train_flag', 'test_flag','ns_flag']], app_level_data,
              on='application_id', how='left')
# From here on x_train is the application-level frame, not the event rows.
x_train = df_tmp[df_tmp['train_flag'] == 1].reset_index(drop=True)
y_train = x_train['ns_flag']
# Imputes with the medians saved above and adds the ratio features; x_train is
# modified in place by this call.
app_level_data_train = segment_features_3(app_level_df=x_train)

In [8]:
# Disabled exploratory cell: for every feature it drew two violin plots split
# by ns_flag - the raw values, and the values restricted to the 1.5 * IQR
# whisker range. Kept for reference only; nothing in this cell runs.
# num_cols = app_level_data_train.drop(columns=['application_id']).columns.to_list()

# hue_col = 'ns_flag'

# f, axes = plt.subplots(len(num_cols),2, figsize=(8, 100), sharex=False)
# for i, col in enumerate(num_cols):
#     sns.violinplot(y=col, x=hue_col, orient='v', data=app_level_data_train, ax=axes[i,0])
    
#     # Capping upper and lower limits using IQR whiskers
#     upper_limit = float(app_level_data_train[col].quantile([0.75]).values + 1.5*(app_level_data_train[col].quantile([0.75]).values - app_level_data_train[col].quantile([0.25]).values))
#     lower_limit = float(app_level_data_train[col].quantile([0.25]).values - 1.5*(app_level_data_train[col].quantile([0.75]).values - app_level_data_train[col].quantile([0.25]).values))
#     df_tmp = app_level_data_train[(app_level_data_train[col]<=upper_limit) & (app_level_data_train[col]>=lower_limit)].reset_index(drop=True)
    
#     sns.violinplot(y=col, x=hue_col, orient='v', data=df_tmp, ax=axes[i,1])
    
# plt.tight_layout()


In [9]:
# Sanity check: (rows, columns) of the processed training features.
app_level_data_train.shape

(87292, 37)

In [10]:
# Column dtypes and non-null counts of the processed training features.
app_level_data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87292 entries, 0 to 87291
Data columns (total 37 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   application_id                         87292 non-null  object 
 1   train_flag                             87292 non-null  float64
 2   test_flag                              87292 non-null  float64
 3   ns_flag                                87292 non-null  int64  
 4   timezone_count                         87292 non-null  int64  
 5   user_id_count                          87292 non-null  int64  
 6   owner_id_count                         87292 non-null  int64  
 7   anonymous_id_count                     87292 non-null  int64  
 8   context_ip_count                       87292 non-null  int64  
 9   screen_width_count                     87292 non-null  int64  
 10  screen_height_count                    87292 non-null  int64  
 11  sc

In [11]:
# Persist the processed training features under project_path + 'data/'.
app_level_data_train.to_pickle(project_path+'data/segment_processed_data_train.pkl')

### Test Data

In [12]:
# Build the test feature set with the same steps as the training cell. x_test
# starts as the event-level rows of the applications flagged test_flag == 1
# and is rebound further down to the application-level frame.
x_test = df[df['test_flag'] == 1].reset_index(drop=True)

# Filter only the existing applications in segment data
x_test = x_test[~x_test.received_at.isnull()]
x_test.reset_index(drop=True, inplace=True)
# Order the events of each application by the time they were received.
x_test = x_test.sort_values(by=['application_id','received_at'])


# Change data type
cols = ['application_id','user_id','owner_id','anonymous_id','context_page_path','timezone']
x_test[cols] = x_test[cols].astype('string')
x_test[['screen_width','screen_height']] = x_test[['screen_width','screen_height']].astype('int')

# Normalise the missing-value markers: in every column except user_id, nulls,
# empty strings and the literal '[]' are all overwritten with None.
# NOTE(review): isna() is an alias of isnull(), so the second lookup repeats
# the first one; set() removes the duplicated labels.
for col in x_test.columns:
    if col != 'user_id':
        idx = x_test.index[x_test[col].isnull()].tolist()
        idx.extend(x_test.index[x_test[col].isna()].tolist())
        idx.extend(x_test.index[x_test[col] == ''].tolist())
        idx.extend(x_test.index[x_test[col] == '[]'].tolist())
        idx = list(set(idx))
        x_test.loc[idx, col] = None

# Columns for which a distinct-value count per application is computed.
cols_list = ['timezone','user_id','owner_id','anonymous_id','context_ip','screen_width','screen_height']
# Creating df with all apps
app_level_data = pd.DataFrame()
app_level_data['application_id'] = x_test.application_id.unique()

# training=False: no medians are computed or saved from the test data.
app_level_data = segment_features_1(df=x_test, cols_list=cols_list, app_level_df=app_level_data, training=False)
app_level_data = segment_features_2(df=x_test, app_level_df=app_level_data, training=False)


# Left join onto the flag table so that flagged applications without any
# segment event are kept (their features are NaN until imputed below).
df_tmp = pd.merge(df_train_test[['application_id', 'train_flag', 'test_flag','ns_flag']], app_level_data,
              on='application_id', how='left')
# From here on x_test is the application-level frame, not the event rows.
x_test = df_tmp[df_tmp['test_flag'] == 1].reset_index(drop=True)
y_test = x_test['ns_flag']
# Imputes with the medians pickled under project_path + 'models/' and adds the
# ratio features; x_test is modified in place by this call.
app_level_data_test = segment_features_3(app_level_df=x_test)

In [13]:
# Sanity check: (rows, columns) of the processed test features.
app_level_data_test.shape

(37411, 37)

In [14]:
# Column dtypes and non-null counts of the processed test features.
app_level_data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37411 entries, 0 to 37410
Data columns (total 37 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   application_id                         37411 non-null  object 
 1   train_flag                             37411 non-null  float64
 2   test_flag                              37411 non-null  float64
 3   ns_flag                                37411 non-null  int64  
 4   timezone_count                         37411 non-null  int64  
 5   user_id_count                          37411 non-null  int64  
 6   owner_id_count                         37411 non-null  int64  
 7   anonymous_id_count                     37411 non-null  int64  
 8   context_ip_count                       37411 non-null  int64  
 9   screen_width_count                     37411 non-null  int64  
 10  screen_height_count                    37411 non-null  int64  
 11  sc

In [15]:
# Persist the processed test features under project_path + 'data/'.
app_level_data_test.to_pickle(project_path+'data/segment_processed_data_test.pkl')

### OOT Data

In [16]:
# Build the OOT feature set. Unlike train/test, the events and the labels come
# from their own pickles rather than from df / df_train_test.
segment_pages_oot = pd.read_pickle(project_path+'data/segment_oot_dataset_2023-05-30.pkl')
# Filter only the existing applications in segment data
segment_pages_oot = segment_pages_oot[~segment_pages_oot.received_at.isnull()]
segment_pages_oot.reset_index(drop=True, inplace=True)
# Order the events of each application by the time they were received.
segment_pages_oot = segment_pages_oot.sort_values(by=['application_id','received_at'])

# Change data type
cols = ['application_id','user_id','owner_id','anonymous_id','context_page_path','timezone']
segment_pages_oot[cols] = segment_pages_oot[cols].astype('string')
segment_pages_oot[['screen_width','screen_height']] = segment_pages_oot[['screen_width','screen_height']].astype('int')

# Normalise the missing-value markers: in every column except user_id, nulls,
# empty strings and the literal '[]' are all overwritten with None.
# NOTE(review): isna() is an alias of isnull(), so the second lookup repeats
# the first one; set() removes the duplicated labels.
for col in segment_pages_oot.columns:
    if col != 'user_id':
        idx = segment_pages_oot.index[segment_pages_oot[col].isnull()].tolist()
        idx.extend(segment_pages_oot.index[segment_pages_oot[col].isna()].tolist())
        idx.extend(segment_pages_oot.index[segment_pages_oot[col] == ''].tolist())
        idx.extend(segment_pages_oot.index[segment_pages_oot[col] == '[]'].tolist())
        idx = list(set(idx))
        segment_pages_oot.loc[idx, col] = None    


# Application-level OOT table; only application_id and ns_flag are used here.
df_oot = pd.read_pickle(project_path+'data/nsl_oot_dataset_2023-05-30.pkl')

# Inner join: keep only the events of applications present in df_oot.
df_oot_tmp = pd.merge(segment_pages_oot, df_oot[['application_id', 'ns_flag']], on='application_id', how='inner')
x_oot = df_oot_tmp.reset_index(drop=True)

# Columns for which a distinct-value count per application is computed.
cols_list = ['timezone','user_id','owner_id','anonymous_id','context_ip','screen_width','screen_height']
# Creating df with all apps
app_level_data = pd.DataFrame()
app_level_data['application_id'] = x_oot.application_id.unique()

# training=False: no medians are computed or saved from the OOT data.
app_level_data = segment_features_1(df=x_oot, cols_list=cols_list, app_level_df=app_level_data, training=False)
app_level_data = segment_features_2(df=x_oot, app_level_df=app_level_data, training=False)

# Left join so that OOT applications without any segment event are kept (their
# features are NaN until imputed below). df_oot is rebound to the joined frame.
df_oot = pd.merge(df_oot[['application_id', 'ns_flag']], app_level_data, on='application_id', how='left')
# From here on x_oot is the application-level frame, not the event rows.
x_oot = df_oot.reset_index(drop=True)
y_oot = x_oot['ns_flag']
# Imputes with the medians pickled under project_path + 'models/' and adds the
# ratio features; x_oot is modified in place by this call.
app_level_data_oot = segment_features_3(app_level_df=x_oot)


In [17]:
# Sanity check: (rows, columns) of the processed OOT features.
# NOTE(review): the recorded output has 35 columns versus 37 for train/test;
# the OOT frame is built without the train_flag / test_flag columns.
app_level_data_oot.shape

(25374, 35)

In [18]:
# Column dtypes and non-null counts of the processed OOT features.
app_level_data_oot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25374 entries, 0 to 25373
Data columns (total 35 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   application_id                         25374 non-null  object 
 1   ns_flag                                25374 non-null  int64  
 2   timezone_count                         25374 non-null  int64  
 3   user_id_count                          25374 non-null  int64  
 4   owner_id_count                         25374 non-null  int64  
 5   anonymous_id_count                     25374 non-null  int64  
 6   context_ip_count                       25374 non-null  int64  
 7   screen_width_count                     25374 non-null  int64  
 8   screen_height_count                    25374 non-null  int64  
 9   screen_width_max                       25374 non-null  float64
 10  screen_height_max                      25374 non-null  float64
 11  sc

In [19]:
# Persist the processed OOT features under project_path + 'data/'.
app_level_data_oot.to_pickle(project_path+'data/segment_processed_data_oot.pkl')