In [1]:
import pandas as pd
import numpy as np
import re 
from collections import Counter

In [2]:
# Load the user-journey data. The path is relative to the notebook's working
# directory (file name suggests the data was cleaned in an earlier step — TODO confirm).
df=pd.read_csv("clean_data.csv")

In [3]:
df.shape

(1350, 3)

In [4]:
df.head()

Unnamed: 0,user_id,subscription_type,user_journey
0,1516,Annual,Homepage-Log in-Other
1,3395,Annual,Other
2,10107,Annual,Homepage
3,11145,Monthly,Homepage-Log in
4,12400,Monthly,Homepage-Career tracks-Sign up-Log in


In [5]:
df.tail()

Unnamed: 0,user_id,subscription_type,user_journey
1345,509060,Annual,Other
1346,509061,Annual,Coupon
1347,509085,Annual,Coupon
1348,509095,Annual,Other
1349,509096,Annual,Other-Coupon


In [6]:
df['subscription_type'].unique()

array(['Annual', 'Monthly', 'Quarterly'], dtype=object)

In [7]:
df[df['subscription_type']=='Annual']['subscription_type'].shape

(931,)

#### Function to filter the data by subscription type

In [8]:
def filter_vai_subcription(data,filter="all",target_column='subscription_type'):
    """Return the rows of ``data`` that belong to one subscription type.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is never modified by this function.
    filter : str, default "all"
        Value to match in ``target_column``. The sentinel ``"all"`` disables
        filtering and returns every row. (The name shadows the builtin
        ``filter``; it is kept because callers pass it by keyword.)
    target_column : str, default 'subscription_type'
        Column whose values are compared against ``filter``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the selected rows; the original index labels
        are preserved. A value that matches no row yields an empty frame.

    Raises
    ------
    KeyError
        If ``filter`` is not ``"all"`` and ``target_column`` is not a column
        of ``data``.
    """
    if filter=="all":
        return data.copy()

    # Copy only the selected rows (instead of copying the whole frame up front
    # and discarding it) so both branches hand back a frame that can be
    # modified without touching, or warning about, the caller's data.
    return data[data[target_column]==filter].copy()

In [9]:
filter_vai_subcription(df,filter='Annual')

Unnamed: 0,user_id,subscription_type,user_journey
0,1516,Annual,Homepage-Log in-Other
1,3395,Annual,Other
2,10107,Annual,Homepage
7,15630,Annual,Log in
9,19458,Annual,Homepage-Sign up
...,...,...,...
1345,509060,Annual,Other
1346,509061,Annual,Coupon
1347,509085,Annual,Coupon
1348,509095,Annual,Other


#### Function to split the user_journey column into a list of pages

In [10]:
def split_user_jpurney(data,target_column='user_journey'):
    """Split each '-'-delimited journey string into a list of page names.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the journey column. It is left untouched; the work is
        done on a copy.
    target_column : str, default 'user_journey'
        Column whose string values are split on ``"-"``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` in which every string in ``target_column`` has been
        replaced by the list of its ``"-"``-separated parts. Values that are
        not strings (missing values, or lists produced by an earlier call)
        are kept as they are.
    """
    # Work on a copy so that re-running the calling cell never alters the
    # frame it was given.
    result=data.copy()

    # Only strings are split: calling ``.str.split`` on a column that already
    # holds lists would turn every value into NaN, so a second run of the
    # cell used to wipe out the journeys.
    result[target_column]=result[target_column].apply(
        lambda journey: journey.split("-") if isinstance(journey,str) else journey
    )
    return result

In [11]:
# Turn every '-'-delimited journey string into a list of page names; the
# metric functions defined further down iterate over these lists.
df=split_user_jpurney(df)
df

Unnamed: 0,user_id,subscription_type,user_journey
0,1516,Annual,"[Homepage, Log in, Other]"
1,3395,Annual,[Other]
2,10107,Annual,[Homepage]
3,11145,Monthly,"[Homepage, Log in]"
4,12400,Monthly,"[Homepage, Career tracks, Sign up, Log in]"
...,...,...,...
1345,509060,Annual,[Other]
1346,509061,Annual,[Coupon]
1347,509085,Annual,[Coupon]
1348,509095,Annual,[Other]


## Metrics

#### Page Count

In [12]:
def page_count(data,target_column='user_journey',subcription_type='all',sort=False):
    """Count how many times each page occurs across all journeys.

    Every occurrence counts, so a page visited twice in one journey adds two.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``target_column`` holds one sequence of page names per row.
    target_column : str, default 'user_journey'
        Column containing the journeys.
    subcription_type : str, default 'all'
        Subscription type to restrict the count to; ``'all'`` keeps every row.
    sort : bool, default False
        Passed to ``sort_values`` as ``ascending``: ``False`` lists the largest
        counts first, ``True`` the smallest.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Pages'`` and ``'Count'`` with a fresh 0..n-1 index. Besides
        one row per page there is a ``'Total Count'`` row holding the sum of
        all page counts.
    """
    # 'all' is also the helper's own "no filtering" value, so the argument
    # can be forwarded unchanged.
    journeys=filter_vai_subcription(data=data,filter=subcription_type)[target_column]

    visits=Counter(page for journey in journeys for page in journey)
    visits['Total Count']=sum(visits.values())

    table=pd.DataFrame(list(visits.items()),columns=['Pages','Count'])
    return table.sort_values(by='Count',ascending=sort).reset_index(drop=True)

In [13]:
page_count(df,subcription_type='all',sort=False)

Unnamed: 0,Pages,Count
0,Total Count,3282
1,Homepage,634
2,Sign up,480
3,Other,417
4,Courses,344
5,Career tracks,326
6,Pricing,298
7,Coupon,205
8,Career track certificate,170
9,Log in,168


#### Page Presence 

In [14]:
def page_presence(data,target_column='user_journey',subcription_type='all',sort=False):
    """Count in how many journeys each page appears at least once.

    Repeated visits to a page within one journey count only once.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``target_column`` holds one sequence of page names per row.
    target_column : str, default 'user_journey'
        Column containing the journeys.
    subcription_type : str, default 'all'
        Subscription type to restrict the count to; ``'all'`` keeps every row.
    sort : bool, default False
        Passed to ``sort_values`` as ``ascending``: ``False`` lists the largest
        counts first, ``True`` the smallest.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Pages'`` and ``'Count'`` with a fresh 0..n-1 index. Besides
        one row per page there are two summary rows: ``'Total Count'`` (sum of
        all presence counts) and ``'Total Journey'`` (number of journeys
        considered).
    """
    # 'all' is also the helper's own "no filtering" value, so the argument
    # can be forwarded unchanged.
    subset=filter_vai_subcription(data=data,filter=subcription_type)

    # Collapse each journey to its distinct pages before counting.
    distinct_pages=subset[target_column].apply(lambda x : list(set(x)))
    presence=Counter(page for pages in distinct_pages for page in pages)

    presence['Total Count']=sum(presence.values())
    presence['Total Journey']=subset.shape[0]

    table=pd.DataFrame(list(presence.items()),columns=['Pages','Count'])
    return table.sort_values(by='Count',ascending=sort).reset_index(drop=True)

In [15]:
page_presence(df,subcription_type="all")

Unnamed: 0,Pages,Count
0,Total Count,2817
1,Total Journey,1350
2,Homepage,525
3,Sign up,434
4,Other,411
5,Courses,273
6,Pricing,251
7,Career tracks,217
8,Coupon,205
9,Log in,162


#### Get page Set

In [16]:
def get_page_set(data=df,target_column='user_journey'):
    """Return every distinct page name that occurs in any journey.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``target_column`` holds one sequence of page names per
        row. The default is the notebook-level ``df`` as it existed when this
        cell was executed (Python evaluates defaults at definition time).
    target_column : str, default 'user_journey'
        Column containing the journeys.

    Returns
    -------
    list
        The distinct page names. The list is built from a set, so its order
        is arbitrary.
    """
    distinct_per_journey=data[target_column].apply(lambda x : list(set(x)))
    collected=[page for pages in distinct_per_journey for page in pages]
    return list(set(collected))

In [17]:
get_page_set()

['Other',
 'Course certificate',
 'Log in',
 'Courses',
 'Career tracks',
 'Sign up',
 'Instructors',
 'Upcoming courses',
 'Pricing',
 'Checkout',
 'Resources center',
 'Success stories',
 'About us',
 'Coupon',
 'Career track certificate',
 'Homepage',
 'Blog']

#### Page Destination

In [53]:
def page_destination(data=df,target_column='user_journey',subcription_type='all',sort=False):
    """Count, for every page, which page users visited immediately afterwards.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``target_column`` holds one sequence of page names per
        row. The default is the notebook-level ``df`` as it existed when this
        cell was executed (Python evaluates defaults at definition time).
    target_column : str, default 'user_journey'
        Column containing the journeys.
    subcription_type : str, default 'all'
        Subscription type to restrict the count to; ``'all'`` keeps every row.
    sort : bool, default False
        ``False`` orders each page's destinations from most to least
        frequent; ``True`` orders them from least to most frequent.

    Returns
    -------
    dict
        ``{source_page: {next_page: count, ...}, ...}``. Source pages appear
        in the order they were first seen; the inner dictionaries are ordered
        by count as selected with ``sort``. Pages that are never followed by
        another page do not appear as keys.
    """
    if subcription_type!='all':
        # Filter the frame that was passed in. The global ``df`` used to be
        # filtered here, which silently ignored a caller-supplied ``data``.
        data=filter_vai_subcription(data=data,filter=subcription_type)

    transitions={}
    for journey in data[target_column]:
        # Pair every page with its immediate successor.
        for page_source,page_next in zip(journey,journey[1:]):
            followers=transitions.setdefault(page_source,{})
            followers[page_next]=followers.get(page_next,0)+1

    # ``sort=True`` means ascending counts, so the sort is reversed otherwise.
    descending=not sort
    return {
        page:dict(sorted(followers.items(),key=lambda item:item[1],reverse=descending))
        for page,followers in transitions.items()
    }
            


In [56]:
page_destination(data=df,target_column='user_journey',subcription_type='Annual',sort=False)

{'Homepage': {'Pricing': 90,
  'Sign up': 69,
  'Career tracks': 69,
  'Courses': 66,
  'Career track certificate': 31,
  'Log in': 28,
  'Course certificate': 10,
  'Resources center': 9,
  'Upcoming courses': 3,
  'Other': 3,
  'Blog': 1,
  'Instructors': 1,
  'Coupon': 1},
 'Log in': {'Sign up': 14, 'Other': 4, 'Homepage': 2, 'Pricing': 1},
 'Pricing': {'Checkout': 34,
  'Sign up': 34,
  'Courses': 25,
  'Career track certificate': 18,
  'Career tracks': 11,
  'Resources center': 11,
  'Homepage': 7,
  'Log in': 6,
  'Upcoming courses': 4,
  'Course certificate': 4,
  'Blog': 1},
 'Courses': {'Sign up': 45,
  'Career tracks': 36,
  'Pricing': 19,
  'Career track certificate': 11,
  'Homepage': 9,
  'Course certificate': 6,
  'Resources center': 5,
  'Upcoming courses': 5,
  'Log in': 3,
  'Blog': 1,
  'Other': 1,
  'Success stories': 1},
 'Sign up': {'Homepage': 20,
  'Log in': 11,
  'Career tracks': 11,
  'Courses': 10,
  'Pricing': 8,
  'Career track certificate': 8,
  'Upcoming c

#### Page Sequence

In [75]:
def page_sequnce(data=df,target_column='user_journey',sequnce=3,subcription_type='all',sort=False):
    """Count in how many journeys each run of consecutive pages occurs.

    A sliding window of ``sequnce`` pages is moved over every journey. Each
    distinct window is counted at most once per journey, so the result says
    how many journeys contain the sequence, not how often it occurs overall.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``target_column`` holds one sequence of page names per
        row. The default is the notebook-level ``df`` as it existed when this
        cell was executed (Python evaluates defaults at definition time).
    target_column : str, default 'user_journey'
        Column containing the journeys.
    sequnce : int, default 3
        Number of consecutive pages in a window; must be at least 1.
    subcription_type : str, default 'all'
        Subscription type to restrict the count to; ``'all'`` keeps every row.
    sort : bool, default False
        ``False`` orders the result from most to least frequent; ``True``
        from least to most frequent.

    Returns
    -------
    dict
        ``{(page_1, ..., page_n): journey_count}`` ordered by count as
        selected with ``sort``. Journeys shorter than ``sequnce`` contribute
        nothing.

    Raises
    ------
    ValueError
        If ``sequnce`` is smaller than 1 (a window of that size has no
        meaningful page sequence).
    """
    if sequnce<1:
        raise ValueError("sequnce must be a positive integer")

    if subcription_type!='all':
        # Filter the frame that was passed in. The global ``df`` used to be
        # filtered here, which silently ignored a caller-supplied ``data``.
        data=filter_vai_subcription(data=data,filter=subcription_type)

    results={}
    for journey in data[target_column]:
        # Windows already counted for this journey.
        seen=set()
        for start in range(len(journey)-sequnce+1):
            page_combination=tuple(journey[start:start+sequnce])
            if page_combination in seen:
                continue
            seen.add(page_combination)
            results[page_combination]=results.get(page_combination,0)+1

    # ``sort=True`` means ascending counts, so the sort is reversed otherwise.
    return dict(sorted(results.items(),key=lambda item:item[1],reverse=not sort))

In [78]:
page_sequnce(data=df,sequnce=4,subcription_type='Monthly',sort=False)

{('Career tracks', 'Courses', 'Career tracks', 'Courses'): 5,
 ('Homepage', 'Career tracks', 'Homepage', 'Career tracks'): 5,
 ('Courses', 'Career tracks', 'Courses', 'Career tracks'): 4,
 ('Homepage', 'Pricing', 'Courses', 'Career tracks'): 3,
 ('Career track certificate',
  'Career tracks',
  'Career track certificate',
  'Career tracks'): 3,
 ('Homepage', 'Career tracks', 'Courses', 'Career tracks'): 3,
 ('Courses', 'Sign up', 'Courses', 'Sign up'): 3,
 ('Homepage', 'Pricing', 'Checkout', 'Pricing'): 3,
 ('Career tracks', 'Homepage', 'Career tracks', 'Homepage'): 3,
 ('Career tracks', 'Homepage', 'Career tracks', 'Sign up'): 3,
 ('Homepage', 'Other', 'Homepage', 'Pricing'): 2,
 ('Homepage', 'Sign up', 'Homepage', 'Career tracks'): 2,
 ('Career tracks', 'Courses', 'Career tracks', 'Sign up'): 2,
 ('Pricing',
  'Career track certificate',
  'Course certificate',
  'Career track certificate'): 2,
 ('Career tracks', 'Courses', 'Career tracks', 'Homepage'): 2,
 ('Career tracks', 'Homepag

#### Average user journey length

In [81]:
def avg_journey_len(data=df,target_coulumn="user_journey",subcription_type='all'):
    """Return the mean number of pages per journey.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``target_coulumn`` holds one sequence of page names per
        row. The default is the notebook-level ``df`` as it existed when this
        cell was executed (Python evaluates defaults at definition time).
    target_coulumn : str, default "user_journey"
        Column containing the journeys.
    subcription_type : str, default 'all'
        Subscription type to restrict the average to; ``'all'`` keeps every
        row.

    Returns
    -------
    float
        Total number of pages divided by the number of journeys, or NaN when
        no journey is selected (for example an unknown subscription type).
    """
    if subcription_type!='all':
        # Filter the frame that was passed in. The global ``df`` used to be
        # filtered here, which silently ignored a caller-supplied ``data``.
        data=filter_vai_subcription(data=data,filter=subcription_type)

    journey_count=data.shape[0]
    if journey_count==0:
        # The mean of nothing is undefined; report NaN instead of raising
        # ZeroDivisionError.
        return float('nan')

    # Named ``total_pages`` so the builtin ``sum`` is no longer shadowed.
    total_pages=sum(len(journey) for journey in data[target_coulumn])
    return total_pages/journey_count

In [85]:
avg_journey_len(subcription_type="all")

2.431111111111111