In [1]:
import pandas as pd

# Load the dataset
file_path = '~/Downloads/data-1726344898142.csv'
data = pd.read_csv(file_path)

# Convert timestamp to datetime for better handling
data['timestamp'] = pd.to_datetime(data['timestamp'])

# Separate the data into two groups: those who posted and those who didn't
posted_users = data[data['ever_posted'] == True]
non_posted_users = data[data['ever_posted'] == False]

# Filter sequences up to the 'parse' screen
posted_users_before_parse = posted_users[posted_users['screen_name'] != 'parse']
non_posted_users_before_parse = non_posted_users[non_posted_users['screen_name'] != 'parse']

# Count the sequence of screens before the 'parse' screen for each user
posted_sequence_counts = posted_users_before_parse.groupby(['user_id', 'screen_name']).size().unstack(fill_value=0)
non_posted_sequence_counts = non_posted_users_before_parse.groupby(['user_id', 'screen_name']).size().unstack(fill_value=0)

# Summarize the overall screen visit sequences for both groups
posted_sequence_summary = posted_sequence_counts.sum().sort_values(ascending=False)
non_posted_sequence_summary = non_posted_sequence_counts.sum().sort_values(ascending=False)

# Combine the summaries into a single DataFrame for comparison
comparison_df = pd.DataFrame({
    'Posted Users': posted_sequence_summary,
    'Non-Posted Users': non_posted_sequence_summary
}).fillna(0)

# Normalize the sequence counts by the number of users in each group
num_posted_users = posted_users['user_id'].nunique()
num_non_posted_users = non_posted_users['user_id'].nunique()

normalized_comparison_df = pd.DataFrame({
    'Posted Users (Normalized)': posted_sequence_summary / num_posted_users,
    'Non-Posted Users (Normalized)': non_posted_sequence_summary / num_non_posted_users
}).fillna(0)

# Identify screens visited by posted users vs non-posted users
posted_screens = set(posted_users['screen_name'].unique())
non_posted_screens = set(non_posted_users['screen_name'].unique())

# Screens visited by posted users but not by non-posted users
screens_only_posted_visit = posted_screens - non_posted_screens

# Screens visited by non-posted users but not by posted users
screens_only_non_posted_visit = non_posted_screens - posted_screens

# Identify where non-posted users drop off:
# Look at the last screen visited by non-posted users before they stop interacting
non_posted_last_screens = non_posted_users.groupby('user_id')['screen_name'].last().value_counts()

# Analyze the role of the 'account' screen among those who post vs those who don't
account_screen_posted = posted_users[posted_users['screen_name'] == 'account']
account_screen_non_posted = non_posted_users[non_posted_users['screen_name'] == 'account']

account_screen_posted_freq = account_screen_posted.shape[0] / num_posted_users
account_screen_non_posted_freq = account_screen_non_posted.shape[0] / num_non_posted_users

def average_position(df):
    df = df[df['screen_name'] == 'account']
    return df.groupby('user_id').cumcount().mean()

account_screen_posted_position = average_position(posted_users)
account_screen_non_posted_position = average_position(non_posted_users)

account_screen_analysis = {
    'Frequency of Visits (Posted Users)': account_screen_posted_freq,
    'Frequency of Visits (Non-Posted Users)': account_screen_non_posted_freq,
    'Average Position in Sequence (Posted Users)': account_screen_posted_position,
    'Average Position in Sequence (Non-Posted Users)': account_screen_non_posted_position
}

# Analyze the role of the 'entry' screen drop-off for users that post vs those that don't post
entry_screen_posted = posted_users[posted_users['screen_name'] == 'entry']
entry_screen_non_posted = non_posted_users[non_posted_users['screen_name'] == 'entry']

entry_screen_posted_freq = entry_screen_posted.shape[0] / num_posted_users
entry_screen_non_posted_freq = entry_screen_non_posted.shape[0] / num_non_posted_users

posted_last_screen_entry = posted_users.groupby('user_id')['screen_name'].last().eq('entry').mean()
non_posted_last_screen_entry = non_posted_users.groupby('user_id')['screen_name'].last().eq('entry').mean()

entry_screen_analysis = {
    'Frequency of Visits (Posted Users)': entry_screen_posted_freq,
    'Frequency of Visits (Non-Posted Users)': entry_screen_non_posted_freq,
    'Entry Screen as Last Screen Before Drop-Off (Posted Users)': posted_last_screen_entry,
    'Entry Screen as Last Screen Before Drop-Off (Non-Posted Users)': non_posted_last_screen_entry
}

# Output the analysis results
print("Screen Visit Sequence Comparison:")
print(comparison_df)

print("\nNormalized Screen Visit Sequence Comparison:")
print(normalized_comparison_df)

print("\nScreens Only Posted Users Visit:")
print(screens_only_posted_visit)

print("\nScreens Only Non-Posted Users Visit:")
print(screens_only_non_posted_visit)

print("\nMost Common Drop-Off Screens for Non-Posted Users:")
print(non_posted_last_screens.head(10))

print("\nAccount Screen Analysis:")
print(account_screen_analysis)

print("\nEntry Screen Analysis:")
print(entry_screen_analysis)


Screen Visit Sequence Comparison:
                         Posted Users  Non-Posted Users
screen_name                                            
account                            62              35.0
account-danger-zone                 2               2.0
account-feature-details            18               1.0
account-features                   39               5.0
account-reminder                    2               1.0
account-subscriptions              18               9.0
account-support                     1               0.0
characterdetail                    21               0.0
characters                         53               8.0
credits-history                     1               0.0
entities                           36              10.0
entitydetail                       22               0.0
entry                             103              31.0
entrydetail                        30               0.0
epic-gallery                        4               0.0
epics         