In [12]:
import pandas as pd


# Check the columns in the dataset
print(df_user_behavior_logs.columns)

# Calculate the user-merchant interaction frequency
interaction_frequency = df_user_behavior_logs.groupby(['user_id']).size().reset_index(name='interaction_count')

# Merge the interaction frequency with the original dataset
df_merged = pd.merge(df_user_behavior_logs, interaction_frequency, on='user_id', how='left')

# Print the updated dataset
print(df_merged.head())


Index(['user_id', 'item_id', 'cat_id', 'seller_id', 'brand_id', 'time_stamp',
       'action_type'],
      dtype='object')
   user_id  item_id  cat_id  seller_id  brand_id  time_stamp  action_type  \
0   309587     1446     611       2353    2144.0         925            0   
1   309587     1446     611       2353    2144.0         925            0   
2   201592     1450     229        142    1845.0         717            0   
3   370402     1450     229        142    1845.0         707            0   
4   248429     1450     229        142    1845.0         618            0   

   interaction_count  
0                  2  
1                  2  
2                  1  
3                  1  
4                  2  


In [13]:
import pandas as pd

# Filter the data for purchase actions only
df_purchases = df_user_behavior_logs[df_user_behavior_logs['action_type'] == 2]

# Calculate the purchase frequency for each user
purchase_frequency = df_purchases.groupby('user_id').size().reset_index(name='purchase_count')

# Merge the purchase frequency with the original dataset
df_merged = pd.merge(df_user_behavior_logs, purchase_frequency, on='user_id', how='left')

# Print the updated dataset
print(df_merged.head())


   user_id  item_id  cat_id  seller_id  brand_id  time_stamp  action_type  \
0   309587     1446     611       2353    2144.0         925            0   
1   309587     1446     611       2353    2144.0         925            0   
2   201592     1450     229        142    1845.0         717            0   
3   370402     1450     229        142    1845.0         707            0   
4   248429     1450     229        142    1845.0         618            0   

   purchase_count  
0             NaN  
1             NaN  
2             NaN  
3             NaN  
4             1.0  


In [14]:
import pandas as pd

# Calculate the user-merchant category diversity
user_merchant_category_diversity = df_user_behavior_logs.groupby(['user_id', 'seller_id'])['cat_id'].nunique().reset_index(name='category_diversity')

# Merge the category diversity with the original dataset
df_merged = pd.merge(df_user_behavior_logs, user_merchant_category_diversity, on=['user_id', 'seller_id'], how='left')

# Print the updated dataset
print(df_merged.head())


   user_id  item_id  cat_id  seller_id  brand_id  time_stamp  action_type  \
0   309587     1446     611       2353    2144.0         925            0   
1   309587     1446     611       2353    2144.0         925            0   
2   201592     1450     229        142    1845.0         717            0   
3   370402     1450     229        142    1845.0         707            0   
4   248429     1450     229        142    1845.0         618            0   

   category_diversity  
0                   1  
1                   1  
2                   1  
3                   1  
4                   1  


In [15]:
import pandas as pd

# Convert the 'time_stamp' column to datetime
df_user_behavior_logs['time_stamp'] = pd.to_datetime(df_user_behavior_logs['time_stamp'], format='%m%d')

# Calculate the user purchase frequency
user_purchase_frequency = df_user_behavior_logs[df_user_behavior_logs['action_type'] == 2].groupby('user_id')['time_stamp'].nunique().reset_index(name='purchase_frequency')

# Merge the purchase frequency with the original dataset
df_merged = pd.merge(df_user_behavior_logs, user_purchase_frequency, on='user_id', how='left')

# Print the updated dataset
print(df_merged.head())


   user_id  item_id  cat_id  seller_id  brand_id time_stamp  action_type  \
0   309587     1446     611       2353    2144.0 1900-09-25            0   
1   309587     1446     611       2353    2144.0 1900-09-25            0   
2   201592     1450     229        142    1845.0 1900-07-17            0   
3   370402     1450     229        142    1845.0 1900-07-07            0   
4   248429     1450     229        142    1845.0 1900-06-18            0   

   purchase_frequency  
0                 NaN  
1                 NaN  
2                 NaN  
3                 NaN  
4                 1.0  


In [16]:
import pandas as pd

# Calculate the merchant popularity
merchant_popularity = df_user_behavior_logs.groupby('seller_id').size().reset_index(name='popularity')

# Merge the merchant popularity with the original dataset
df_merged = pd.merge(df_user_behavior_logs, merchant_popularity, on='seller_id', how='left')

# Print the updated dataset
print(df_merged.head())


   user_id  item_id  cat_id  seller_id  brand_id time_stamp  action_type  \
0   309587     1446     611       2353    2144.0 1900-09-25            0   
1   309587     1446     611       2353    2144.0 1900-09-25            0   
2   201592     1450     229        142    1845.0 1900-07-17            0   
3   370402     1450     229        142    1845.0 1900-07-07            0   
4   248429     1450     229        142    1845.0 1900-06-18            0   

   popularity  
0           2  
1           2  
2           7  
3           7  
4           7  


In [17]:
import pandas as pd

# Calculate the user purchase history
user_purchase_history = df_user_behavior_logs.groupby('user_id')['action_type'].apply(lambda x: (x == 2).sum()).reset_index(name='purchase_history')

# Merge the user purchase history with the original dataset
df_merged = pd.merge(df_user_behavior_logs, user_purchase_history, on='user_id', how='left')

# Print the updated dataset
print(df_merged.head())


   user_id  item_id  cat_id  seller_id  brand_id time_stamp  action_type  \
0   309587     1446     611       2353    2144.0 1900-09-25            0   
1   309587     1446     611       2353    2144.0 1900-09-25            0   
2   201592     1450     229        142    1845.0 1900-07-17            0   
3   370402     1450     229        142    1845.0 1900-07-07            0   
4   248429     1450     229        142    1845.0 1900-06-18            0   

   purchase_history  
0                 0  
1                 0  
2                 0  
3                 0  
4                 1  


In [18]:
import pandas as pd

# Merge the user behavior and user profile data based on user_id
df_merged = pd.merge(df_user_behavior_logs, df_user_profile, on='user_id')

# Calculate the interaction count by age group and gender
interaction_count = df_merged.groupby(['age_range', 'gender']).size().reset_index(name='interaction_count')

# Calculate the total count by age group
total_count_by_age = df_user_profile.groupby('age_range').size().reset_index(name='total_count')

# Merge the interaction count and total count by age group
df_ratio = pd.merge(interaction_count, total_count_by_age, on='age_range')

# Calculate the interaction ratio
df_ratio['interaction_ratio'] = df_ratio['interaction_count'] / df_ratio['total_count']

# Print the interaction ratio by age group and gender
print(df_ratio)


    age_range  gender  interaction_count  total_count  interaction_ratio
0         0.0     0.0                 13        92914           0.000140
1         0.0     1.0                  1        92914           0.000011
2         2.0     0.0                  6        52871           0.000113
3         3.0     0.0                 25       111654           0.000224
4         3.0     1.0                  4       111654           0.000036
5         3.0     2.0                  5       111654           0.000045
6         4.0     0.0                 42        79991           0.000525
7         4.0     1.0                 29        79991           0.000363
8         4.0     2.0                  5        79991           0.000063
9         5.0     0.0                 56        40777           0.001373
10        5.0     1.0                 10        40777           0.000245
11        6.0     0.0                 13        35464           0.000367
12        6.0     1.0                  4        354

In [19]:
import pandas as pd

# Convert 'time_stamp' column to datetime type
df_user_behavior_logs['time_stamp'] = pd.to_datetime(df_user_behavior_logs['time_stamp'], format='%m%d')

# Group the data by user and merchant and calculate the maximum purchase date
purchase_recency = df_user_behavior_logs.groupby(['user_id', 'seller_id'])['time_stamp'].max().reset_index(name='latest_purchase_date')

# Calculate the purchase recency in days from the latest date
latest_date = df_user_behavior_logs['time_stamp'].max()
purchase_recency['purchase_recency'] = (latest_date - purchase_recency['latest_purchase_date']).dt.days

# Print the purchase recency for each user and merchant
print(purchase_recency)


     user_id  seller_id latest_purchase_date  purchase_recency
0       2799       4044           1900-07-29               105
1       2859       1679           1900-09-12                60
2       4351        790           1900-06-28               136
3       6230       1606           1900-10-24                18
4       6539       2336           1900-11-11                 0
..       ...        ...                  ...               ...
148   405751       3727           1900-11-06                 5
149   406287        543           1900-06-13               151
150   410585       1674           1900-05-24               171
151   413594       2617           1900-11-11                 0
152   421338        286           1900-11-09                 2

[153 rows x 4 columns]


In [20]:
import pandas as pd

# Load the data into a pandas DataFrame
df = pd.read_csv('/content/drive/MyDrive/PR Project/user_log_format1.csv')

# Generate the statistical summary
statistics_summary = df.describe()

# Print the statistical summary
print(statistics_summary)


             user_id      item_id       cat_id    seller_id     brand_id  \
count     227.000000   227.000000   227.000000   227.000000   226.000000   
mean   200388.585903  1548.277533   821.312775  1832.317181  4239.017699   
std    121242.614659    47.637622   379.871594  1237.439465  2693.743982   
min      2799.000000  1446.000000   119.000000     1.000000    82.000000   
25%     92113.000000  1521.000000   662.000000   543.000000  1662.000000   
50%    203647.000000  1559.000000   737.000000  2161.000000  4382.000000   
75%    289349.000000  1592.000000  1198.000000  2617.000000  6483.000000   
max    421338.000000  1609.000000  1596.000000  4940.000000  8420.000000   

        time_stamp  action_type  
count   227.000000   227.000000  
mean    963.299559     0.246696  
std     183.566940     0.758973  
min     521.000000     0.000000  
25%     864.500000     0.000000  
50%    1030.000000     0.000000  
75%    1110.000000     0.000000  
max    1111.000000     3.000000  
