In [44]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [45]:
# Load the raw transaction export and keep only completed transactions.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine until it points at a local copy of the CSV.
transactions_csv = r"C:\Users\223099055.HCAD\Downloads\DANA\dana_transactions.csv"
df = pd.read_csv(transactions_csv)
is_completed = df['transaction_status'] == 'completed'
df = df[is_completed]

In [46]:
# Load the trait scores saved by earlier runs so that each section below only
# adds or overwrites its own trait. On a first run there is no pickle yet, so
# start from an empty mapping instead of failing the cell.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# a personality.pickle that was produced by this notebook.
try:
    with open('personality.pickle', 'rb') as f:
        personality = pickle.load(f)
except FileNotFoundError:
    personality = {}

# Openness

In [None]:
# Distinct merchants visited by each user, plus the population mean and
# standard deviation that the openness scoring uses as reference points.
n_merchant_per_user = df.groupby('user_id')['merchant_id'].nunique()

mean_n_merchant = n_merchant_per_user.mean()
std_n_merchant = n_merchant_per_user.std()

In [None]:
# Score each user's openness: start from a neutral 0.5 and adjust it by
# merchant variety, payment method and loyalty-program participation.
for user in df['user_id'].unique():
    # Boolean indexing already returns a new frame and nothing below mutates
    # it, so no explicit copy is needed.
    df_u = df[df['user_id'] == user]
    openness = 0.5

    # based on number of merchants per user, relative to the population
    # mean / standard deviation computed beforehand
    n = df_u['merchant_id'].nunique()
    if n > mean_n_merchant + std_n_merchant:
        openness += 0.2
    elif n > mean_n_merchant:
        openness += 0.05
    elif n < mean_n_merchant - std_n_merchant:
        openness -= 0.05

    # based on payment method
    if 'balance' in df_u['payment_method'].unique():
        openness += 0.05

    # based on loyalty program: look at this user's rows only (df_u).
    # Checking the whole frame would apply the same penalty to every user
    # as soon as any single user had a 'yes'.
    if 'yes' in df_u['loyalty_program'].unique():
        openness -= 0.1

    # Keep any traits already stored for this user; only set 'openness'.
    personality.setdefault(user, {})['openness'] = openness

In [None]:
# Persist the scores gathered so far so later sections/runs can reload them.
with open('personality.pickle', mode='wb') as pickle_out:
    pickle.dump(personality, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)

# Conscientiousness

In [None]:
# Keep only completed transactions that were not refunded.
# NOTE(review): this overwrites `df`, so every later cell that reads `df`
# no longer sees refunded rows.
is_completed = df['transaction_status'] == 'completed'
not_refunded = df['is_refunded'] == 'no'
df = df[is_completed & not_refunded]

In [None]:
def check_transaction_stability(transactions, threshold=10000):
    """Report whether a series of amounts stays within `threshold` of its peak.

    The amounts are considered stable when no value lies more than
    `threshold` below the maximum, i.e. when ``max - min <= threshold``.

    Parameters
    ----------
    transactions : array-like of numbers
        Amounts to compare (the caller passes weekly totals).
    threshold : number, default 10000
        Largest allowed gap between the maximum and any other value.

    Returns
    -------
    bool
        True if every value is within `threshold` of the maximum, or if
        `transactions` is empty (there is no gap to exceed the threshold).
    """
    values = np.asarray(transactions)
    if values.size == 0:
        # np.max raises ValueError on an empty array; treat "no data" as
        # vacuously stable, matching a loop that finds no violating value.
        return True

    # The largest gap to the maximum is always max - min, so a single
    # comparison replaces the element-by-element scan.
    largest_gap = np.max(values) - np.min(values)
    # Phrased as "not (gap > threshold)" so a NaN gap is not reported as
    # unstable, the same outcome a per-element `dif > threshold` test gives.
    return not bool(largest_gap > threshold)

In [None]:
# Score each user's conscientiousness: neutral 0.5, raised for steady weekly
# spending and for attaching notes to most transactions.
for user in df['user_id'].unique():
    df_u = df[df['user_id'] == user].copy()

    # based on the stability of total transaction amount per week
    # (dates are shifted back 7 days, then bucketed with freq='W-MON')
    df_u['new_date'] = pd.to_datetime(df_u['transaction_date']) - pd.to_timedelta(7, unit='d')
    weekly_bins = pd.Grouper(key='new_date', freq='W-MON')
    df_tran_per_week = (
        df_u.groupby([weekly_bins])['transaction_amount']
        .sum()
        .reset_index()
        .sort_values('new_date')
    )
    # Weeks with a non-positive total are left out of the stability check.
    has_spend = df_tran_per_week['transaction_amount'] > 0
    tran_per_week = df_tran_per_week[has_spend]['transaction_amount'].to_numpy()
    stability_bonus = 0.3 if check_transaction_stability(tran_per_week) else 0.0

    # based on number of transactions with notes (more than half of them)
    n_tran_with_note = int((df_u['transaction_notes'] == 'yes').sum())
    note_bonus = 0.1 if n_tran_with_note / len(df_u) > 0.5 else 0.0

    conscien = 0.5
    conscien += stability_bonus
    conscien += note_bonus

    # Keep the user's other traits; only set 'conscientiousness'.
    personality.setdefault(user, {})['conscientiousness'] = conscien

In [None]:
# Persist the scores gathered so far so later sections/runs can reload them.
with open('personality.pickle', mode='wb') as pickle_out:
    pickle.dump(personality, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)

# Extroversion 

In [47]:
# Keep only completed transactions that were not refunded.
# NOTE(review): this overwrites `df`, so every later cell that reads `df`
# no longer sees refunded rows.
is_completed = df['transaction_status'] == 'completed'
not_refunded = df['is_refunded'] == 'no'
df = df[is_completed & not_refunded]

In [49]:
# Score each user's extroversion: neutral 0.5, raised by how many of the
# user's transactions exceed their own mean amount by more than one standard
# deviation (counted below as "impulsive" buys).
for user in df['user_id'].unique():
    df_u = df[df['user_id'] == user].copy()

    amounts = df_u['transaction_amount']
    avg = amounts.mean()
    std = amounts.std()
    n_impulsive_buy = int((amounts > avg + std).sum())

    if n_impulsive_buy >= 6:
        impulse_bonus = 0.25
    elif n_impulsive_buy >= 3:
        impulse_bonus = 0.15
    else:
        impulse_bonus = 0.0
    extro = 0.5 + impulse_bonus

    # Keep the user's other traits; only set 'extroversion'.
    personality.setdefault(user, {})['extroversion'] = extro

In [None]:
# Persist the scores gathered so far so later sections/runs can reload them.
with open('personality.pickle', mode='wb') as pickle_out:
    pickle.dump(personality, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)

# Agreeableness

In [19]:
# Label every merchant by how many distinct users transacted with it.
# Tiers are checked from highest to lowest; a merchant gets the first tier
# whose floor its user count strictly exceeds.
popularity_tiers = (
    (240, 'very_popular'),  # 240 is 1% of total user
    (48, 'popular'),        # 48 is 0.2% of total user
    (24, 'normal'),
)

merchant_popularity = {}
for merchant in df['merchant_id'].unique():
    merchant_rows = df[df['merchant_id'] == merchant]
    n_distinct_users = len(merchant_rows['user_id'].unique())
    merchant_popularity[merchant] = next(
        (tier for floor, tier in popularity_tiers if n_distinct_users > floor),
        'not_popular',
    )

In [23]:
# Score each user's agreeableness: neutral 0.5, shifted according to the
# popularity tier that is most common among the merchants the user visited.
tier_adjustment = {
    'very_popular': 0.3,
    'popular': 0.15,
    'not_popular': -0.15,
}

for user in df['user_id'].unique():
    df_u = df[df['user_id'] == user].copy()

    # based on the popularity of merchant: one label per distinct merchant
    merchant_user = pd.Series(
        [merchant_popularity[merchant] for merchant in df_u['merchant_id'].unique()]
    )
    # Count merchants per tier and take the tier with the highest count.
    tier_counts = merchant_user.groupby(merchant_user).size()
    sorted_popularity = tier_counts.sort_values(ascending=False)
    dominant_tier = sorted_popularity.index[0]

    # 'normal' (or any unlisted tier) leaves the score at 0.5.
    agree = 0.5 + tier_adjustment.get(dominant_tier, 0.0)

    # Keep the user's other traits; only set 'agreeableness'.
    personality.setdefault(user, {})['agreeableness'] = agree

In [51]:
# Persist the scores gathered so far so later sections/runs can reload them.
with open('personality.pickle', mode='wb') as pickle_out:
    pickle.dump(personality, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)

# Neuroticism

In [None]:
# Reload the saved scores before adding the last trait.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# a personality.pickle that was produced by this notebook.
with open('personality.pickle', mode='rb') as pickle_in:
    personality = pickle.load(pickle_in)

In [None]:
# Score each user's neuroticism: neutral 0.5, moved up or down by how often
# the user had refunds and how often they rated a merchant 1.
# NOTE(review): earlier cells reassign `df` to rows with is_refunded == 'no'.
# If those cells ran before this one, the refund count below is always 0 and
# every user receives the -0.1 adjustment -- confirm which `df` is intended.
for user in df['user_id'].unique():
    df_u = df[df['user_id'] == user].copy()

    # based on number of refunded transactions
    # (exactly one refund leaves the score unchanged)
    n_tran_refund = int((df_u['is_refunded'] == 'yes').sum())
    if n_tran_refund > 1:
        refund_adjustment = 0.1
    elif n_tran_refund < 1:
        refund_adjustment = -0.1
    else:
        refund_adjustment = 0.0

    # based on number of transactions with merchant rating one (very bad)
    # (one or two such ratings leave the score unchanged)
    n_rating_one = int((df_u['merchant_rating'] == 1).sum())
    if n_rating_one >= 3:
        rating_adjustment = 0.1
    elif n_rating_one < 1:
        rating_adjustment = -0.1
    else:
        rating_adjustment = 0.0

    neuro = 0.5
    neuro += refund_adjustment
    neuro += rating_adjustment

    # Keep the user's other traits; only set 'neuroticism'.
    personality.setdefault(user, {})['neuroticism'] = neuro

In [None]:
# Persist the final scores, now including neuroticism.
with open('personality.pickle', mode='wb') as pickle_out:
    pickle.dump(personality, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)