In [1]:
import pandas as pd
import ast
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import itertools

In [2]:
def get_data(merchant_name, csv_path="synthetic_transaction_data_Dining_SMALL_w_items.csv"):
    """Load the transactions of a single merchant from the transactions CSV.

    Parameters
    ----------
    merchant_name : str
        Value matched exactly against the ``merchant_name`` column.
    csv_path : str or path-like, optional
        Location of the CSV file. Defaults to the data set this notebook was
        written against, so existing ``get_data(name)`` calls are unchanged.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows (original row index kept).
        The ``items`` column is rewritten from a list literal to a single
        ``", "``-separated string.

    Notes
    -----
    Each ``items`` cell is parsed with ``ast.literal_eval`` and joined with
    ``", "`` -- assumes every cell is the literal of a list of strings
    (TODO confirm against the data set).
    """
    df_full = pd.read_csv(csv_path)
    # Filter first so only the requested merchant's rows are parsed; .copy()
    # gives callers a frame they can mutate without touching df_full.
    df = df_full[df_full['merchant_name'] == merchant_name].copy()
    df['items'] = [", ".join(ast.literal_eval(raw_items)) for raw_items in df['items']]
    return df

In [None]:
# Load the rows whose merchant_name is "McDonald's" and preview the first few.
df_md = get_data("McDonald's")
df_md.head()

In [None]:
# Load the rows whose merchant_name is 'Subway' and preview the first few.
df_sw = get_data('Subway')
df_sw.head()

In [None]:
# List the column names available in the Subway frame.
print(df_sw.columns)

In [6]:
# Build the per-customer RFM table for the McDonald's transactions.
# NOTE: `df` is the same object as `df_md`, so the date conversion below is
# visible through both names.
df = df_md
df['transaction_date'] = pd.to_datetime(df['transaction_date'])

# Reference date for recency: the most recent transaction in this frame.
NOW = df['transaction_date'].max()

rfmTable = (
    df.groupby('cardholder_name')
    .agg({
        # days between the reference date and the customer's last purchase
        'transaction_date': lambda dates: (NOW - dates.max()).days,
        # number of transactions
        'transaction_id': lambda ids: len(ids),
        # total amount spent
        'transaction_amount': lambda amounts: amounts.sum(),
    })
    .astype({'transaction_date': int})
    .rename(columns={
        'transaction_date': 'recency',
        'transaction_id': 'frequency',
        'transaction_amount': 'monetary_value',
    })
    .reset_index()
)

In [7]:
# Score every RFM dimension from 1 to 4 by quartile, with 4 as the "best"
# bucket -- the segment rules in this notebook treat scores of 3-4 as good.
#
# Recency is measured in days since the last purchase, so FEWER days is
# better: its labels are reversed so the most recent quartile scores 4.
# (Previously the most recent customers were scored 1, which made e.g.
# "Premium Customer" select the customers who had been away the longest.)
rfmTable['r_quartile'] = pd.qcut(rfmTable['recency'], q=4, labels=[4, 3, 2, 1], duplicates='raise')

# Frequency and monetary value: larger is better, so labels ascend.
# NOTE(review): duplicates='drop' merges identical quantile edges, which leaves
# fewer than four bins while four labels are still supplied; pandas is expected
# to reject that combination with a ValueError on heavily tied data -- confirm
# and consider ranking the values first if this data set hits it.
rfmTable['f_quartile'] = pd.qcut(rfmTable['frequency'], q=4, labels=range(1,5), duplicates='drop')
rfmTable['m_quartile'] = pd.qcut(rfmTable['monetary_value'], q=4, labels=range(1,5), duplicates='drop')

# reset_index() without drop=True keeps the previous index as an 'index' column.
rfm_data = rfmTable.reset_index()

In [8]:
# Turn the three quartile scores into strings and concatenate them into a
# three-character RFM_score (recency digit, frequency digit, monetary digit).
# The trailing reset_index() (no drop) again keeps the previous index as an
# extra leading column.
rfm_data = (
    rfm_data
    .assign(**{
        quartile: rfm_data[quartile].astype(str)
        for quartile in ('r_quartile', 'f_quartile', 'm_quartile')
    })
    .assign(
        RFM_score=lambda scored: scored['r_quartile'] + scored['f_quartile'] + scored['m_quartile']
    )
    .reset_index()
)

In [9]:
# RFM_score codes belonging to each named segment. Any code that is not listed
# keeps the default segment 'Other'. Segments are applied in the order written.
SEGMENT_SCORES = {
    # all three scores are 3 or 4
    'Premium Customer': ['334', '443', '444', '344', '434', '433', '343', '333'],
    'Repeat Customer': ['244', '234', '232', '332', '143', '233', '243'],
    'Top Spender': ['424', '414', '144', '314', '324', '124', '224', '423',
                    '413', '133', '323', '313', '134'],
    'At Risk Customer': ['422', '223', '212', '122', '222', '132', '322',
                         '312', '412', '123', '214'],
    'Inactive Customer': ['411', '111', '113', '114', '112', '211', '311'],
}

rfm_data['customer_segment'] = 'Other'
for segment_label, score_codes in SEGMENT_SCORES.items():
    in_segment = rfm_data['RFM_score'].isin(score_codes)
    rfm_data.loc[in_segment, 'customer_segment'] = segment_label

# Discard the helper 'index' column left behind by an earlier reset_index().
rfm_data = rfm_data.drop(columns='index')

In [None]:
# Preview the scored and segmented RFM table.
rfm_data.head()

In [None]:
# Number of customers assigned to each segment.
print(Counter(rfm_data['customer_segment']))

In [12]:
def generate_recommendations(target_customer, cohort, num_recommendations=10):
    """Suggest items bought by the customers most similar to ``target_customer``.

    Every customer in ``cohort`` is represented by the distinct items they have
    bought; customers are compared via TF-IDF vectors of those item names and
    cosine similarity.

    Parameters
    ----------
    target_customer : str
        Value of ``cardholder_name`` to build recommendations for.
    cohort : pandas.DataFrame
        Transactions with a ``cardholder_name`` column and an ``items`` column
        holding ``", "``-separated item names.
    num_recommendations : int, optional
        Number of most-similar customers whose baskets are consulted. Despite
        the name, this is NOT a cap on the number of items returned.

    Returns
    -------
    list of str
        Items bought by the similar customers that the target has not bought,
        without duplicates, ordered from the most similar customer to the
        least. Empty if ``target_customer`` does not occur in ``cohort``.
    """
    # One ", "-joined string of all purchases per customer (groupby sorts by name).
    purchases_by_customer = cohort.groupby('cardholder_name')['items'].apply(
        lambda purchases: ', '.join(purchases)
    )
    customers = list(purchases_by_customer.index)
    if target_customer not in customers:
        # Unknown customer: nothing to compare against (was a bare IndexError).
        return []
    target_index = customers.index(target_customer)

    # Distinct items per customer; dict.fromkeys de-duplicates in first-seen order.
    baskets = [list(dict.fromkeys(joined.split(', '))) for joined in purchases_by_customer]
    documents = [', '.join(basket) for basket in baskets]

    tfidf_matrix = TfidfVectorizer().fit_transform(documents)

    # Only the target's similarities are needed, so compare that one row
    # against all customers instead of building the full n x n matrix.
    target_row = tfidf_matrix[target_index:target_index + 1]
    similarities = cosine_similarity(target_row, tfidf_matrix)[0]

    # Most similar first. The target is removed explicitly: dropping "the first
    # entry" is wrong when another customer ties with the target at 1.0.
    ranked = similarities.argsort()[::-1]
    similar_customers = [index for index in ranked if index != target_index][:num_recommendations]

    # Compare item by item. The earlier set(<string>) produced a set of
    # characters, so already-purchased items were never filtered out.
    seen = set(baskets[target_index])
    recommendations = []
    for customer_index in similar_customers:
        for item in baskets[customer_index]:
            if item not in seen:
                seen.add(item)
                recommendations.append(item)
    return recommendations


In [13]:
def customer_analysis(name, cohort):
    """Return the recommended items that ``name`` has not purchased yet.

    Recommendations come from ``generate_recommendations`` (using the 10 most
    similar customers in ``cohort``); every item appearing in any of the
    customer's own ``items`` strings is then removed.

    Parameters
    ----------
    name : str
        ``cardholder_name`` of the customer to analyse.
    cohort : pandas.DataFrame
        Transactions with ``cardholder_name`` and ``", "``-separated ``items``.

    Returns
    -------
    set of str
        Recommended items not present in the customer's purchase history.
    """
    suggested = generate_recommendations(name, cohort, num_recommendations=10)

    # Collect every individual item from the customer's own transactions.
    own_baskets = cohort.loc[cohort['cardholder_name'] == name, 'items']
    purchased = set()
    for basket in own_baskets:
        purchased.update(basket.split(', '))

    return set(suggested).difference(purchased)

In [14]:
# Keep only the 'Premium Customer' rows of the RFM table (this overwrites
# rfm_data), then select those customers' transactions from df.
rfm_data = rfm_data.loc[rfm_data['customer_segment'] == 'Premium Customer']
premium = list(set(rfm_data['cardholder_name']))
df_premium = df.loc[df['cardholder_name'].isin(premium)]

In [None]:
# Recommendations per premium customer, computed within the premium cohort.
# Only customers with at least two recommended items are kept and printed.
names = dict()
# df_premium holds one row per transaction, so iterating the raw column would
# rerun the full analysis once per transaction. dict.fromkeys yields each
# customer once, in first-seen order, which keeps the printed order unchanged.
for name in dict.fromkeys(df_premium['cardholder_name']):
    recs = customer_analysis(name, df_premium)
    if len(recs) >= 2:
        names[name] = recs
        print(f"Recommendations for {name}: ", recs)

In [None]:
# Recommendations per customer, computed against every customer in df.
# Customers with at least one recommended item are kept and printed.
names = dict()
# df holds one row per transaction, so iterating the raw column would rerun
# the full analysis once per transaction. dict.fromkeys yields each customer
# once, in first-seen order, which keeps the printed order unchanged.
for name in dict.fromkeys(df['cardholder_name']):
    recs = customer_analysis(name, df)
    if recs:
        names[name] = recs
        print(f"Recommendations for {name}: ", recs)

In [18]:
# Build the per-customer RFM table for the Subway transactions.
# NOTE: `df` is the same object as `df_sw`, so the date conversion below is
# visible through both names.
df = df_sw
df['transaction_date'] = pd.to_datetime(df['transaction_date'])

# Reference date for recency: the most recent transaction in this frame.
NOW = df['transaction_date'].max()

rfmTable = (
    df.groupby('cardholder_name')
    .agg({
        # days between the reference date and the customer's last purchase
        'transaction_date': lambda dates: (NOW - dates.max()).days,
        # number of transactions
        'transaction_id': lambda ids: len(ids),
        # total amount spent
        'transaction_amount': lambda amounts: amounts.sum(),
    })
    .astype({'transaction_date': int})
    .rename(columns={
        'transaction_date': 'recency',
        'transaction_id': 'frequency',
        'transaction_amount': 'monetary_value',
    })
    .reset_index()
)

In [19]:
# Score every RFM dimension from 1 to 4 by quartile, with 4 as the "best"
# bucket -- the segment rules in this notebook treat scores of 3-4 as good.
#
# Recency is measured in days since the last purchase, so FEWER days is
# better: its labels are reversed so the most recent quartile scores 4.
# (Previously the most recent customers were scored 1, which made e.g.
# "Premium Customer" select the customers who had been away the longest.)
rfmTable['r_quartile'] = pd.qcut(rfmTable['recency'], q=4, labels=[4, 3, 2, 1], duplicates='raise')

# Frequency and monetary value: larger is better, so labels ascend.
# NOTE(review): duplicates='drop' merges identical quantile edges, which leaves
# fewer than four bins while four labels are still supplied; pandas is expected
# to reject that combination with a ValueError on heavily tied data -- confirm
# and consider ranking the values first if this data set hits it.
rfmTable['f_quartile'] = pd.qcut(rfmTable['frequency'], q=4, labels=range(1,5), duplicates='drop')
rfmTable['m_quartile'] = pd.qcut(rfmTable['monetary_value'], q=4, labels=range(1,5), duplicates='drop')

# reset_index() without drop=True keeps the previous index as an 'index' column.
rfm_data = rfmTable.reset_index()

In [20]:
# Turn the three quartile scores into strings and concatenate them into a
# three-character RFM_score (recency digit, frequency digit, monetary digit).
# The trailing reset_index() (no drop) again keeps the previous index as an
# extra leading column.
rfm_data = (
    rfm_data
    .assign(**{
        quartile: rfm_data[quartile].astype(str)
        for quartile in ('r_quartile', 'f_quartile', 'm_quartile')
    })
    .assign(
        RFM_score=lambda scored: scored['r_quartile'] + scored['f_quartile'] + scored['m_quartile']
    )
    .reset_index()
)

In [21]:
# RFM_score codes belonging to each named segment. Any code that is not listed
# keeps the default segment 'Other'. Segments are applied in the order written.
SEGMENT_SCORES = {
    # all three scores are 3 or 4
    'Premium Customer': ['334', '443', '444', '344', '434', '433', '343', '333'],
    'Repeat Customer': ['244', '234', '232', '332', '143', '233', '243'],
    'Top Spender': ['424', '414', '144', '314', '324', '124', '224', '423',
                    '413', '133', '323', '313', '134'],
    'At Risk Customer': ['422', '223', '212', '122', '222', '132', '322',
                         '312', '412', '123', '214'],
    'Inactive Customer': ['411', '111', '113', '114', '112', '211', '311'],
}

rfm_data['customer_segment'] = 'Other'
for segment_label, score_codes in SEGMENT_SCORES.items():
    in_segment = rfm_data['RFM_score'].isin(score_codes)
    rfm_data.loc[in_segment, 'customer_segment'] = segment_label

# Discard the helper 'index' column left behind by an earlier reset_index().
rfm_data = rfm_data.drop(columns='index')

In [22]:
# Keep only the 'Premium Customer' rows of the RFM table (this overwrites
# rfm_data), then select those customers' transactions from df.
rfm_data = rfm_data.loc[rfm_data['customer_segment'] == 'Premium Customer']
premium = list(set(rfm_data['cardholder_name']))
df_premium = df.loc[df['cardholder_name'].isin(premium)]

In [None]:
# Recommendations per premium customer, computed within the premium cohort.
# Only customers with at least two recommended items are kept and printed.
names = dict()
# df_premium holds one row per transaction, so iterating the raw column would
# rerun the full analysis once per transaction. dict.fromkeys yields each
# customer once, in first-seen order, which keeps the printed order unchanged.
for name in dict.fromkeys(df_premium['cardholder_name']):
    recs = customer_analysis(name, df_premium)
    if len(recs) >= 2:
        names[name] = recs
        print(f"Recommendations for {name}: ", recs)

In [None]:
# Recommendations per customer, computed against every customer in df.
# Customers with at least one recommended item are kept and printed.
names = dict()
# Unique customers in first-seen order. A set was used here before, but set
# iteration order for strings can change between kernel sessions, which made
# the printed order differ from run to run.
customers = list(dict.fromkeys(df['cardholder_name']))
for name in customers:
    recs = customer_analysis(name, df)
    if recs:
        names[name] = recs
        print(f"Recommendations for {name}: ", recs)