## Importing Libraries

In [2]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

## Loading Dataset

In [3]:
def load_dataset(dataset_name, dataset_path = "Dataset"):
    """Read one CSV file from the dataset directory into a DataFrame.

    Parameters
    ----------
    dataset_name : str
        File name of the CSV to read, e.g. ``"Customers.csv"``.
    dataset_path : str, optional
        Directory containing the CSV files. A trailing path separator is
        accepted but not required. Defaults to the ``Dataset`` folder,
        resolved relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    # os.path.join inserts the platform's own separator, so the same call
    # works on Windows and POSIX and with or without a trailing separator.
    csv_path = os.path.join(dataset_path, dataset_name)
    return pd.read_csv(csv_path)
    
# Load the three raw tables from the default dataset directory.
customers_df = load_dataset("Customers.csv")        # one row per customer
products_df = load_dataset("Products.csv")          # product catalogue
transactions_df = load_dataset("Transactions.csv")  # purchase history

### Combining Customer Transaction History Info

In [4]:
# Collapse each customer's purchase history into a single row: total spend,
# number of transactions and total quantity bought.
transactions_details = (
    transactions_df
    .groupby('CustomerID')
    .agg(
        TotalAmount = ('TotalValue', 'sum'),
        TotalTransactions = ('TransactionID', 'count'),
        TotalQuantity = ('Quantity', 'sum'),
    )
    .reset_index()
)

# NOTE: this is an inner join, so customers without any transaction are
# dropped. The rows of `dataset` (and of everything derived from it) are
# therefore not guaranteed to line up positionally with `customers_df`;
# `ids` below records which customer each feature row belongs to.
dataset = pd.merge(customers_df, transactions_details, on = 'CustomerID', how = 'inner')

# One-hot encode Region; drop_first omits the first category to avoid a
# redundant (perfectly collinear) column.
dummyDataset = pd.get_dummies(dataset['Region'], drop_first = True)
dataset = pd.concat([dummyDataset, dataset], axis = 1)
ids = dataset['CustomerID']
# Keep only the numeric feature columns (non-mutating drop instead of inplace).
dataset = dataset.drop(columns = ['Region', 'CustomerID', 'CustomerName', 'SignupDate'])

scaler = StandardScaler()
scaledData = scaler.fit_transform(dataset)
# Take the column labels from the feature frame itself rather than a
# hard-coded list, so a different set of Region values cannot mislabel
# columns or raise a length-mismatch error.
scaledDf = pd.DataFrame(scaledData, columns = dataset.columns, index = dataset.index)

### Using Cosine Similarity Metric

Compute the pairwise cosine similarity between customers, using their scaled feature vectors.

In [5]:
# Pairwise cosine similarity between the rows of scaledDf:
# matrix[i][j] is the similarity between feature row i and feature row j.
matrix = cosine_similarity(scaledDf)


In [6]:
def get_lookalikes(customer_id, top_n = 3, similarity_matrix = None, customer_ids = None):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : str
        ID of the customer to find lookalikes for.
    top_n : int, optional
        Number of lookalikes to return (default 3).
    similarity_matrix : array-like, optional
        Square matrix of pairwise similarity scores. Defaults to the
        notebook-level ``matrix``.
    customer_ids : sequence, optional
        Customer ID of each row/column of ``similarity_matrix``, in the same
        order. Defaults to the notebook-level ``ids``.

    Returns
    -------
    pandas.DataFrame
        Columns ``CustomerID`` and ``SimilarityScore``, sorted by descending
        score and indexed by the row position inside the similarity matrix.
        The queried customer itself is never included.

    Raises
    ------
    IndexError
        If ``customer_id`` has no row in the similarity matrix (for example
        a customer that was dropped because it has no transactions).
    """
    if similarity_matrix is None:
        similarity_matrix = matrix
    if customer_ids is None:
        customer_ids = ids

    # Look the customer up among the IDs the matrix was built from. The raw
    # customer table can contain customers that never made it into the
    # feature matrix, so its row positions must not be used here.
    id_array = np.asarray(customer_ids)
    matches = np.flatnonzero(id_array == customer_id)
    if matches.size == 0:
        raise IndexError(f"customer {customer_id!r} has no row in the similarity matrix")
    row = int(matches[0])

    similarities = np.asarray(similarity_matrix)[row]

    # Rank by descending score (stable, so ties keep their row order) and drop
    # the customer itself explicitly: with ties or floating-point rounding the
    # self-similarity is not guaranteed to sort first.
    ranked = np.argsort(-similarities, kind = 'stable')
    ranked = ranked[ranked != row][:top_n]

    return pd.DataFrame(
        {'CustomerID': id_array[ranked], 'SimilarityScore': similarities[ranked]},
        index = ranked,
    )


In [7]:
# Number of leading customers (in customers_df order) to export lookalikes for.
TARGET_CUSTOMER_COUNT = 20
target_customer_ids = customers_df['CustomerID'].iloc[:TARGET_CUSTOMER_COUNT]

# One lookalike table per target customer, in the same order as the IDs.
lookalike_list = [get_lookalikes(customer_id) for customer_id in target_customer_ids]

output_columns = ['CustomerID', 'LookalikeCustomerID', 'SimilarityScore']

# Pair each target ID with its own result via zip rather than indexing the
# list by DataFrame index label, which is only correct for a default
# RangeIndex. The scalar CustomerID is broadcast to however many lookalikes
# were returned.
lookalike_frames = [
    pd.DataFrame(
        {
            'CustomerID': customer_id,
            'LookalikeCustomerID': lookalikes['CustomerID'].to_numpy(),
            'SimilarityScore': lookalikes['SimilarityScore'].to_numpy(),
        },
        columns = output_columns,
    )
    for customer_id, lookalikes in zip(target_customer_ids, lookalike_list)
]

# Concatenate once instead of growing the frame inside a loop (quadratic,
# and concatenating onto an empty frame is deprecated in recent pandas).
if lookalike_frames:
    lookalike_df = pd.concat(lookalike_frames, ignore_index = True)
else:
    # No customers to process: still write a header-only file.
    lookalike_df = pd.DataFrame(columns = output_columns)

lookalike_df.to_csv('Lookalike.csv', index=False)