In [3]:
import pandas as pd

# Load datasets
customers = pd.read_csv('Downloads/Customers.csv')
products = pd.read_csv('Downloads/Products.csv')
transactions = pd.read_csv('Downloads/Transactions.csv')

# Preview datasets
print(customers.head())
print(products.head())
print(transactions.head())

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [4]:
merged_data = pd.merge(transactions, customers, on='CustomerID')
merged_data = pd.merge(merged_data, products, on='ProductID')

customer_profiles = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',      # Total spending
    'TransactionID': 'count' # Number of transactions
}).reset_index()

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
normalized_data = scaler.fit_transform(customer_profiles[['TotalValue', 'TransactionID']])

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(normalized_data)

lookalikes = {}
for i in range(len(similarity_matrix)):
    similar_customers = similarity_matrix[i].argsort()[-4:-1]
    scores = similarity_matrix[i][similar_customers]
    lookalikes[customer_profiles['CustomerID'][i]] = list(zip(customer_profiles['CustomerID'][similar_customers], scores))

In [10]:
print(lookalikes)

{'C0001': [('C0056', np.float64(0.9939465070361473)), ('C0152', np.float64(0.9976828713249932)), ('C0137', np.float64(0.999567362853379))], 'C0002': [('C0010', np.float64(0.9993855771185838)), ('C0199', np.float64(0.9994902392200663)), ('C0029', np.float64(0.9998164613385201))], 'C0003': [('C0144', np.float64(0.999981447028915)), ('C0150', np.float64(0.9999900049519272)), ('C0095', np.float64(0.9999989538238129))], 'C0004': [('C0075', np.float64(0.999625100734789)), ('C0021', np.float64(0.9998655690130588)), ('C0067', np.float64(0.9999925612108973))], 'C0005': [('C0150', np.float64(0.9999778956667101)), ('C0144', np.float64(0.9999873569686522)), ('C0130', np.float64(0.9999953849248338))], 'C0006': [('C0196', np.float64(0.945939603614914)), ('C0117', np.float64(0.9893342566876119)), ('C0079', np.float64(0.9998805962028247))], 'C0007': [('C0085', np.float64(0.9999627719734157)), ('C0193', np.float64(0.9999679390437886)), ('C0092', np.float64(0.999998129945819))], 'C0008': [('C0154', np.f

In [13]:
import os

# Create the directory if it doesn't exist
output_dir = 'outputs'
os.makedirs(output_dir, exist_ok=True)

In [15]:
lookalike_df.to_csv(f'{output_dir}/FirstName_LastName_Lookalike.csv', index=False)

In [16]:
import os
import pandas as pd

# Create directory if it doesn't exist
output_dir = 'outputs'
os.makedirs(output_dir, exist_ok=True)

# Process data (assuming `lookalikes` is already defined)
lookalike_list = []
for customer_id, recommendations in lookalikes.items():
    for similar_id, score in recommendations:
        lookalike_list.append((customer_id, similar_id, score))

# Create DataFrame
lookalike_df = pd.DataFrame(lookalike_list, columns=['CustomerID', 'Similar_CustomerID', 'Score'])

# Save to CSV
lookalike_df.to_csv(f'{output_dir}/FirstName_LastName_Lookalike.csv', index=False)

In [17]:
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.decomposition import PCA

# Apply KMeans clustering
kmeans = KMeans(n_clusters=4, random_state=42)
customer_profiles['Cluster'] = kmeans.fit_predict(normalized_data)

# Calculate Davies-Bouldin Index
db_index = davies_bouldin_score(normalized_data, customer_profiles['Cluster'])
print(f'Davies-Bouldin Index: {db_index}')


Davies-Bouldin Index: 0.8595340221510472


In [18]:
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(normalized_data)

plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=customer_profiles['Cluster'], cmap='viridis')
plt.title('Customer Segments')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()


NameError: name 'plt' is not defined