In [40]:
import pandas as pd

In [41]:
# Read the previously exported similarity scores; the first CSV column
# becomes the row index (used as the customer ID by the cells below).
similarity_df = pd.read_csv(
    "../output/lookalike_results.csv",
    index_col=0,
)

In [42]:
# Settings and accumulator for the "top N most similar customers" table.
# top_n: how many lookalikes are reported per customer.
top_n = 3
# One list per customer is collected here before the DataFrame is built.
lookalike_results = list()

In [43]:
# Build one output row per customer:
#   [customer_id, match_1, score_1, ..., match_top_n, score_top_n]
#
# The list is rebuilt from scratch here so that re-running this cell does not
# append a second copy of every row to results left over from an earlier run.
lookalike_results = []

for customer_id in similarity_df.index:
    # Scores for this customer against every column, missing values removed,
    # ordered from most to least similar.
    scores = similarity_df.loc[customer_id].dropna().sort_values(ascending=False)

    # A customer must not be listed as their own lookalike.
    scores = scores[scores.index != customer_id]

    # Keep at most top_n candidates (fewer if not enough valid scores exist).
    top_customers = scores.head(top_n)

    # Flatten the (similar customer, score) pairs after the customer ID.
    row = [customer_id]
    for similar_customer, score in top_customers.items():
        row.extend([similar_customer, score])

    # Pad to the fixed width of 1 + 2 * top_n entries. Without this, a
    # customer with fewer than top_n matches yields a short row that does not
    # line up with the fixed column list used when the DataFrame is created.
    row.extend([None] * (2 * (top_n - len(top_customers))))

    lookalike_results.append(row)

In [44]:
# Creating the final DataFrame
columns = ["CustomerID"]
for i in range(1, top_n + 1):
    columns.extend([f"SimilarCustomer{i}", f"Score{i}"])

lookalike_df = pd.DataFrame(lookalike_results, columns=columns)

# Saving the refined results to a CSV file
lookalike_df.to_csv("../output/refined_lookalike_results.csv", index=False)

# Displaying the top rows of the refined DataFrame
print(lookalike_df.head())

  CustomerID SimilarCustomer1  Score1 SimilarCustomer2  Score2  \
0      C0001            C0137     1.0            C0152     1.0   
1      C0002            C0029     1.0            C0199     1.0   
2      C0010            C0029     1.0            C0002     1.0   
3      C0003            C0178     1.0            C0035     1.0   
4      C0004            C0021     1.0            C0101     1.0   

  SimilarCustomer3  Score3  
0            C0056     1.0  
1            C0031     1.0  
2            C0025     1.0  
3            C0133     1.0  
4            C0145     1.0  
