In [91]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler        
from sklearn.metrics.pairwise import cosine_similarity

In [102]:
# Per-row feature table; later cells scale its numeric columns and read
# the ID_ACC column from it.
features_acc = pd.read_csv(filepath_or_buffer="feature/features_account.csv")

# Account table, used at the end to look up the recommended IDs.
account = pd.read_csv(filepath_or_buffer="data/account.csv")

In [103]:
# Preview the first five rows of the feature table.
features_acc.head(n=5)

Unnamed: 0,GROWTH_RATE__C,TOTAL_FUNDING_TO_DATE__C,NUMBEROFEMPLOYEES,OWNER_INTENT_TO_SELL__C,ID_ACC,ID_OPP,TYPE_OPP,AMOUNT,STAGENAME,NAME_OPP,...,INDUSTRY_Technology & Communications,HQ_LOCATION__C_Midwest,HQ_LOCATION__C_Northeast,HQ_LOCATION__C_Southeast,HQ_LOCATION__C_Southwest,HQ_LOCATION__C_West,RANGE_ANNUALREVENUE_High,RANGE_ANNUALREVENUE_Low,RANGE_ANNUALREVENUE_Lower-Middle,RANGE_ANNUALREVENUE_Upper-Middle
0,151.0,10257660.0,92,0,001ak00000JD8JNAA1,006ak000002wGT4AAM,0,2381242.0,0,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,6.0,30923450.0,254,0,001ak00000JD8JOAA1,006ak000002wGT6AAM,0,8111069.0,1,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,77.0,900207200.0,4593,0,001ak00000JD8JPAA1,006ak000002wGT7AAM,0,137786800.0,0,1,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,121.0,161135500.0,650,0,001ak00000JD8JQAA1,006ak000002wGT8AAM,0,291083400.0,1,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,241.0,33364860.0,298,0,001ak00000JD8JRAA1,006ak000002wGT9AAM,0,9532817.0,1,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [104]:
# Columns that are standardised with the scaler before similarity is computed.
numerical_cols = [
    'GROWTH_RATE__C',
    'TOTAL_FUNDING_TO_DATE__C',
    'NUMBEROFEMPLOYEES',
    'AMOUNT',
]

# Encoded / indicator columns, listed in groups.
categorical_cols = [
    # account and opportunity flags
    'OWNER_INTENT_TO_SELL__C',
    'TYPE_OPP',
    'STAGENAME',
    'NAME_OPP',
    # industry indicator
    'INDUSTRY_Technology & Communications',
    # headquarters region indicators
    'HQ_LOCATION__C_Midwest',
    'HQ_LOCATION__C_Northeast',
    'HQ_LOCATION__C_Southeast',
    'HQ_LOCATION__C_Southwest',
    'HQ_LOCATION__C_West',
    # annual revenue band indicators
    'RANGE_ANNUALREVENUE_High',
    'RANGE_ANNUALREVENUE_Low',
    'RANGE_ANNUALREVENUE_Lower-Middle',
    'RANGE_ANNUALREVENUE_Upper-Middle',
]

In [105]:
# Learn the per-column mean and standard deviation of the numeric features,
# then overwrite those columns of features_acc with their standardised values.
# NOTE(review): this cell mutates features_acc in place. Re-running it without
# reloading the CSV refits the scaler on already-scaled data, so a later
# scaler.transform() of raw user inputs would no longer match the original units.
scaler = StandardScaler()
scaler.fit(features_acc[numerical_cols])
features_acc[numerical_cols] = scaler.transform(features_acc[numerical_cols])

In [112]:
# Target profile: feature column -> desired value.
# Only the keys listed here take part in the similarity comparison.
# Other toggles that were tried and are currently disabled:
#   'HQ_LOCATION__C_Southwest': 1
#   'NAME_OPP': 1
#   'INDUSTRY_Industrial & Other': 1
#   'TYPE_OPP': 1
user_values = {
    'RATING_Hot': 1,
    'TYPE_ACC_Startup': 1,
    'OWNER_INTENT_TO_SELL__C': 1,
}

In [113]:
# Restrict the feature table to the columns named in the user profile.
features = features_acc[list(user_values)]

In [114]:
def create_user_profile(user_values):
    """Build a one-row feature frame describing the user's target account.

    An all-zero row is created over every column of the module-level
    ``features_acc`` frame, the entries of ``user_values`` are written into
    it, the columns in ``numerical_cols`` are passed through the module-level
    ``scaler``, and only the columns the user supplied are returned.

    Parameters
    ----------
    user_values : dict
        Maps a column name of ``features_acc`` to the desired value. Values
        for columns in ``numerical_cols`` are handed to ``scaler.transform``
        unchanged, so they are presumably expected in the raw units the
        scaler was fitted on (TODO confirm).

    Returns
    -------
    pandas.DataFrame
        A single row holding the recognised keys of ``user_values``, in the
        order they appear in the dict.

    Raises
    ------
    ValueError
        If none of the keys in ``user_values`` is a column of ``features_acc``.
    """
    # Start from a neutral (all zeros) row spanning every known column.
    profile = pd.DataFrame(
        np.zeros((1, len(features_acc.columns))),
        columns=features_acc.columns,
    )

    # Write the user's values, remembering which keys were actually usable.
    known_keys = []
    for key, value in user_values.items():
        if key in profile.columns:
            profile.at[0, key] = value
            known_keys.append(key)
        else:
            print(f"Warning: {key} is not a valid column name.")

    if not known_keys:
        raise ValueError(
            "user_values does not contain any column of features_acc; "
            "cannot build a profile."
        )

    # Put numeric inputs on the same scale as the stored features.
    profile[numerical_cols] = scaler.transform(profile[numerical_cols])

    # Select only recognised keys: indexing with the unknown ones would raise
    # KeyError right after they had merely been warned about.
    return profile[known_keys]

def get_recommendations(profile_features, top_n=5):
    """Return the ``ID_ACC`` values of the rows most similar to a profile.

    Similarity is the cosine similarity between the single profile row and
    every row of the module-level ``features_acc`` frame, computed on exactly
    the columns present in ``profile_features``.

    Parameters
    ----------
    profile_features : pandas.DataFrame
        One-row frame such as the one returned by ``create_user_profile``.
        Its column names must all be columns of ``features_acc``.
    top_n : int, default 5
        Number of rows to return.

    Returns
    -------
    numpy.ndarray
        ``ID_ACC`` values of the ``top_n`` highest-scoring rows, best first.
        Rows with equal scores keep their order of appearance in
        ``features_acc``.
        NOTE(review): the IDs are taken row by row, so if ``features_acc``
        holds several rows for one account the same ID can appear twice.
    """
    # Derive the candidate matrix from the profile's own columns rather than
    # from a global built in another cell, so both sides always line up.
    candidate_features = features_acc[list(profile_features.columns)]

    # One profile row -> one row of scores, one score per candidate row.
    scores = cosine_similarity(profile_features, candidate_features)[0]

    # Stable sort on the negated scores: descending order, ties in row order.
    top_indices = np.argsort(-scores, kind="stable")[:top_n]

    return features_acc.iloc[top_indices]['ID_ACC'].values

In [115]:
# Turn the user's target values into a one-row feature frame ...
profile_features = create_user_profile(user_values)

# ... and rank the rows of the feature table against it.
recommendations = get_recommendations(profile_features, top_n=5)

In [116]:
# Display the account IDs returned by get_recommendations.
recommendations

array(['001ak00000JD8MfAAL', '001ak00000JD8MhAAL', '001ak00000JD8MkAAL',
       '001ak00000JD8NHAA1', '001ak00000JD8NKAA1'], dtype=object)

In [117]:
# Show the account rows whose ID is among the recommended IDs.
account.loc[account["ID"].isin(recommendations)]

Unnamed: 0,HQ_LOCATION__C,GROWTH_RATE__C,TOTAL_FUNDING_TO_DATE__C,YEARSTARTED,ACCOUNTSOURCE,ANNUALREVENUE,RATING,NUMBEROFEMPLOYEES,OWNERSHIP,INDUSTRY,TYPE,NAME,OWNER_INTENT_TO_SELL__C,ID
14,Colorado,138.0,791171.67,2023,Direct Referrals,1551317.0,Hot,8,Subsidiary,Telecommunications,Startup,"Daniel, Lawson and Lane Enterprises",True,001ak00000JD8MfAAL
16,Nebraska,203.0,7199251.45,2022,Industry Conferences,4965001.0,Hot,25,Public,Banking,Startup,Davenport and Sons Industries,True,001ak00000JD8MhAAL
19,Arizona,70.0,6798792.0,2023,Direct Referrals,8716400.0,Hot,44,Subsidiary,Media,Startup,"Wiley, Massey and Kaufman Industries",True,001ak00000JD8MkAAL
52,Maryland,244.0,1617240.06,2022,Industry Conferences,2994889.0,Hot,15,Public,Apparel,Startup,"Mathis, Whitney and Reed Industries",True,001ak00000JD8NHAA1
55,New Hampshire,162.0,5296771.24,2023,Investment Banks,9993908.0,Hot,50,Public,Communications,Startup,Rojas-Roman Industries,True,001ak00000JD8NKAA1
