In [101]:
import pandas as pd
import numpy as np
# df = pd.read_csv('data/cleaned_data.csv')

df = pd.read_csv('cleaned_data.csv')
    
# Add age groups
df['age_group'] = df['Age'].apply(lambda x: 
    'Gen Z' if x < 25 else 
    'Millennials' if x < 40 else 
    'Gen X' if x < 55 else 'Boomers'
)

# Add tenure groups
df['tenure_group'] = df['Tenure'].apply(lambda x:
    'New (0-2 years)' if x <= 2 else
    'Mid (3-7 years)' if x <= 7 else
    'Senior (8+ years)'
)

In [102]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, r2_score, classification_report, silhouette_score
import warnings
warnings.filterwarnings('ignore')

# Create ML-ready dataset
ml_data = df.copy()

# Remove non-predictive columns
drop_cols = ['BenefitID', 'LastUsedDate', 'Comments', 'Month']
ml_data = ml_data.drop([col for col in drop_cols if col in ml_data.columns], axis=1)

# Encode categorical variables
categorical_cols = ['Gender', 'Department', 'age_group', 'tenure_group', 'BenefitType', 'BenefitSubType']
label_encoders = {}

for col in categorical_cols:
    if col in ml_data.columns:
        le = LabelEncoder()
        ml_data[f'{col}_encoded'] = le.fit_transform(ml_data[col].astype(str))
        label_encoders[col] = le
        ml_data = ml_data.drop(col, axis=1)

# Handle missing values for numeric columns only
numeric_cols = ml_data.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    ml_data[col] = ml_data[col].fillna(ml_data[col].median())

# Handle any remaining non-numeric columns
non_numeric_cols = ml_data.select_dtypes(exclude=[np.number]).columns
for col in non_numeric_cols:
    if col in ml_data.columns:
        # For non-numeric columns, either encode them or drop them
        if ml_data[col].dtype == 'object':
            le = LabelEncoder()
            ml_data[f'{col}_encoded'] = le.fit_transform(ml_data[col].astype(str))
            ml_data = ml_data.drop(col, axis=1)

# Create target variables for different prediction tasks
# 1. Usage prediction (regression)
ml_data['usage_target'] = ml_data['UsageFrequency']

# 2. High engagement prediction (classification) 
ml_data['high_engagement_target'] = (ml_data['UsageFrequency'] >= ml_data['UsageFrequency'].quantile(0.7)).astype(int)

# 3. Satisfaction prediction (regression)
ml_data['satisfaction_target'] = ml_data['SatisfactionScore']

# Prepare feature matrix (exclude target variables and benefit subtype one-hot columns for now)
feature_cols = [col for col in ml_data.columns if not col.startswith(('usage_target', 'high_engagement_target', 'satisfaction_target', 'subcat_', 'UsageFrequency', 'SatisfactionScore'))]
X = ml_data[feature_cols]

# Ensure all columns are numeric
X = X.select_dtypes(include=[np.number])

# Scale numerical features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

print("ML Data Preparation Complete:")
print(f"- Features shape: {X_scaled.shape}")
print(f"- Target variables created: usage_target, high_engagement_target, satisfaction_target")
print(f"- Categorical variables encoded: {list(label_encoders.keys())}")
print("\nFeature columns:")
print(X_scaled.columns.tolist())

ML Data Preparation Complete:
- Features shape: (7626, 10)
- Target variables created: usage_target, high_engagement_target, satisfaction_target
- Categorical variables encoded: ['Gender', 'Department', 'age_group', 'tenure_group', 'BenefitType', 'BenefitSubType']

Feature columns:
['EmployeeID', 'Age', 'Tenure', 'BenefitCost', 'Gender_encoded', 'Department_encoded', 'age_group_encoded', 'tenure_group_encoded', 'BenefitType_encoded', 'BenefitSubType_encoded']


In [103]:
cluster_features = ['age_group_encoded', 'tenure_group_encoded', 'Department_encoded']

available_features = [col for col in cluster_features if col in ml_data.columns]

cluster_data = ml_data[cluster_features].fillna(ml_data[cluster_features].median())

cluster_scaler = StandardScaler()
cluster_data_scaled = cluster_scaler.fit_transform(cluster_data)

# Determine optimal number of clusters using silhouette score
silhouette_scores = []
k_range = range(2, 8)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    cluster_labels = kmeans.fit_predict(cluster_data_scaled)
    silhouette_avg = silhouette_score(cluster_data_scaled, cluster_labels)
    silhouette_scores.append(silhouette_avg)

# Find optimal k
optimal_k = k_range[np.argmax(silhouette_scores)]
print(f"Optimal number of clusters: {optimal_k}")
print(f"Silhouette score: {max(silhouette_scores):.3f}")

# Apply K-Means with optimal k
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42)
ml_data['employee_segment'] = kmeans_final.fit_predict(cluster_data_scaled)
df['employee_segment'] = ml_data['employee_segment']

# Analyze segments
segment_analysis = df.groupby('employee_segment').agg({
    'Age': 'mean',
    'Tenure': 'mean'
}).round(2)

print("\nEmployee Segment Analysis:")
print(segment_analysis)

Optimal number of clusters: 6
Silhouette score: 0.438

Employee Segment Analysis:
                    Age  Tenure
employee_segment               
0                 28.45    4.02
1                 33.41    4.21
2                 50.43   19.22
3                 32.68    9.52
4                 52.93    4.64
5                 52.54   19.87


In [104]:
# Enhanced Cluster Validation and Characterization
from sklearn.metrics import davies_bouldin_score
import matplotlib.pyplot as plt

print("\n" + "="*80)
print("CLUSTER VALIDATION & CHARACTERIZATION")
print("="*80)

# 1. Comprehensive Cluster Validation
print("\n1. CLUSTER VALIDATION METRICS")
print("-" * 40)

# Calculate Davies-Bouldin Index (lower is better)
db_score = davies_bouldin_score(cluster_data_scaled, ml_data['employee_segment'])
silhouette_score_final = silhouette_score(cluster_data_scaled, ml_data['employee_segment'])

print(f"Silhouette Score: {silhouette_score_final:.4f} (higher is better, range: -1 to 1)")
print(f"Davies-Bouldin Index: {db_score:.4f} (lower is better)")

# Determine cluster quality
if silhouette_score_final > 0.5:
    quality = "Excellent"
elif silhouette_score_final > 0.3:
    quality = "Good"
elif silhouette_score_final > 0.1:
    quality = "Fair"
else:
    quality = "Poor"

print(f"Cluster Quality Assessment: {quality}")

# 2. Detailed Cluster Profiling
print(f"\n2. DETAILED CLUSTER PROFILES")
print("-" * 40)

segment_top = []
segment_bot = []
seg_rec = []

for cluster_id in sorted(df['employee_segment'].unique()):
    cluster_data = df[df['employee_segment'] == cluster_id]
    cluster_size = len(cluster_data)
    
    print(f"\n📊 CLUSTER {cluster_id} (n={cluster_size}, {cluster_size/len(df)*100:.1f}% of employees)")
    print("=" * 60)
    
    # Demographics
    print("👥 DEMOGRAPHICS:")
    print(f"  • Average Age: {cluster_data['Age'].mean():.1f} years")
    print(f"  • Average Tenure: {cluster_data['Tenure'].mean():.1f} years")
    print(f"  • Gender Distribution: {dict(cluster_data['Gender'].value_counts())}")
    
    # Department distribution
    dept_dist = cluster_data['Department'].value_counts().head(3)
    print(f"  • Top Departments: {dict(dept_dist)}")
    
    # Usage Patterns
    print("\n💼 USAGE PATTERNS:")
    print(f"  • Average Usage Frequency: {cluster_data['UsageFrequency'].mean():.2f}")
    print(f"  • Average Satisfaction: {cluster_data['SatisfactionScore'].mean():.2f}/5.0")
    #print(f"  • Average Cost per Use: ${cluster_data['cost_per_use'].mean():.2f}")
    #print(f"  • High Engagement Rate: {(cluster_data['high_engagement'] == True).mean()*100:.1f}%")
    
    # Top Benefits by Usage
    top_benefits = cluster_data.groupby('BenefitSubType')['UsageFrequency'].mean().sort_values(ascending=False).head(3)
    print(f"\n🏆 TOP 5 BENEFITS BY USAGE:")
    top3 = []
    for i, (benefit, usage) in enumerate(top_benefits.items(), 1):
        print(f"  {i}. {benefit}: {usage:.2f} avg usage")
        top3.append(benefit)
    
    segment_top.append(top3)
    
    # Low Usage Benefits (potential for awareness campaigns)
    low_usage = cluster_data.groupby('BenefitSubType')['UsageFrequency'].mean().sort_values().head(3)
    print(f"\n⚠️  LOWEST USAGE BENEFITS (Awareness Opportunity):")
    bot3 = []
    for i, (benefit, usage) in enumerate(low_usage.items(), 1):
        print(f"  {i}. {benefit}: {usage:.2f} avg usage")
        bot3.append(benefit)
    
    segment_bot.append(bot3)

    # Dominant Benefit Category
    category_usage = cluster_data.groupby('BenefitType')['UsageFrequency'].mean().sort_values(ascending=False)
    dominant_category = category_usage.index[0]
    print(f"\n🎯 DOMINANT CATEGORY: {dominant_category} ({category_usage.iloc[0]:.2f} avg usage)")
    
    # Upselling Opportunities
    print(f"\n💡 UPSELLING OPPORTUNITIES:")
    # Find benefits with high usage that could lead to related benefit recommendations
    high_usage_benefits = cluster_data.groupby('BenefitSubType')['UsageFrequency'].mean()
    high_usage_benefits = high_usage_benefits[high_usage_benefits > high_usage_benefits.quantile(0.8)]
    
    
    for benefit in high_usage_benefits.index[:3]:  # Top 3
        related_category = cluster_data[cluster_data['BenefitSubType'] == benefit]['BenefitType'].iloc[0]
        other_in_category = cluster_data[
            (cluster_data['BenefitType'] == related_category) & 
            (cluster_data['BenefitSubType'] != benefit)
        ]['BenefitSubType'].unique()
        
        if len(other_in_category) > 0:
            print(f"  • High usage of '{benefit}' → Consider promoting: {list(other_in_category)[:3]}")
        
    seg_rec.append(list(other_in_category)[:3])

    
    print("-" * 60)


CLUSTER VALIDATION & CHARACTERIZATION

1. CLUSTER VALIDATION METRICS
----------------------------------------
Silhouette Score: 0.4378 (higher is better, range: -1 to 1)
Davies-Bouldin Index: 0.8468 (lower is better)
Cluster Quality Assessment: Good

2. DETAILED CLUSTER PROFILES
----------------------------------------

📊 CLUSTER 0 (n=1032, 13.5% of employees)
👥 DEMOGRAPHICS:
  • Average Age: 28.5 years
  • Average Tenure: 4.0 years
  • Gender Distribution: {'Female': 503, 'Male': 493, 'Non-Binary': 36}
  • Top Departments: {'Finance': 412, 'IT': 356, 'HR': 264}

💼 USAGE PATTERNS:
  • Average Usage Frequency: 3.43
  • Average Satisfaction: 2.96/5.0

🏆 TOP 5 BENEFITS BY USAGE:
  1. Monthly Communications: 5.26 avg usage
  2. HMO Family: 4.64 avg usage
  3. Dependent Coverage: 4.27 avg usage

⚠️  LOWEST USAGE BENEFITS (Awareness Opportunity):
  1. Healthcare FSA: 1.52 avg usage
  2. After-School Care: 1.71 avg usage
  3. On-Site Infant Care: 2.76 avg usage

🎯 DOMINANT CATEGORY: Cell Pho

In [105]:
# seg3

In [106]:
raff = ml_data.drop_duplicates(subset=['EmployeeID'], keep='first')
raff

Unnamed: 0,EmployeeID,UsageFrequency,Age,Tenure,BenefitCost,SatisfactionScore,subcat_401k Basic Matching,subcat_401k Catch-Up Contributions,subcat_401k High Contribution,subcat_401k Investment Fees,...,Gender_encoded,Department_encoded,age_group_encoded,tenure_group_encoded,BenefitType_encoded,BenefitSubType_encoded,usage_target,high_engagement_target,satisfaction_target,employee_segment
0,220,4,64,35,489.96,1,False,False,False,False,...,1,1,0,2,10,29,4,0,1,2
1,1820,1,53,2,519.66,2,False,False,False,False,...,1,0,1,1,4,10,1,0,2,2
2,285,2,64,35,84.55,3,False,False,False,False,...,1,3,0,2,5,12,2,0,3,5
3,4536,8,32,10,125.00,1,False,False,False,False,...,0,4,3,2,11,21,8,1,1,3
4,1262,3,42,1,824.53,3,False,False,False,False,...,1,0,1,1,10,11,3,0,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7614,332,1,27,1,915.48,3,False,False,False,False,...,1,3,3,1,1,18,1,0,3,3
7615,3102,1,54,27,509.65,2,False,True,False,False,...,1,3,1,2,8,1,1,0,2,5
7618,2519,8,36,14,475.00,1,False,False,False,False,...,1,2,3,2,1,6,8,1,1,3
7620,526,8,52,31,261.44,3,False,False,True,False,...,0,3,1,2,8,2,8,1,3,5


In [107]:
# def suggest_benefits(employee_id, top_n=3):
#     # Ensure employee exists
#     emp_rows = df[df['EmployeeID'] == employee_id]
#     if emp_rows.empty:
#         # fallback if employee not found
#         return

#     # Get employee's department and tenure group
#     emp_row = emp_rows.iloc[0]
#     dept = emp_row.get('Department')
#     tenure = emp_row.get('tenure_group')
#     age = emp_row.get('age_group')

#     # If any key attribute is missing, return fallback
#     if pd.isna(dept) or pd.isna(tenure) or pd.isna(age):
#         return

#     # Peer usage: Find top benefits used by peers in same department, tenure, and age group
#     peer_df = df[(df['Department'] == dept) &
#                  (df['tenure_group'] == tenure) &
#                  (df['age_group'] == age)]

#     if peer_df.empty:
#         return

#     peer_benefits = peer_df.groupby('BenefitSubType')['UsageFrequency']\
#                            .sum()\
#                            .sort_values(ascending=False)

#     # Exclude benefits already used by the employee
#     emp_benefits = set(emp_rows['BenefitSubType'].dropna())
#     suggestions = [b for b in peer_benefits.index if b not in emp_benefits][:top_n]

#     # # Fallback if fewer than top_n suggestions
#     # if len(suggestions) < top_n:
#     #     suggestions = []

#     return suggestions

In [108]:
# print(segment_top[0][0])

In [109]:
def suggest_benefits(employee_id, top_n=3):
    # Normalize type
    emp_rows = df[df['EmployeeID'].astype(str).str.strip() == str(employee_id).strip()]
    if emp_rows.empty:
        return []   # always return a list

    emp_row = emp_rows.iloc[0]
    dept   = emp_row.get('Department')
    tenure = emp_row.get('tenure_group')
    age    = emp_row.get('age_group')

    if pd.isna(dept) or pd.isna(tenure) or pd.isna(age):
        return []

    peer_df = df[
        (df['Department']   == dept) &
        (df['tenure_group'] == tenure) &
        (df['age_group']    == age)
    ]
    if peer_df.empty:
        return []

    peer_benefits = peer_df.groupby('BenefitSubType')['UsageFrequency'].sum().sort_values(ascending=False)
    emp_benefits = set(emp_rows['BenefitSubType'].dropna())
    suggestions = [b for b in peer_benefits.index if b not in emp_benefits]

    return suggestions[:top_n]

In [110]:
# test_employees = ml_data['EmployeeID'].drop_duplicates().sample(3, random_state=42)
# # for emp_id in test_employees:
    
# #     print(f'--- Recommendations for Employee {emp_id} ---')
# #     suggest_benefits(emp_id, top_n=3)
# #     print()


# results = []
# for emp in df['EmployeeID'].drop_duplicates():
#     recs = suggest_benefits(emp, top_n=3)
#     results.append({'EmployeeID': emp, 'recommendations': recs})

# rec_df = pd.DataFrame(results)    

In [111]:
segmentDF = df.drop_duplicates(subset=['EmployeeID'], keep='first')


In [112]:
# test1['rec1'] = ''
# test1['rec2'] = ''
# test1['rec3'] = ''

# test1.loc[test1['employee_segment'] == 0, 'top1'] = segment_top[0][0]
# test1.loc[test1['employee_segment'] == 0, 'top2'] = segment_top[0][1]
# test1.loc[test1['employee_segment'] == 0, 'top3'] = segment_top[0][2]

# test1.loc[test1['employee_segment'] == 1, 'top1'] = segment_top[1][0]
# test1.loc[test1['employee_segment'] == 1, 'top2'] = segment_top[1][1]
# test1.loc[test1['employee_segment'] == 1, 'top3'] = segment_top[1][2]

# test1.loc[test1['employee_segment'] == 2, 'top1'] = segment_top[2][0]
# test1.loc[test1['employee_segment'] == 2, 'top2'] = segment_top[2][1]
# test1.loc[test1['employee_segment'] == 2, 'top3'] = segment_top[2][2]

for seg_id, tops in enumerate(segment_top):
    segmentDF.loc[segmentDF['employee_segment'] == seg_id, ['top1', 'top2', 'top3']] = tops

for seg_id, bots in enumerate(segment_bot):
    segmentDF.loc[segmentDF['employee_segment'] == seg_id, ['bot1', 'bot2', 'bot3']] = bots

for seg_id, segrecs in enumerate(segment_bot):
    segmentDF.loc[segmentDF['employee_segment'] == seg_id, ['seg_rec1', 'seg_rec2', 'seg_rec3']] = segrecs



for emp in segmentDF['EmployeeID']:
    recs = suggest_benefits(emp, top_n=3)
    segmentDF.loc[segmentDF['EmployeeID'] == emp, 'rec1'] = recs[0]
    segmentDF.loc[segmentDF['EmployeeID'] == emp, 'rec2'] = recs[1]
    segmentDF.loc[segmentDF['EmployeeID'] == emp, 'rec3'] = recs[2]

    # test1.loc[test1['EmployeeID'] == emp, 'bot1'] = seg3[0]
    # test1.loc[test1['EmployeeID'] == emp, 'bot2'] = seg3[1]
    # test1.loc[test1['EmployeeID'] == emp, 'bot3'] = seg3[2]
    
    # results.append({'EmployeeID': emp, 'recommendations': recs})

# print(test1['EmployeeID'])
rec_df = pd.DataFrame(results)

segmentDF = segmentDF.drop_duplicates(subset=['EmployeeID'], keep='first')
segmentDF['employee_segment'] = raff['employee_segment']


In [113]:
segmentDF.columns

Index(['EmployeeID', 'BenefitID', 'UsageFrequency', 'LastUsedDate', 'Age',
       'Gender', 'Department', 'Tenure', 'BenefitType', 'BenefitSubType',
       'BenefitCost', 'SatisfactionScore', 'Comments', 'age_group',
       'tenure_group', 'subcat_401k Basic Matching',
       'subcat_401k Catch-Up Contributions', 'subcat_401k High Contribution',
       'subcat_401k Investment Fees', 'subcat_401k Maximum Matching',
       'subcat_401k Standard Matching', 'subcat_After-School Care',
       'subcat_Basic Coverage', 'subcat_Conference Attendance',
       'subcat_Dependent Coverage', 'subcat_Family Membership',
       'subcat_Graduate Degree', 'subcat_HDHP Individual', 'subcat_HMO Family',
       'subcat_Healthcare FSA', 'subcat_Individual Courses',
       'subcat_Monthly Communications', 'subcat_Monthly Internet Allowance',
       'subcat_On-Site Infant Care', 'subcat_PPO Family',
       'subcat_PPO Individual', 'subcat_Premium Discount Tier 1',
       'subcat_Professional Certification', 

In [114]:
segmentDF.head(5)

Unnamed: 0,EmployeeID,BenefitID,UsageFrequency,LastUsedDate,Age,Gender,Department,Tenure,BenefitType,BenefitSubType,...,top3,bot1,bot2,bot3,seg_rec1,seg_rec2,seg_rec3,rec1,rec2,rec3
0,220,20,4,2024-05-03,64,Male,HR,35,Tuition Reimbursement,Undergraduate Degree,...,Supplemental High Amount,On-Site Infant Care,Premium Discount Tier 1,PPO Individual,On-Site Infant Care,Premium Discount Tier 1,PPO Individual,Healthcare FSA,HMO Family,Transit Subsidy
1,1820,26,1,2024-02-08,53,Male,Finance,2,Gym Membership,Family Membership,...,Supplemental High Amount,On-Site Infant Care,Premium Discount Tier 1,PPO Individual,On-Site Infant Care,Premium Discount Tier 1,PPO Individual,Healthcare FSA,Monthly Communications,Tier 1 Partners
2,285,16,2,2023-10-27,64,Male,Marketing,35,Health Insurance,HDHP Individual,...,401k Maximum Matching,After-School Care,Dependent Coverage,On-Site Infant Care,After-School Care,Dependent Coverage,On-Site Infant Care,Basic Coverage,Supplemental Standard,PPO Individual
3,4536,8,8,2024-07-03,32,Female,Sales,10,Wellness Programs,Premium Discount Tier 1,...,401k High Contribution,401k Catch-Up Contributions,Professional Certification,PPO Family,401k Catch-Up Contributions,Professional Certification,PPO Family,401k High Contribution,Supplemental High Amount,401k Catch-Up Contributions
4,1262,12,3,2024-04-13,42,Male,Finance,1,Tuition Reimbursement,Graduate Degree,...,Supplemental High Amount,On-Site Infant Care,Premium Discount Tier 1,PPO Individual,On-Site Infant Care,Premium Discount Tier 1,PPO Individual,Healthcare FSA,Monthly Communications,Tier 1 Partners


In [None]:

# test1['top2'].unique()

array(['401k High Contribution', 'HMO Family', '401k Investment Fees',
       'Supplemental High Amount'], dtype=object)

In [85]:
test1.head(5)

Unnamed: 0,EmployeeID,BenefitID,UsageFrequency,LastUsedDate,Age,Gender,Department,Tenure,BenefitType,BenefitSubType,...,top3,bot1,bot2,bot3,seg1,seg2,seg3,seg_rec1,seg_rec2,seg_rec3
0,220,20,4,2024-05-03,64,Male,HR,35,Tuition Reimbursement,Undergraduate Degree,...,Supplemental High Amount,On-Site Infant Care,Premium Discount Tier 1,PPO Individual,,,,On-Site Infant Care,Premium Discount Tier 1,PPO Individual
1,1820,26,1,2024-02-08,53,Male,Finance,2,Gym Membership,Family Membership,...,Supplemental High Amount,On-Site Infant Care,Premium Discount Tier 1,PPO Individual,,,,On-Site Infant Care,Premium Discount Tier 1,PPO Individual
2,285,16,2,2023-10-27,64,Male,Marketing,35,Health Insurance,HDHP Individual,...,401k Maximum Matching,After-School Care,Dependent Coverage,On-Site Infant Care,,,,After-School Care,Dependent Coverage,On-Site Infant Care
3,4536,8,8,2024-07-03,32,Female,Sales,10,Wellness Programs,Premium Discount Tier 1,...,401k High Contribution,401k Catch-Up Contributions,Professional Certification,PPO Family,,,,401k Catch-Up Contributions,Professional Certification,PPO Family
4,1262,12,3,2024-04-13,42,Male,Finance,1,Tuition Reimbursement,Graduate Degree,...,Supplemental High Amount,On-Site Infant Care,Premium Discount Tier 1,PPO Individual,,,,On-Site Infant Care,Premium Discount Tier 1,PPO Individual


In [87]:
test1.columns

Index(['EmployeeID', 'BenefitID', 'UsageFrequency', 'LastUsedDate', 'Age',
       'Gender', 'Department', 'Tenure', 'BenefitType', 'BenefitSubType',
       'BenefitCost', 'SatisfactionScore', 'Comments', 'age_group',
       'tenure_group', 'subcat_401k Basic Matching',
       'subcat_401k Catch-Up Contributions', 'subcat_401k High Contribution',
       'subcat_401k Investment Fees', 'subcat_401k Maximum Matching',
       'subcat_401k Standard Matching', 'subcat_After-School Care',
       'subcat_Basic Coverage', 'subcat_Conference Attendance',
       'subcat_Dependent Coverage', 'subcat_Family Membership',
       'subcat_Graduate Degree', 'subcat_HDHP Individual', 'subcat_HMO Family',
       'subcat_Healthcare FSA', 'subcat_Individual Courses',
       'subcat_Monthly Communications', 'subcat_Monthly Internet Allowance',
       'subcat_On-Site Infant Care', 'subcat_PPO Family',
       'subcat_PPO Individual', 'subcat_Premium Discount Tier 1',
       'subcat_Professional Certification', 

In [None]:
# test2 = df.drop_duplicates(subset=['EmployeeID'], keep='first')

# # Initialize results list
# results = []

# for emp in test2['EmployeeID']:
#     recs = suggest_benefits(emp, top_n=3)
#     results.append({
#         'EmployeeID': emp,
#         'rec1': recs[0], 'rec2': recs[1], 'rec3': recs[2],
#         'top1': top3[0], 'top2': top3[1], 'top3': top3[2],
#         'bot1': bot3[0], 'bot2': bot3[1], 'bot3': bot3[2],
#         'seg1': seg3[0], 'seg2': seg3[1], 'seg3': seg3[2]
#     })

# # Convert results to DataFrame and merge
# rec_df = pd.DataFrame(results)
# test2 = test2.drop_duplicates(subset=['EmployeeID'], keep='first')
# test2 = test2.merge(rec_df, on='EmployeeID', how='left')

# # Add employee segment info
# test2['employee_segment'] = raff['employee_segment']

In [None]:
# print(test2['seg1'].unique())

['Supplemental Standard']


In [None]:
# check = test1.equals(test2)
# print(check)

False


In [100]:
#create new dataframe

#with rec1, rec2, rec3, top1, top2, top3, bot1, bot2, bot3 included


# results = []

# for emp in tes1:
#     print(emp)
#     # recs = suggest_benefits(emp, top_n=3)
#     # results.append({'EmployeeID': emp, 'recommendations': recs})

# rec_df = pd.DataFrame(results)    

In [116]:
segmentDF.to_csv('data/segment_data.csv', index=False)