# 1. Setup and Data Loading

## 1.1 Importing Dependencies

In [2]:
import os
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.neighbors import NearestNeighbors
from annoy import AnnoyIndex
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense
from tensorflow.keras.models import Model

## 1.2 Creating Output Directory

In [2]:
# Create an output directory if it doesn't exist
output_dir = 'output'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

## 1.3 Loading the Datasets

In [3]:
# Load datasets
users_df = pd.read_csv('users.csv')  # Contains user metadata
interactions_df = pd.read_csv('interactions.csv')  # Contains user interactions
test_df = pd.read_csv('recommendations.csv')  # Contains Member_IDs for recommendations

# 2. Data Preprocessing

## 2.1 Handling Missing and Duplicate values

In [4]:
# Handle missing values (drop rows with missing data)
users_df.dropna(inplace=True)
interactions_df.dropna(inplace=True)
# Remove duplicates
interactions_df.drop_duplicates(inplace=True)

## 2.2 Encoding variables

In [5]:
# Encode categorical variables (e.g., Gender, Marital_Status, Sect, Caste, State)
categorical_columns = ['Gender', 'Marital_Status', 'Sect', 'Caste', 'State']
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    users_df[col] = le.fit_transform(users_df[col])
    label_encoders[col] = le  # Save encoders for later use (if needed)

# Normalize numerical features (e.g., Age)
scaler = MinMaxScaler()
users_df['Age'] = scaler.fit_transform(users_df[['Age']])

In [6]:
# Add an implicit interaction score (1 for all interactions)
interactions_df['Interaction_Score'] = 1

## 2.3 Creating User-Item Interaction Matrix 

In [7]:
# Create a user-item interaction matrix
user_ids = interactions_df['Member_ID'].unique()
item_ids = interactions_df['Target_ID'].unique()

# Map Member_ID and Target_ID to indices
user_id_map = {user_id: idx for idx, user_id in enumerate(user_ids)}
item_id_map = {item_id: idx for idx, item_id in enumerate(item_ids)}

# Create a sparse matrix for interactions
row = interactions_df['Member_ID'].map(user_id_map)
col = interactions_df['Target_ID'].map(item_id_map)
data = interactions_df['Interaction_Score']

interaction_matrix = csr_matrix((data, (row, col)), shape=(len(user_ids), len(item_ids)))

# 3. Collaborative Filtering (Matrix Factorization)

In [8]:
def collaborative_filtering(interaction_matrix, n_components=50):
    # Perform Singular Value Decomposition (SVD)
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    user_factors = svd.fit_transform(interaction_matrix)
    item_factors = svd.components_.T
    return user_factors, item_factors

In [9]:
# Perform collaborative filtering
user_factors, item_factors = collaborative_filtering(interaction_matrix)

# 4. Approximate Nearest Neighbors (ANN)

In [10]:
def build_ann_index(user_factors, n_trees=10):
    n_users, n_factors = user_factors.shape
    ann_index = AnnoyIndex(n_factors, 'angular')  # Angular distance for cosine similarity
    for i in range(n_users):
        ann_index.add_item(i, user_factors[i])
    ann_index.build(n_trees)
    return ann_index

In [11]:
# Build ANN index
ann_index = build_ann_index(user_factors)

# 5. Content-Based Filtering (Metadata Embeddings)

## 5.1 Building Neural Network to create Embeddings for User's Metadata

In [12]:
def build_metadata_embeddings(users_df, embedding_dim=16):
    inputs = []
    embeddings = []
    
    # Create embeddings for each categorical feature
    for col in categorical_columns:
        input_layer = Input(shape=(1,))
        vocab_size = users_df[col].nunique()
        embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_layer)
        flatten_layer = Flatten()(embedding_layer)
        inputs.append(input_layer)
        embeddings.append(flatten_layer)
    
    # Add numerical feature (Age)
    age_input = Input(shape=(1,))
    embeddings.append(age_input)
    inputs.append(age_input)
    
    # Concatenate all embeddings
    concatenated = Concatenate()(embeddings)
    dense_layer = Dense(64, activation='relu')(concatenated)
    output_layer = Dense(embedding_dim, activation='linear')(dense_layer)
    
    # Build model
    model = Model(inputs, output_layer)
    return model

## 5.2 Training the Metadata Embedding Model

In [13]:
# Train metadata embeddings
metadata_model = build_metadata_embeddings(users_df)
metadata_model.compile(optimizer='adam', loss='mse')

# Prepare metadata inputs for training
metadata_inputs = [users_df[col].values for col in categorical_columns] + [users_df['Age'].values]
metadata_model.fit(metadata_inputs, np.zeros((len(users_df), 16)), epochs=10, batch_size=64)

# Extract metadata embeddings
metadata_embeddings = metadata_model.predict(metadata_inputs)

Epoch 1/10
[1m4700/4700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 9.4161e-06
Epoch 2/10
[1m4700/4700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 6.8270e-08
Epoch 3/10
[1m4700/4700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 3.1728e-08
Epoch 4/10
[1m4700/4700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 1.7084e-08
Epoch 5/10
[1m4700/4700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 8.7917e-09
Epoch 6/10
[1m4700/4700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 8.7340e-09
Epoch 7/10
[1m4700/4700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 4.6571e-10
Epoch 8/10
[1m4700/4700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 7.6393e-10
Epoch 9/10
[1m4700/4700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 7.5588e-10
Epoch 10/10
[1m4700/4700[0m [32m━━

In [14]:
metadata_embeddings

array([[-4.5676948e-06,  8.1007889e-10, -1.6432686e-12, ...,
         2.5688538e-12, -2.8119444e-11,  2.5953586e-08],
       [-4.5676948e-06,  8.1007889e-10, -1.6432686e-12, ...,
         2.5688538e-12, -2.8119444e-11,  2.5953586e-08],
       [-4.5676948e-06,  8.1007889e-10, -1.6432686e-12, ...,
         2.5688538e-12, -2.8119444e-11,  2.5953586e-08],
       ...,
       [-4.5676948e-06,  8.1007889e-10, -1.6432686e-12, ...,
         2.5688538e-12, -2.8119444e-11,  2.5953586e-08],
       [-4.5676948e-06,  8.1007889e-10, -1.6432686e-12, ...,
         2.5688538e-12, -2.8119444e-11,  2.5953586e-08],
       [-4.5676948e-06,  8.1007889e-10, -1.6432686e-12, ...,
         2.5688538e-12, -2.8119444e-11,  2.5953586e-08]], dtype=float32)

# 6. Metadata-Based Recommendations

In [15]:
# Step 4: Metadata-Based Recommendations (Fallback)
def metadata_based_recommendations(user_id, users_df, opposite_gender, top_n=100):
    # Get opposite gender profiles
    opposite_profiles = users_df[users_df['Gender'] == opposite_gender]

    # Exclude past interactions (if any)
    past_interactions = interactions_df[(interactions_df['Member_ID'] == user_id) | (interactions_df['Target_ID'] == user_id)]
    interacted_users = set(past_interactions['Member_ID']).union(set(past_interactions['Target_ID']))
    opposite_profiles = opposite_profiles[~opposite_profiles['Member_ID'].isin(interacted_users)]

    # Fallback: Recommend profiles based on metadata similarity (e.g., age, caste, state)
    user_metadata = users_df[users_df['Member_ID'] == user_id].iloc[0]
    opposite_profiles['similarity_score'] = opposite_profiles.apply(
        lambda row: np.dot(
            row[['Age', 'Sect', 'Caste', 'State']].values,
            user_metadata[['Age', 'Sect', 'Caste', 'State']].values
        ), axis=1
    )

    # Sort by similarity score and select top N profiles
    top_n_profiles = opposite_profiles.sort_values(by='similarity_score', ascending=False).head(top_n)['Member_ID'].tolist()
    return top_n_profiles

# 7. Hybrid Recommendation

In [16]:
def hybrid_recommendation(user_id, ann_index, user_factors, metadata_embeddings, users_df, interactions_df, top_n=100):
    # Get opposite gender profiles
    user_gender = users_df.loc[users_df['Member_ID'] == user_id, 'Gender'].values[0]
    opposite_gender = 1 if user_gender == 0 else 0  # Assuming 0: Male, 1: Female
    opposite_profiles = users_df[users_df['Gender'] == opposite_gender]

    # Exclude past interactions
    past_interactions = interactions_df[(interactions_df['Member_ID'] == user_id) | (interactions_df['Target_ID'] == user_id)]
    interacted_users = set(past_interactions['Member_ID']).union(set(past_interactions['Target_ID']))
    opposite_profiles = opposite_profiles[~opposite_profiles['Member_ID'].isin(interacted_users)]

    # Get user index
    user_idx = user_id_map.get(user_id, -1)
    if user_idx == -1:
        # Fallback: Use metadata-based recommendations
        return metadata_based_recommendations(user_id, users_df, opposite_gender, top_n)
    
    # Find approximate nearest neighbors (collaborative filtering)
    similar_users = ann_index.get_nns_by_item(user_idx, top_n * 10)  # Fetch more to filter later (AA keep in mind)
    similar_users = [u for u in similar_users if u in opposite_profiles.index]

    # Combine collaborative and content-based scores
    user_metadata_embedding = metadata_embeddings[user_idx]
    content_scores = np.dot(metadata_embeddings[similar_users], user_metadata_embedding)
    collab_scores = [1 / (1 + i) for i in range(len(similar_users))]  # Higher score for closer neighbors
    hybrid_scores = 0.7 * np.array(collab_scores) + 0.3 * content_scores  # Weighted combination

    # Select top N profiles
    top_n_indices = np.argsort(hybrid_scores)[-top_n:][::-1]
    top_n_profiles = opposite_profiles.iloc[top_n_indices]['Member_ID'].tolist()
    return top_n_profiles

# 8. Generating Recommendations

In [17]:
def generate_test_recommendations(test_df, ann_index, user_factors, metadata_embeddings, users_df, interactions_df):
    recommendations = []
    for user_id in test_df['Member_ID']:
        top_100_profiles = hybrid_recommendation(user_id, ann_index, user_factors, metadata_embeddings, users_df, interactions_df, top_n=100)
        if not top_100_profiles:  # Fallback: Use metadata-based recommendations
            print(f"No recommendations found for user {user_id}. Using fallback.")
            top_100_profiles = metadata_based_recommendations(user_id, users_df, opposite_gender=1 if users_df.loc[users_df['Member_ID'] == user_id, 'Gender'].values[0] == 0 else 0)
        recommendations.append({'Member_ID': user_id, 'top_100_profiles': ','.join(map(str, top_100_profiles))})
    return pd.DataFrame(recommendations)

In [18]:
# Generate recommendations for test data
recommendations_df = generate_test_recommendations(test_df, ann_index, user_factors, metadata_embeddings, users_df, interactions_df)

# 9. Saving Recommendations

In [19]:
# Save recommendations to CSV in the output directory
output_file_path = os.path.join(output_dir, 'recommendations.csv')
recommendations_df.to_csv(output_file_path, index=False)

# 10. Showing the Output of the Recommendations

In [3]:
data1 = pd.read_csv('output/recommendations.csv')

In [4]:
data1.head()

Unnamed: 0,Member_ID,top_100_profiles
0,137544,"10,30,246,538,694,725,820,994,1090,1158,1241,1..."
1,16569,"7,38,63,86,121,163,190,248,283,293,316,429,478..."
2,439283,"10,30,246,538,694,725,820,994,1090,1158,1241,1..."
3,278297,"10,30,246,538,694,725,820,994,1090,1158,1241,1..."
4,121606,"6825,258496,443321,287,343239,257198,350520,47..."


In [6]:
for i in range(100):
    profile_list = data1['top_100_profiles'][i].split(',')
    print("Number of profiles:", len(profile_list))

Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profiles: 100
Number of profil

In [1]:
import pandas as pd

# Load the recommendations.csv file
recommendations_df = pd.read_csv('output/recommendations.csv')

# Load the users.csv file
users_df = pd.read_csv('users.csv')

# Create a dictionary to map Member_ID to their metadata
user_metadata_dict = users_df.set_index('Member_ID').to_dict(orient='index')

# Function to get metadata for top_100_profiles
def get_top_profiles_metadata(row):
    member_id = row['Member_ID']
    top_100_profiles = row['top_100_profiles'].split(',')  # Split the string into a list of profile IDs
    top_100_profiles = [int(profile_id) for profile_id in top_100_profiles]  # Convert to integers
    
    # Fetch metadata for each profile in top_100_profiles
    profiles_metadata = []
    for profile_id in top_100_profiles:
        if profile_id in user_metadata_dict:
            metadata = user_metadata_dict[profile_id]
            profiles_metadata.append({
                'Profile_ID': profile_id,
                'Gender': metadata['Gender'],
                'Caste': metadata['Caste'],
                'Sect': metadata['Sect'],
                'State': metadata['State']
            })
        else:
            # Handle cases where profile_id is not found in users.csv
            profiles_metadata.append({
                'Profile_ID': profile_id,
                'Gender': 'Unknown',
                'Caste': 'Unknown',
                'Sect': 'Unknown',
                'State': 'Unknown'
            })
    
    return profiles_metadata

# Apply the function to each row in recommendations_df
recommendations_df['top_100_profiles_metadata'] = recommendations_df.apply(get_top_profiles_metadata, axis=1)

# Explode the list of metadata into separate rows
exploded_df = recommendations_df.explode('top_100_profiles_metadata')

# Reset the index of exploded_df to avoid duplicate indices
exploded_df = exploded_df.reset_index(drop=True)

# Normalize the metadata dictionary into separate columns
metadata_df = pd.json_normalize(exploded_df['top_100_profiles_metadata'])

# Combine the metadata with the original recommendations_df
final_df = pd.concat([exploded_df.drop(columns=['top_100_profiles_metadata']), metadata_df], axis=1)

# Display the final result
print(final_df)

       Member_ID                                   top_100_profiles  \
0         137544  10,30,246,538,694,725,820,994,1090,1158,1241,1...   
1         137544  10,30,246,538,694,725,820,994,1090,1158,1241,1...   
2         137544  10,30,246,538,694,725,820,994,1090,1158,1241,1...   
3         137544  10,30,246,538,694,725,820,994,1090,1158,1241,1...   
4         137544  10,30,246,538,694,725,820,994,1090,1158,1241,1...   
...          ...                                                ...   
29813      83917  7,38,63,86,121,163,190,248,283,293,316,429,478...   
29814      83917  7,38,63,86,121,163,190,248,283,293,316,429,478...   
29815      83917  7,38,63,86,121,163,190,248,283,293,316,429,478...   
29816      83917  7,38,63,86,121,163,190,248,283,293,316,429,478...   
29817      83917  7,38,63,86,121,163,190,248,283,293,316,429,478...   

       Profile_ID  Gender             Caste   Sect          State  
0              10  Female  Sheikh or Shaikh  Sunni          Bihar  
1          

In [2]:
final_df.head()

Unnamed: 0,Member_ID,top_100_profiles,Profile_ID,Gender,Caste,Sect,State
0,137544,"10,30,246,538,694,725,820,994,1090,1158,1241,1...",10,Female,Sheikh or Shaikh,Sunni,Bihar
1,137544,"10,30,246,538,694,725,820,994,1090,1158,1241,1...",30,Female,Other,Sunni,Delhi
2,137544,"10,30,246,538,694,725,820,994,1090,1158,1241,1...",246,Female,Other,Sunni,Uttar Pradesh
3,137544,"10,30,246,538,694,725,820,994,1090,1158,1241,1...",538,Female,Sheikh or Shaikh,Sunni,Delhi
4,137544,"10,30,246,538,694,725,820,994,1090,1158,1241,1...",694,Female,Khan or Pathan,Sunni,Uttar Pradesh


In [1]:
import pandas as pd

# Load the recommendations.csv file
recommendations_df = pd.read_csv('output/recommendations.csv')

# Load the users.csv file
users_df = pd.read_csv('users.csv')

# Create a dictionary to map Member_ID to their metadata
user_metadata_dict = users_df.set_index('Member_ID').to_dict(orient='index')

# Initialize the final nested dictionary
nested_dict = {}

# Function to get metadata for top_100_profiles
def get_top_profiles_metadata(row):
    member_id = row['Member_ID']
    top_100_profiles = row['top_100_profiles'].split(',')  # Split the string into a list of profile IDs
    top_100_profiles = [int(profile_id) for profile_id in top_100_profiles]  # Convert to integers
    
    # Fetch metadata for each profile in top_100_profiles
    profiles_metadata = {}
    for profile_id in top_100_profiles:
        if profile_id in user_metadata_dict:
            metadata = user_metadata_dict[profile_id]
            profiles_metadata[profile_id] = {
                'Age': metadata['Age'],
                'Gender': metadata['Gender'],
                'Caste': metadata['Caste'],
                'Sect': metadata['Sect'],
                'State': metadata['State']
            }
        else:
            # Handle cases where profile_id is not found in users.csv
            profiles_metadata[profile_id] = {
                'Age': 'Unknown',
                'Gender': 'Unknown',
                'Caste': 'Unknown',
                'Sect': 'Unknown',
                'State': 'Unknown'
            }
    
    return profiles_metadata

# Populate the nested dictionary
for _, row in recommendations_df.iterrows():
    member_id = row['Member_ID']
    nested_dict[member_id] = get_top_profiles_metadata(row)

# Display the nested dictionary
for member_id, profiles in nested_dict.items():
    print(f"Member_ID: {member_id}")
    for profile_id, details in profiles.items():
        print(f"  Profile_ID: {profile_id}, Details: {details}")
    print()

# Save the nested dictionary to a JSON file (optional)
import json
with open('output1/nested_dict.json', 'w') as f:
    json.dump(nested_dict, f, indent=4)

Member_ID: 137544
  Profile_ID: 10, Details: {'Age': 25, 'Gender': 'Female', 'Caste': 'Sheikh or Shaikh', 'Sect': 'Sunni', 'State': 'Bihar'}
  Profile_ID: 30, Details: {'Age': 36, 'Gender': 'Female', 'Caste': 'Other', 'Sect': 'Sunni', 'State': 'Delhi'}
  Profile_ID: 246, Details: {'Age': 44, 'Gender': 'Female', 'Caste': 'Other', 'Sect': 'Sunni', 'State': 'Uttar Pradesh'}
  Profile_ID: 538, Details: {'Age': 33, 'Gender': 'Female', 'Caste': 'Sheikh or Shaikh', 'Sect': 'Sunni', 'State': 'Delhi'}
  Profile_ID: 694, Details: {'Age': 34, 'Gender': 'Female', 'Caste': 'Khan or Pathan', 'Sect': 'Sunni', 'State': 'Uttar Pradesh'}
  Profile_ID: 725, Details: {'Age': 32, 'Gender': 'Female', 'Caste': 'Other', 'Sect': 'Shia', 'State': 'Uttar Pradesh'}
  Profile_ID: 820, Details: {'Age': 33, 'Gender': 'Female', 'Caste': 'Sheikh or Shaikh', 'Sect': 'Sunni', 'State': 'Uttar Pradesh'}
  Profile_ID: 994, Details: {'Age': 40, 'Gender': 'Female', 'Caste': 'Khan or Pathan', 'Sect': 'Sunni', 'State': 'Uttar 

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

