# code to generate data


In [4]:
import random
import pandas as pd
import numpy as np

# Define age range for non-mentor candidates
min_age = 17
max_age = 50

# Mean and standard deviation for ages of mentor candidates
mentor_mean_age = 30
mentor_std_dev = 5  # Adjust as needed

# Define possible locations
locations = [
    "Buffalo, NY",
    "Amherst, NY",
    "Cheektowaga, NY",
    "Niagara Falls, NY",
    "Williamsville, NY",
    "Tonawanda, NY",
    "Lockport, NY",
    "West Seneca, NY",
    "Lancaster, NY",
    "Kenmore, NY",
]

# Define mappings between industries and interests
industry_interest_mappings = {
    "Technology": ["Software Development", "Data Science", "Product Management", "Data Analytics"],
    "Marketing": ["Digital Marketing", "Market Research", "Advertising", "Public Relations"],
    "Art & Design": ["Graphic Design", "Illustration", "Web Design", "Photography"],
    "Finance": ["Financial Analysis", "Investment Banking", "Accounting", "Financial Planning"],
}

# Define universities for matching locations
universities = {
    "Buffalo, NY": ["University at Buffalo", "Canisius College", "Buffalo State College"],
    "Amherst, NY": ["Daemen College"],
    "Niagara Falls, NY": ["Niagara University"],
    "Williamsville, NY": ["Medaille College"],
    "Cheektowaga, NY": ["University of Cheektowaga"],
    "Niagara Falls, NY": ["University of Niagra Falls"],
    "Williamsville, NY": ["University of Williamsville"],
    "Tonawanda, NY": ["University of Tonawanda"],
    "Lockport, NY": ["University of Lockport"],
    "West Seneca, NY": ["University of Seneca"],
    "Lancaster, NY": ["University of Lancaster"],
    "Kenmore, NY": ["University of Kenmore"],
}

# Define skill tags based on interests
skill_tags = {
    "Technology": ["Python", "Java", "Web Development", "Machine Learning"],
    "Marketing": ["Digital Marketing", "Social Media Advertising", "SEO", "Marketing Strategy"],
    "Art & Design": ["Graphic Design", "Illustration", "UI/UX Design", "Photography"],
    "Finance": ["Financial Analysis", "Investment Strategies", "Accounting Principles", "Financial Modeling"],
}

# Create an empty list to store generated data
data = []

# Generate 500 rows of data
for user_id in range(1, 501):
    is_mentor_candidate = random.choice([True, False])
    if is_mentor_candidate:
        startup_experience = random.randint(4, 5)
        age = int(np.random.normal(mentor_mean_age, mentor_std_dev))
        age = min(max_age, max(min_age, age))
    else:
        age = random.randint(min_age, max_age)
        startup_experience = random.randint(0, 5)
    gender = random.choice(["Male", "Female", "Non-binary"])
    location = random.choice(locations)
    industry = random.choice(list(industry_interest_mappings.keys()))
    interests = random.choice(industry_interest_mappings[industry])
    mentor = "Yes" if is_mentor_candidate else "No"
    university = random.choice(universities.get(location, []))  # Match location to universities
    availability_hours = random.randint(1, 3)  # Hours per week

    if interests:
        available_skills = skill_tags.get(interests, [])
        max_skills_to_sample = min(len(available_skills), random.randint(1, 3))
        skills = random.sample(available_skills, k=max_skills_to_sample)
    else:
        skills = []

    data.append([user_id, age, gender, location, industry, interests, startup_experience, mentor, university, availability_hours, skills])

df = pd.DataFrame(data, columns=["user_id", "age", "gender", "location", "industry", "interests", "startup_experience", "mentor", "university", "availability_hours", "skills"])

df.to_csv("users.csv", index=False)


In [38]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def apply_encodings(value, encoding_dict):
    return int(encoding_dict[value])

def myencode(data):
    encodings = dict()
    for col in data.columns:
        unique_vals = list(set(data[col].values))
        encoding_dict = {unique_val: i for i, unique_val in enumerate(unique_vals)}
        encodings[col] = encoding_dict
    for col, encoding_dict in encodings.items():
      data[col] = data[col].apply(lambda x: apply_encodings(x, encoding_dict))
    return data, encodings

data = pd.read_csv("users.csv")

numerical_attributes = ["startup_experience", "availability_hours"]
categorical_attributes = ["gender", "location", "industry", "university"]

attribute_weights = {
    "location": 3.0,
    "startup_experience": 2.0,
    "industry": 2.0,
    "university": 1.0,
    "gender": 1.0,
    "age": 1.0,
    "availability_hours": 2.0,
}
encoded_categorical_features, encodings = myencode(data[categorical_attributes])
mentors = data[data["mentor"] == "Yes"]
mentees = data[data["mentor"] == "No"]
mentor_features_numerical = mentors[numerical_attributes].values * np.array([attribute_weights[attr] for attr in numerical_attributes])
mentee_features_numerical = mentees[numerical_attributes].values * np.array([attribute_weights[attr] for attr in numerical_attributes])

mentor_features_categorical = encoded_categorical_features.iloc[mentors.index].astype(float).values
mentor_features_categorical *= np.array([attribute_weights[attr] for attr in categorical_attributes])
mentee_features_categorical = encoded_categorical_features.iloc[mentees.index].astype(float).values
mentee_features_categorical *= np.array([attribute_weights[attr] for attr in categorical_attributes])

mentor_features = np.hstack((mentor_features_numerical, mentor_features_categorical))
mentee_features = np.hstack((mentee_features_numerical, mentee_features_categorical))

similarity_matrix = cosine_similarity(mentee_features, mentor_features)

print(similarity_matrix)


[[0.55641657 0.54010803 0.61250537 ... 0.66178974 0.57196944 0.76244017]
 [0.77177168 0.89845282 0.78214295 ... 0.82714429 0.76870611 0.11986224]
 [0.81572594 0.71630472 0.80974924 ... 0.76145539 0.81158    0.83112671]
 ...
 [0.73900834 0.71439601 0.77584088 ... 0.66485468 0.57250116 0.70298972]
 [0.96188283 0.96720883 0.96721858 ... 0.9603306  0.96360922 0.51402323]
 [0.82618439 0.92599269 0.82403645 ... 0.81795853 0.77631944 0.16777798]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = data[col].apply(lambda x: apply_encodings(x, encoding_dict))


In [39]:
# inference
num_recommendations = 5
recommendations = {}

for i, mentee_index in enumerate(mentees.index):
    top_mentor_indices = similarity_matrix[i].argsort()[::-1][:num_recommendations]
    recommended_mentors = mentors.iloc[top_mentor_indices].index.tolist()
    recommendations[mentee_index] = recommended_mentors

for mentee_index, recommended_mentors in recommendations.items():
    mentee_name = data.loc[mentee_index, "user_id"]
    mentor_names = [data.loc[mentor_index, "user_id"] for mentor_index in recommended_mentors]
    print(f"Mentee {mentee_name} should consider mentors: {mentor_names}")

Mentee 5 should consider mentors: [330, 401, 225, 57, 324]
Mentee 7 should consider mentors: [450, 406, 446, 219, 60]
Mentee 8 should consider mentors: [476, 126, 32, 311, 98]
Mentee 10 should consider mentors: [397, 9, 154, 410, 298]
Mentee 12 should consider mentors: [69, 386, 59, 119, 248]
Mentee 13 should consider mentors: [456, 358, 54, 277, 63]
Mentee 14 should consider mentors: [54, 210, 456, 213, 201]
Mentee 16 should consider mentors: [54, 201, 213, 456, 163]
Mentee 19 should consider mentors: [458, 417, 231, 230, 222]
Mentee 20 should consider mentors: [350, 481, 408, 244, 418]
Mentee 22 should consider mentors: [450, 136, 326, 339, 331]
Mentee 23 should consider mentors: [387, 163, 216, 206, 146]
Mentee 24 should consider mentors: [330, 419, 324, 164, 486]
Mentee 25 should consider mentors: [57, 86, 238, 172, 401]
Mentee 26 should consider mentors: [319, 384, 303, 96, 32]
Mentee 28 should consider mentors: [82, 189, 149, 410, 154]
Mentee 29 should consider mentors: [418, 339