In [2]:
#Imports

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
#Loading the Data
users_df = pd.read_csv ("users.csv")

projects_df = pd.read_csv ("projects.csv")

In [4]:
users_df.head(5)

Unnamed: 0,first_name,last_name,email,interests
0,Bianca,Hill,bianca.hill@example.com,"['UI/UX Design', 'Android Mobile App Programmi..."
1,Mindy,Williams,mindy.williams@example.com,"['Machine Learning', 'Data Analysis', 'Web Dev..."
2,Jacob,Stanley,jacob.stanley@example.com,"['Java Programming', 'UI/UX Design', 'Data Ana..."
3,Alicia,Willis,alicia.willis@example.com,"['C++ Programming', 'Data Analysis', 'Java Pro..."
4,Cassandra,Stephens,cassandra.stephens@example.com,"['Android Mobile App Programming', 'Machine Le..."


In [27]:
projects_df.head(2)

Unnamed: 0,project_title,project_description,project_skills,project_budget,project_id,project_DOR,project_deadline
0,Build a UI/UX Design System,This project involves UI/UX Design and require...,"['UI/UX Design', 'Data Analysis']",375000,PLCS0001,2021-07-26,2021-05-05
1,Build a C++ Programming Project,This project involves C++ Programming and requ...,"['C++ Programming', 'Java Programming']",171000,PLCS0002,2021-10-20,2021-04-28


In [25]:
users_df.loc[0].interests

"['UI/UX Design', 'Android Mobile App Programming', 'C++ Programming']"

In [5]:
#Preprocessing the data we are interested in
user_interests = users_df['interests'].apply(eval).tolist()
project_skills = projects_df['project_skills'].apply(eval).tolist()

#each row is included in a list

In [8]:
user_interests[0:5]

[['UI/UX Design', 'Android Mobile App Programming', 'C++ Programming'],
 ['Machine Learning',
  'Data Analysis',
  'Web Development',
  'C++ Programming',
  'UI/UX Design'],
 ['Java Programming',
  'UI/UX Design',
  'Data Analysis',
  'C++ Programming',
  'Android Mobile App Programming'],
 ['C++ Programming',
  'Data Analysis',
  'Java Programming',
  'UI/UX Design',
  'Machine Learning',
  'Android Mobile App Programming',
  'Web Development'],
 ['Android Mobile App Programming',
  'Machine Learning',
  'Web Development',
  'Data Analysis',
  'UI/UX Design',
  'C++ Programming']]

In [9]:
#Create a list of unique interests and skills
all_interests = list(set([interest for interests in user_interests for interest in interests]))
all_skills = list(set([skill for skills in project_skills for skill in skills]))

In [11]:
all_interests[0:5]

['Data Analysis',
 'UI/UX Design',
 'Web Development',
 'Machine Learning',
 'Java Programming']

In [13]:
#Create numerical vectors for user interests and project skills required
user_vectors = np.zeros((len(user_interests), len(all_interests)))
for i, interests in enumerate(user_interests):
    for interest in interests:
        user_vectors[i, all_interests.index(interest)] = 1

In [14]:
user_vectors

array([[0., 1., 0., ..., 0., 1., 1.],
       [1., 1., 1., ..., 0., 1., 0.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [0., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 0.],
       [0., 1., 1., ..., 0., 0., 1.]])

In [15]:
user_vectors.size

7000

In [16]:
project_vectors = np.zeros((len(project_skills), len(all_skills)))
for i, skills in enumerate(project_skills):
    for skill in skills:
        project_vectors[i, all_skills.index(skill)] = 1

In [17]:
project_vectors

array([[1., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 1., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 1.]])

In [36]:
#Calculate cosine similarity
similarity_matrix = cosine_similarity(user_vectors, project_vectors)

In [37]:
similarity_matrix

array([[0.40824829, 0.40824829, 0.40824829, ..., 0.40824829, 0.40824829,
        0.40824829],
       [0.63245553, 0.31622777, 0.        , ..., 0.31622777, 0.31622777,
        0.        ],
       [0.63245553, 0.63245553, 0.63245553, ..., 0.31622777, 0.31622777,
        0.63245553],
       ...,
       [0.31622777, 0.63245553, 0.63245553, ..., 0.31622777, 0.63245553,
        0.63245553],
       [0.57735027, 0.57735027, 0.28867513, ..., 0.28867513, 0.28867513,
        0.28867513],
       [0.40824829, 0.        , 0.40824829, ..., 0.40824829, 0.81649658,
        0.40824829]])

In [19]:
# Step 4: Sort projects based on cosine similarity scores
sorted_indices = np.argsort(similarity_matrix, axis=1)[:, ::-1]  # Descending order


In [20]:
sorted_indices

array([[423, 357, 286, ..., 347, 722, 499],
       [  0, 759, 309, ..., 791, 794, 999],
       [999, 542, 507, ..., 320, 319, 390],
       ...,
       [999, 508, 517, ..., 671, 673, 499],
       [  0, 270, 279, ...,  77, 906, 923],
       [624, 644, 919, ..., 717, 350, 499]], dtype=int64)

In [21]:
# Step 5: Return top 10 project recommendations for each user
num_recommendations = 10
rec_df = pd.DataFrame(columns=['email', 'recommended_projects'])

for i, user in enumerate(users_df['email']):
    user_recommendations = []
    for j in range(num_recommendations):
        project_index = sorted_indices[i, j]
        project_id = projects_df['project_id'].iloc[project_index]
        user_recommendations.append(project_id)

        #print("Recommendations for user", user, ":", user_recommendations)
    rec_df.loc[i] = [user, user_recommendations]

rec_df.loc[0]
#rec_df.to_csv('rec_cosSim.csv', index=False)

email                                             bianca.hill@example.com
recommended_projects    [PLCS0424, PLCS0358, PLCS0287, PLCS0147, PLCS0...
Name: 0, dtype: object

In [26]:
rec_projects = rec_df.loc[0]

In [35]:
rec_interests = []
for i in rec_projects:
    print (type(i))

<class 'str'>
<class 'list'>


In [None]:
#Calculating performance metrics
#Recall score

ground_truth = []  # Ground truth data for relevant projects
recall = recall_score(ground_truth, recommended_projects, average='micro', zero_division=1)
print("Recall:", recall)

Recall: 0.97
