In [17]:
#Imports
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
# Step 1: Load data from Users.csv and Projects.csv

# Load user data
users = []
with open('Users.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # Skip the header row
    for row in reader:
        email = row[0]
        skills = row[1]
        users.append((email, skills))

In [19]:
users

[('user245@example.com', "['C++ Programming']"),
 ('user904@example.com',
  "['C++ Programming', 'UI/UX', 'Algorithm', 'Software Engineering']"),
 ('user119@example.com',
  "['Web Application Development', 'UI/UX', 'C++ Programming', 'Algorithm', 'Software Engineering', 'Data Structures']"),
 ('user506@example.com', "['Algorithm']"),
 ('user678@example.com',
  "['Algorithm', 'UI/UX', 'Software Engineering', 'C++ Programming', 'Web Application Development', 'Data Structures']"),
 ('user851@example.com',
  "['UI/UX', 'Software Engineering', 'C++ Programming', 'Web Application Development']"),
 ('user798@example.com', "['Web Application Development', 'UI/UX']"),
 ('user738@example.com',
  "['Algorithm', 'UI/UX', 'Software Engineering', 'Data Structures', 'Web Application Development', 'C++ Programming']"),
 ('user976@example.com',
  "['C++ Programming', 'UI/UX', 'Algorithm', 'Data Structures', 'Web Application Development']"),
 ('user869@example.com', "['Web Application Development', 'Alg

In [20]:
# Load project data
projects = []
with open('Projects.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # Skip the header row
    for row in reader:
        project_id = row[0]
        skills = row[1]
        projects.append((project_id, skills))


In [21]:
projects

[('project-1',
  "['C++ Programming', 'Software Engineering', 'Web Application Development', 'Data Structures', 'Algorithm', 'UI/UX']"),
 ('project-2', "['C++ Programming', 'Algorithm']"),
 ('project-3', "['UI/UX', 'C++ Programming']"),
 ('project-4', "['C++ Programming']"),
 ('project-5',
  "['Algorithm', 'UI/UX', 'Software Engineering', 'C++ Programming', 'Web Application Development']"),
 ('project-6',
  "['Data Structures', 'C++ Programming', 'Algorithm', 'Software Engineering', 'UI/UX', 'Web Application Development']"),
 ('project-7', "['C++ Programming']"),
 ('project-8', "['Data Structures', 'Web Application Development']"),
 ('project-9', "['Software Engineering']"),
 ('project-10', "['Web Application Development', 'C++ Programming']"),
 ('project-11', "['Algorithm', 'C++ Programming']"),
 ('project-12', "['Data Structures']"),
 ('project-13', "['Web Application Development', 'Software Engineering']"),
 ('project-14', "['C++ Programming', 'Web Application Development']"),
 ('pr

In [22]:
# Step 2: Preprocess data and compute cosine similarity matrix

# Preprocess skills for CountVectorizer
all_skills = [user[1] for user in users] + [project[1] for project in projects]


In [23]:
all_skills

["['C++ Programming']",
 "['C++ Programming', 'UI/UX', 'Algorithm', 'Software Engineering']",
 "['Web Application Development', 'UI/UX', 'C++ Programming', 'Algorithm', 'Software Engineering', 'Data Structures']",
 "['Algorithm']",
 "['Algorithm', 'UI/UX', 'Software Engineering', 'C++ Programming', 'Web Application Development', 'Data Structures']",
 "['UI/UX', 'Software Engineering', 'C++ Programming', 'Web Application Development']",
 "['Web Application Development', 'UI/UX']",
 "['Algorithm', 'UI/UX', 'Software Engineering', 'Data Structures', 'Web Application Development', 'C++ Programming']",
 "['C++ Programming', 'UI/UX', 'Algorithm', 'Data Structures', 'Web Application Development']",
 "['Web Application Development', 'Algorithm']",
 "['Software Engineering']",
 "['Web Application Development', 'C++ Programming', 'Software Engineering', 'Algorithm']",
 "['UI/UX', 'Web Application Development']",
 "['Data Structures', 'UI/UX', 'Web Application Development', 'C++ Programming', 'So

In [24]:
# Create CountVectorizer and fit-transform the skills
vectorizer = CountVectorizer()
skills_matrix = vectorizer.fit_transform(all_skills)

In [25]:
# Compute cosine similarity matrix
user_skills = skills_matrix[:len(users)]
project_skills = skills_matrix[len(users):]
cosine_sim_matrix = cosine_similarity(user_skills, project_skills)

In [26]:
user_skills

<100x11 sparse matrix of type '<class 'numpy.int64'>'
	with 620 stored elements in Compressed Sparse Row format>

In [27]:
project_skills

<100x11 sparse matrix of type '<class 'numpy.int64'>'
	with 590 stored elements in Compressed Sparse Row format>

In [28]:
cosine_sim_matrix

array([[0.30151134, 0.70710678, 0.57735027, ..., 0.        , 0.37796447,
        0.        ],
       [0.73854895, 0.57735027, 0.70710678, ..., 0.4330127 , 0.6172134 ,
        0.54772256],
       [1.        , 0.42640143, 0.52223297, ..., 0.85280287, 0.79772404,
        0.67419986],
       ...,
       [0.60302269, 0.70710678, 0.28867513, ..., 0.53033009, 0.37796447,
        0.67082039],
       [1.        , 0.42640143, 0.52223297, ..., 0.85280287, 0.79772404,
        0.67419986],
       [0.52223297, 0.        , 0.        , ..., 0.61237244, 0.65465367,
        0.        ]])

In [29]:
# Step 3: Generate recommendations and write to Cosine_sim.csv

# Write recommendations to Cosine_sim.csv
with open('Cosine_sim.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Email', 'Recommended Projects'])

    for user_index, user in enumerate(users):
        email = user[0]

        # Get cosine similarity scores for the user
        user_sim_scores = cosine_sim_matrix[user_index]

        # Sort projects based on similarity scores
        sorted_indices = user_sim_scores.argsort()[::-1]

        # Get top 15 recommended projects
        num_projects = min(15, len(projects))
        recommended_projects = [projects[i][0] for i in sorted_indices[:num_projects]]

        writer.writerow([email, recommended_projects])