# In [None]:
# This section implements the scoring algorithm: it computes a similarity percentage
# between each new profile and the profiles database by TF-IDF-vectorizing the
# profile descriptions and comparing them with cosine similarity.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Positional indices of the free-text columns used for scoring.
EXPERIENCE_COL = 3
EDUCATION_COL = 4


def _text_column(df, position):
    """Return the column at *position* of *df* as a list of strings.

    pandas reads blank cells as NaN (a float), which TfidfVectorizer cannot
    tokenize, so missing values are replaced with empty strings and every
    remaining value is coerced to ``str``.
    """
    return df.iloc[:, position].fillna('').astype(str).tolist()


def _mean_similarity_pct(database_texts, new_text):
    """Return the mean cosine similarity, in percent, of *new_text* against
    every entry of *database_texts*.

    The TF-IDF model is fitted on the database texts plus the new text, so
    the new text's terms are part of the vocabulary and of the IDF weights.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform(database_texts + [new_text])
    # The last row is the new text; all preceding rows are the database.
    # Slicing (rather than indexing) keeps the query as a 2-D, one-row matrix.
    scores = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])
    return scores.mean() * 100


# Read the profiles database from a CSV file
profiles_df = pd.read_csv('database-copia.csv', encoding='latin1')

# Read the new profiles from a CSV file
new_profiles_df = pd.read_csv('testing-copia.csv', encoding='latin1')

# Preprocess the data, experience and education separately
profiles_exp = _text_column(profiles_df, EXPERIENCE_COL)
profiles_edu = _text_column(profiles_df, EDUCATION_COL)
new_profiles_exp = _text_column(new_profiles_df, EXPERIENCE_COL)
new_profiles_edu = _text_column(new_profiles_df, EDUCATION_COL)

# Collect one [identifier, score] row per new profile and build the DataFrame
# once at the end instead of growing it row by row.
result_rows = []
for i, (new_profile_exp, new_profile_edu) in enumerate(
        zip(new_profiles_exp, new_profiles_edu)):
    # Experience and education are scored with independently fitted TF-IDF
    # models (each has its own vocabulary).
    similarity_percentage_exp = _mean_similarity_pct(profiles_exp, new_profile_exp)
    similarity_percentage_edu = _mean_similarity_pct(profiles_edu, new_profile_edu)

    # Combine both scores with equal weight; the first column of the new
    # profiles file is used as the profile's label.
    similarity_percentage = (similarity_percentage_exp + similarity_percentage_edu) / 2
    result_rows.append([new_profiles_df.iloc[i, 0], similarity_percentage])

results_df = pd.DataFrame(result_rows, columns=['New Profile', 'Similarity(%)'])

# Sort the DataFrame by similarity percentage in descending order
results_df = results_df.sort_values(by='Similarity(%)', ascending=False)

# Print the similarity results
print(results_df)
