USER - USER CF - we can apply covarriance correlation function

In [1]:
import pandas as pd

# Load user data into a pandas DataFrame
user_data = pd.read_csv("user_user_with_toefl_score.csv")

# Apply one-hot encoding on 'univName' and 'major' columns
one_hot_univ = pd.get_dummies(user_data['univName'])
one_hot_major = pd.get_dummies(user_data['major'])

# Concatenate the original DataFrame with the one-hot encoded DataFrames
user_data_encoded = pd.concat([user_data, one_hot_univ, one_hot_major], axis=1)

# Define aggregation functions for grouping by 'userName'
agg_funcs = {col: 'max' for col in one_hot_univ.columns}  # Take max for one-hot encoded university columns
agg_funcs.update({col: 'max' for col in one_hot_major.columns})  # Update for one-hot encoded major columns
for col in user_data.columns:
    if col not in ['userName'] and col not in one_hot_univ.columns and col not in one_hot_major.columns:
        agg_funcs[col] = 'first'  # Take the first value for other non-encoded columns

# Group by 'userName' and apply aggregation functions
user_combined_data = user_data_encoded.groupby('userName').agg(agg_funcs).reset_index()

# Print the combined DataFrame
print(user_combined_data)


            userName  Arizona State University  \
0         ! Superman                         0   
1            !mpulse                         1   
2               $4R4                         1   
3             $aumil                         0   
4            (:)_(:)                         0   
...              ...                       ...   
13384       zulu.sud                         0   
13385       zuperman                         0   
13386    zzkaustavzz                         0   
13387          zztop                         1   
13388  ~rattlesnake~                         0   

       California Institute of Technology  Carnegie Mellon University  \
0                                       1                           0   
1                                       0                           0   
2                                       0                           0   
3                                       0                           1   
4                                 

In [2]:
print(user_combined_data.columns)

Index(['userName', 'Arizona State University',
       'California Institute of Technology', 'Carnegie Mellon University',
       'Clemson University', 'Columbia University', 'Cornell University',
       'George Mason University', 'Georgia Institute of Technology',
       'Harvard University',
       ...
       'toeflScore', 'toeflEssay', 'internExp', 'greV', 'greQ', 'journalPubs',
       'confPubs', 'cgpa', 'univName', 'admit'],
      dtype='object', length=112)


In [3]:
user_data_new = user_combined_data.drop(columns=['univName', 'admit', 'major'])
print(user_data_new[user_data_new['userName'] == "aditya57"])

      userName  Arizona State University  California Institute of Technology  \
2281  aditya57                         0                                   0   

      Carnegie Mellon University  Clemson University  Columbia University  \
2281                           0                   0                    0   

      Cornell University  George Mason University  \
2281                   0                        0   

      Georgia Institute of Technology  Harvard University  ...  researchExp  \
2281                                0                   0  ...            0   

      industryExp  toeflScore  toeflEssay  internExp   greV   greQ  \
2281           35       110.0        28.0        3.0  163.0  166.0   

      journalPubs  confPubs   cgpa  
2281            0         0  0.823  

[1 rows x 109 columns]


In [4]:
import numpy as np
user_data_new.fillna(0, inplace=True)  # Replace NaN values with 0
user_data_new.replace([np.inf, -np.inf], 0, inplace=True)  # Replace infinite values with 0


In [5]:
print(user_data_new[user_data_new['userName'] == '143saf'].values.flatten())
print(user_data_new[user_data_new['userName'] == 'AB25'].values.flatten())

['143saf' 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 18 112.0 26.0
 5.0 160.0 167.0 0 0 0.85]
['AB25' 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 66 94.0 21.0
 0.0 146.0 157.0 0 0 0.7828]


In [10]:
user_data_new['userid'] = [i for i, _ in enumerate(user_data_new['userName'], start=1)]
print(user_data_new[user_data_new['userName'] == '143saf'].values.flatten())
print(user_data_new[user_data_new['userName'] == 'AB25'].values.flatten())

['143saf' 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 18 112.0 26.0
 5.0 160.0 167.0 0 0 0.85 26]
['AB25' 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 66 94.0 21.0
 0.0 146.0 157.0 0 0 0.7828 60]


In [11]:

# for user1 in user_data_new['userName']:
#     for user2 in user_data_new['userName']:
# #         print(data_user1.shape, data_user2.shape,  user1)
#         if user1 != user2:
# #             print(user1, user2, user_data_new[user_data_new['userName']== user1].shape,user_data_new[user_data_new['userName']== user2].shape )
# #             break
#             data_user1 = user_data_new.loc[user_data_new['userName'] == user1, user_data_new.columns != 'userName'].values.flatten()
#             data_user2 = user_data_new.loc[user_data_new['userName'] == user2, user_data_new.columns != 'userName'].values.flatten()
# #             print(data_user1, user1, data_user2)

#             correlation_coefficient, _ = pearsonr(data_user1, data_user2)
#             correlation_matrix.loc[user1, user2] = correlation_coefficient

# # Filter out NaN values (non-matching users) from the correlation matrix
# correlation_matrix = correlation_matrix.dropna()

user_data_without_name = user_data_new.drop('userName', axis=1)

# Calculate the correlation matrix using np.corrcoef()
correlation_matrix = np.corrcoef(user_data_without_name, rowvar=False)

# Display or further process the correlation matrix
print(correlation_matrix)


[[ 1.00000000e+00 -2.63882639e-02 -7.65335016e-02 ...  4.34292786e-02
   3.93741356e-03  3.92448634e-03]
 [-2.63882639e-02  1.00000000e+00 -4.15912394e-03 ...  5.44762246e-03
   4.02962477e-04 -4.42586470e-04]
 [-7.65335016e-02 -4.15912394e-03  1.00000000e+00 ...  6.06010026e-03
   8.36839497e-03 -5.04369540e-04]
 ...
 [ 4.34292786e-02  5.44762246e-03  6.06010026e-03 ...  1.00000000e+00
   9.54172715e-03  1.10650037e-02]
 [ 3.93741356e-03  4.02962477e-04  8.36839497e-03 ...  9.54172715e-03
   1.00000000e+00  8.51191053e-03]
 [ 3.92448634e-03 -4.42586470e-04 -5.04369540e-04 ...  1.10650037e-02
   8.51191053e-03  1.00000000e+00]]


In [14]:
# Perform user-user collaborative filtering for a target user
target_user_id = user_data_new.loc[user_data_new['userName'] == '143saf', 'userid'].iloc[0]
print("User ID for '143saf':", target_user_id)
similar_users = correlation_matrix[target_user_id]
print("Similar users for", target_user, ":", similar_users)


User ID for '143saf': 26
Similar users for 143saf : [ 1.14912691e-03  4.00494623e-02  1.12636793e-02 -1.15847357e-02
  1.63447780e-02  3.84285191e-02 -1.59087613e-02  2.62527289e-02
 -3.36987151e-03 -3.60099570e-03  1.36417260e-02 -1.73242915e-02
  1.63447780e-02  1.00293128e-02  5.26776262e-04  2.11627481e-02
  3.83245031e-02  7.63312030e-03  3.27635206e-02  2.84731964e-02
 -9.74422516e-03  4.26052391e-02  4.34708230e-02 -7.11289171e-03
  1.28076648e-02 -4.85665985e-03  1.00000000e+00  8.59971494e-02
  6.25543593e-02  6.49526773e-02  4.94947942e-02  3.83233220e-02
 -2.04474788e-03  6.66645558e-03  9.78725969e-03  2.66061022e-03
  2.39867549e-02  8.15969739e-03  2.87894439e-02  1.48818197e-02
  1.40985085e-02  4.13849945e-02 -1.73006092e-02  2.22865227e-02
  2.01112456e-02 -1.68276737e-02  2.31198915e-02 -8.05314906e-04
  1.57482471e-04  2.01719455e-02  9.15539696e-03  2.90383753e-02
 -9.97222748e-03 -9.09442665e-03 -7.31426759e-04 -4.70205137e-03
  2.39731510e-02  9.11099364e-03  3.12

In [25]:
top_k_indices = np.argsort(similar_users)[::-1][:5]
top_k_users = [user_data_new.iloc[i]['userName'] for i in top_k_indices]
print(top_k_users)

['1987frank', '198921', '19arjun89', '1990', 'ARK']


In [27]:
pip install sentence-transformers

Collecting sentence-transformers
  Obtaining dependency information for sentence-transformers from https://files.pythonhosted.org/packages/76/2c/bd95032aeb087b0706596af0a4518c4bfe0439a1bb149048ece18b617766/sentence_transformers-2.7.0-py3-none-any.whl.metadata
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Collecting transformers<5.0.0,>=4.34.0 (from sentence-transformers)
  Obtaining dependency information for transformers<5.0.0,>=4.34.0 from https://files.pythonhosted.org/packages/09/c8/844d5518a6aeb4ffdc0cf0cae65ae13dbe5838306728c5c640b5a6e2a0c9/transformers-4.40.0-py3-none-any.whl.metadata
  Downloading transformers-4.40.0-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.6/137.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting huggingface-hub>=0.15.1 (from sentence-transformers)
  Obtaining dependency information for huggingface-hub>=0.15.1 from https://files.pythonhosted.org/

In [31]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load BERT model
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Sample dataset (replace this with your actual dataset)
# data = {
#     'userName': ['143saf', 'AB25', 'abhijitgang', 'agteakash', 'alankarMIS'],
#     'major': ['Robotics', 'MIS', 'MIS', 'MIS', 'MIS'],
#     'researchExp': [0, 0, 0, 0, 0],
#     'industryExp': [18, 66, 0, 0, 0],
#     'internExp': [5.0, 0.0, 0.0, 0.0, 0.0],
#     'greV': [160.0, 146.0, 89.25, 150.0, 147.0],
#     'greQ': [167.0, 157.0, 163.625, 161.0, 156.0],
#     'journalPubs': [0, 0, 0, 0, 0],
#     'confPubs': [0, 0, 0, 0, 0],
#     'cgpa': [0.85, 0.7828, 0.57, 0.622, 0.52],
#     'univName': ['MIT', 'Stanford', 'Harvard', 'Caltech', 'Carnegie Mellon'],
#     'admit': [1, 1, 1, 1, 1]
# }

data = pd.read_csv('/Users/shwetimasakshi/Desktop/user_user_wo_toefl_score.csv')
df = pd.DataFrame(data)
print('data loaded')

# Function to generate BERT embeddings for textual data
def generate_bert_embeddings(texts):
    return model.encode(texts)

# Calculate BERT embeddings for applicant profiles
applicant_profiles = generate_bert_embeddings(df['userName'])
print('applicant_profiles')
# Calculate similarity matrix between applicant profiles
similarity_matrix = cosine_similarity(applicant_profiles)

# Recommend universities for each applicant based on similar applicants' admissions
recommendations = {}
for i, user in enumerate(df['userName']):
    similar_users_indices = np.argsort(similarity_matrix[i])[::-1][1:]  # Exclude self
    similar_users_admissions = df.iloc[similar_users_indices]['univName'].unique()
    recommendations[user] = similar_users_admissions

# Print recommendations for each applicant
for user, univs in recommendations.items():
    print(f"Recommendations for {user}: {', '.join(univs)}")


data loaded
applicant_profiles


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

