In [2]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler


df=pd.read_csv("exported.csv")
df.head()

Unnamed: 0,counter,short_url,country,region,total_enrollment,total_international_enrollment,ug_enrollment,international_ug_enrollment,pg_enrollment,international_pg_enrollment,rating
0,1,harvard-university,United States,North America,41987,24.0,14467,10.1,27520,20.1,5.0
1,2,stanford-university,United States,North America,16163,24.0,6994,10.7,9169,34.2,5.0
2,3,massachusetts-institute-of-technology-mit,United States,North America,11084,16.8,4276,14.5,6808,18.2,5.0
3,4,university-of-cambridge,United Kingdom,Europe,19875,36.9,12265,23.3,7610,58.9,5.0
4,5,university-of-california-berkeley,United States,North America,39874,17.7,29250,19.0,10624,13.9,5.0


In [216]:

label_encoder = LabelEncoder()
df['encoded_country'] = label_encoder.fit_transform(df['country'])
df['encoded_region'] = label_encoder.fit_transform(df['region'])




vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+(?:-\w+)+\b')
university_vectors = vectorizer.fit_transform(df['short_url']).toarray()
university_df = pd.DataFrame(university_vectors, columns=vectorizer.get_feature_names_out())


numerical_features = ['rating','total_enrollment', 'total_international_enrollment','ug_enrollment','international_ug_enrollment','pg_enrollment','international_pg_enrollment']  # Add more numerical features here...

# Min-max scaling for numerical features
scaler = MinMaxScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Concatenate processed dataframes
df_processed = pd.concat([university_df,df[['encoded_country', 'encoded_region']] ,df[numerical_features]], axis=1)

df_processed.head()




Unnamed: 0,aalborg-university,aalto-university,aarhus-university,adam-mickiewicz-university,addis-ababa-university,agh-university-of-science-and-technology,ain-shams-university,air-force-medical-university,aix-marseille-university,ajou-university,...,zhongnan-university-of-economics-and-law,encoded_country,encoded_region,rating,total_enrollment,total_international_enrollment,ug_enrollment,international_ug_enrollment,pg_enrollment,international_pg_enrollment
0,0,0,0,0,0,0,0,0,0,0,...,0,61,4,1.0,0.189046,0.024,0.044794,0.02752,0.743381,0.014978
1,0,0,0,0,0,0,0,0,0,0,...,0,61,4,1.0,0.072083,0.024,0.021431,0.029155,0.2471,0.025484
2,0,0,0,0,0,0,0,0,0,0,...,0,61,4,1.0,0.049079,0.0168,0.012934,0.03951,0.183249,0.013562
3,0,0,0,0,0,0,0,0,0,0,...,0,60,2,1.0,0.088895,0.0369,0.03791,0.063488,0.204938,0.04389
4,0,0,0,0,0,0,0,0,0,0,...,0,61,4,1.0,0.179475,0.0177,0.091011,0.051771,0.286448,0.010358


In [217]:
df["short_url"]

0                             harvard-university
1                            stanford-university
2      massachusetts-institute-of-technology-mit
3                        university-of-cambridge
4              university-of-california-berkeley
                         ...                    
995                          xinjiang-university
996                       xuzhou-medical-college
997                           yangtze-university
998                           yeshiva-university
999     zhongnan-university-of-economics-and-law
Name: short_url, Length: 1000, dtype: object

In [220]:
# Example: Calculate cosine similarity for the preprocessed dataset
cosine_sim = cosine_similarity(df_processed, df_processed)

# Function to get similar universities by name
def get_similar_universities_by_name(university_name):
    target_university_index = df.index[df['short_url'] == university_name].tolist()[0]
    similar_universities = df.iloc[cosine_sim[target_university_index].argsort()[::-1][1:]]  # Exclude the target university
    return similar_universities

# Example: Get similar universities for a specific university by name
target_university_name = 'university-of-copenhagen'  # Replace with the desired university name
similar_universities = get_similar_universities_by_name(target_university_name)

print(f"Similar universities for {target_university_name} based on cosine similarity:")
similar_universities["short_url"].head(20)

Similar universities for university-of-copenhagen based on cosine similarity:


248           the-university-of-auckland
476                  university-of-otago
493    victoria-university-of-wellington
463             university-of-canterbury
806                       aut-university
840                   lincoln-university
732                    massey-university
862            the-university-of-waikato
40                        psl-university
140               university-of-helsinki
46                   sorbonne-university
66                    university-of-bonn
60                  university-of-munich
193             university-of-strasbourg
262               university-of-bordeaux
188            university-of-montpellier
207     claude-bernard-university-lyon-1
54                 heidelberg-university
14               paris-saclay-university
59        technical-university-of-munich
Name: short_url, dtype: object

In [6]:
itemuserdata=pd.read_csv("item_user.csv")
itemuserdata.fillna(0,inplace=True)
itemuserdata.set_index("university_id",inplace=True)
itemuserdata.head(12)


Unnamed: 0_level_0,dc03c1f3-b756-48ad-9881-a49157ccebe1,dc03c1f3-b756-48ad-9881-a49157ccebe2,dc03c1f3-b756-48ad-9881-a49157ccebe3,dc03c1f3-b756-48ad-9881-a49157ccebe4,dc03c1f3-b756-48ad-9881-a49157ccebe5,dc03c1f3-b756-48ad-9881-a49157ccebe6,dc03c1f3-b756-48ad-9881-a49157ccebe7,dc03c1f3-b756-48ad-9881-a49157ccebe8,dc03c1f3-b756-48ad-9881-a49157ccebe9,dc03c1f3-b756-48ad-9881-a49157ccebe10,dc03c1f3-b756-48ad-9881-a49157ccebe11,dc03c1f3-b756-48ad-9881-a49157ccebe21
university_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
5c7d4d9f-ea4d-43c9-9add-fcccb691b368,0,1,0,1,0,2,0,2,0,1,0,0
5247abb3-21f5-465b-8909-98fa81daa089,2,2,0,1,2,1,0,1,1,2,0,0
689832e4-924e-4ed9-930b-f8905af831bf,0,0,2,2,2,2,2,0,2,2,0,0
1cdcc453-9511-4c79-9774-7b23a0b9184f,0,2,1,0,2,1,0,1,0,1,0,0
9a9f31e0-1faa-49c2-9532-2fa0ae7f9794,2,2,0,1,2,2,1,2,2,2,0,0
0f9ef6ab-1b4b-433a-93a5-ce326a77040e,0,0,2,0,1,2,2,2,1,0,0,0
1ab9af5e-c5d7-4d39-be39-0c2da3b482e2,2,0,1,1,2,2,1,1,0,1,0,0
7c58b43b-12e2-48ca-bd2b-e6c90fb0a819,0,0,2,0,0,0,2,2,0,0,0,0
a1287bb5-f191-4ae5-a5a0-71e8311a0767,0,0,0,2,1,2,2,1,1,1,0,0
28794f8b-db90-4d14-85b5-5d960558d95e,0,0,0,0,2,0,2,1,1,1,0,0


In [286]:
itemuserdata=itemuserdata.transpose()
itemuserdata

In [7]:
user_similarity=itemuserdata.corr(method="pearson")

In [8]:

# Choose a new user for whom you want to generate city recommendations
target_user = 'dc03c1f3-b756-48ad-9881-a49157ccebe5'
target_index=itemuserdata.columns.get_loc(target_user)


similar_users = user_similarity[target_user].drop(target_user).sort_values(ascending=False).index   

# Initialize an empty DataFrame to store the recommended cities
recommended_universities=pd.DataFrame(columns=['university','count'])
# Iterate through similar users and find their visited cities that the target user hasn't visited
for user in similar_users:
    temp_df = pd.DataFrame({'university': itemuserdata[user].index, 'count': itemuserdata[user].values})
    recommended_universities = pd.concat([recommended_universities, temp_df], ignore_index=True)

recommended_universities.sort_values(by="count",ascending=False,inplace=True)
recommended_universities["university"][:9]


4123    4f425a97-4897-43f9-8714-b349277ea26f
2720    2a9c1d8a-f13b-43d7-ac0d-5c4d7b8675b6
6168    e0a0b174-d13c-47a5-9ecf-d053eca29ab5
2724    97c757bb-6f41-4645-806f-4819eba3976a
6164    d55f463e-98de-4f36-b42b-8e1ecd6046d3
2728    c6ae4b7d-921c-40a3-8e5c-119f0099b911
8943    90fe5bb6-f9b1-429e-84f0-3ee9bafe999c
6160    faddd992-e7a6-4474-accd-341070b8d02a
8945    e46bfade-d92f-4af9-82d2-6f0e1d258b70
Name: university, dtype: object