In [221]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler


df=pd.read_csv("exported.csv")
df.head()

Unnamed: 0,counter,short_url,country,region,total_enrollment,total_international_enrollment,ug_enrollment,international_ug_enrollment,pg_enrollment,international_pg_enrollment,rating
0,1,harvard-university,United States,North America,41987,24.0,14467,10.1,27520,20.1,5.0
1,2,stanford-university,United States,North America,16163,24.0,6994,10.7,9169,34.2,5.0
2,3,massachusetts-institute-of-technology-mit,United States,North America,11084,16.8,4276,14.5,6808,18.2,5.0
3,4,university-of-cambridge,United Kingdom,Europe,19875,36.9,12265,23.3,7610,58.9,5.0
4,5,university-of-california-berkeley,United States,North America,39874,17.7,29250,19.0,10624,13.9,5.0


In [216]:

label_encoder = LabelEncoder()
df['encoded_country'] = label_encoder.fit_transform(df['country'])
df['encoded_region'] = label_encoder.fit_transform(df['region'])




vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+(?:-\w+)+\b')
university_vectors = vectorizer.fit_transform(df['short_url']).toarray()
university_df = pd.DataFrame(university_vectors, columns=vectorizer.get_feature_names_out())


numerical_features = ['rating','total_enrollment', 'total_international_enrollment','ug_enrollment','international_ug_enrollment','pg_enrollment','international_pg_enrollment']  # Add more numerical features here...

# Min-max scaling for numerical features
scaler = MinMaxScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Concatenate processed dataframes
df_processed = pd.concat([university_df,df[['encoded_country', 'encoded_region']] ,df[numerical_features]], axis=1)

df_processed.head()




Unnamed: 0,aalborg-university,aalto-university,aarhus-university,adam-mickiewicz-university,addis-ababa-university,agh-university-of-science-and-technology,ain-shams-university,air-force-medical-university,aix-marseille-university,ajou-university,...,zhongnan-university-of-economics-and-law,encoded_country,encoded_region,rating,total_enrollment,total_international_enrollment,ug_enrollment,international_ug_enrollment,pg_enrollment,international_pg_enrollment
0,0,0,0,0,0,0,0,0,0,0,...,0,61,4,1.0,0.189046,0.024,0.044794,0.02752,0.743381,0.014978
1,0,0,0,0,0,0,0,0,0,0,...,0,61,4,1.0,0.072083,0.024,0.021431,0.029155,0.2471,0.025484
2,0,0,0,0,0,0,0,0,0,0,...,0,61,4,1.0,0.049079,0.0168,0.012934,0.03951,0.183249,0.013562
3,0,0,0,0,0,0,0,0,0,0,...,0,60,2,1.0,0.088895,0.0369,0.03791,0.063488,0.204938,0.04389
4,0,0,0,0,0,0,0,0,0,0,...,0,61,4,1.0,0.179475,0.0177,0.091011,0.051771,0.286448,0.010358


In [217]:
df["short_url"]

0                             harvard-university
1                            stanford-university
2      massachusetts-institute-of-technology-mit
3                        university-of-cambridge
4              university-of-california-berkeley
                         ...                    
995                          xinjiang-university
996                       xuzhou-medical-college
997                           yangtze-university
998                           yeshiva-university
999     zhongnan-university-of-economics-and-law
Name: short_url, Length: 1000, dtype: object

In [220]:
# Example: Calculate cosine similarity for the preprocessed dataset
cosine_sim = cosine_similarity(df_processed, df_processed)

# Function to get similar universities by name
def get_similar_universities_by_name(university_name):
    target_university_index = df.index[df['short_url'] == university_name].tolist()[0]
    similar_universities = df.iloc[cosine_sim[target_university_index].argsort()[::-1][1:]]  # Exclude the target university
    return similar_universities

# Example: Get similar universities for a specific university by name
target_university_name = 'university-of-copenhagen'  # Replace with the desired university name
similar_universities = get_similar_universities_by_name(target_university_name)

print(f"Similar universities for {target_university_name} based on cosine similarity:")
similar_universities["short_url"].head(20)

Similar universities for university-of-copenhagen based on cosine similarity:


248           the-university-of-auckland
476                  university-of-otago
493    victoria-university-of-wellington
463             university-of-canterbury
806                       aut-university
840                   lincoln-university
732                    massey-university
862            the-university-of-waikato
40                        psl-university
140               university-of-helsinki
46                   sorbonne-university
66                    university-of-bonn
60                  university-of-munich
193             university-of-strasbourg
262               university-of-bordeaux
188            university-of-montpellier
207     claude-bernard-university-lyon-1
54                 heidelberg-university
14               paris-saclay-university
59        technical-university-of-munich
Name: short_url, dtype: object