In [1]:
!pip install --quiet geocoder
print('pip install geocoder complete.')

pip install geocoder complete.


In [2]:
import geocoder
import os.path
import pandas as pd

CODED = '/kaggle/working/coded.csv'
FIXES = {
    'UniversitÃ© PSL': 'Universite PSL',
    'Fudan University': 'National Fudan University'
}
WORLD = '/kaggle/input/top-100-universities-in-the-world-qs-ranking/top 100 world university 2024.csv'

def get(arg: str):
    json_result = geocoder.arcgis(location=arg).json
    if json_result:
        result = dict(json_result)
        return result['lat'], result['lng']
    else:
        return 0, 0

# geocoding is time-consuming and relies on an external server
# so we don't want to do it more than once if we can help it
if os.path.isfile(path=CODED):
    print('reading geocoded data from {}'.format(CODED))
    df = pd.read_csv(filepath_or_buffer=CODED, index_col=[0])
else:
    print('reading raw data from {}'.format(WORLD))
    df = pd.read_csv(filepath_or_buffer=WORLD, index_col=[0])
    # we need to apply a couple of name fixes to satisfy our ARCGIS server
    df['fixed_university'] = [item if item not in FIXES.keys() else FIXES[item] for item in df['university'].values]
    print('geocoding universities.')
    df['coordinates'] = df['fixed_university'].apply(func=get)
    df['latitude'], df['longitude'] = zip(*df['coordinates'])
    df = df.drop(columns=['fixed_university', 'coordinates'])
    # save the clean data for the next run
    df.to_csv(path_or_buf=CODED)
df.head()

reading raw data from /kaggle/input/top-100-universities-in-the-world-qs-ranking/top 100 world university 2024.csv
geocoding universities.


Unnamed: 0,rank,university,overall_score,academic_reputation,employer_reputation,faculty_student_ratio,citations_per_faculty,international_faculty_ratio,international_students_ratio,international_research_network,employment_outcomes,sustainability,latitude,longitude
0,1,Massachusetts Institute of Technology (MIT),100.0,100.0,100.0,100.0,100.0,100.0,88.2,94.3,100.0,95.2,42.35897,-71.0935
1,2,University of Cambridge,99.2,100.0,100.0,100.0,92.3,100.0,95.8,99.9,100.0,97.3,52.20525,0.11779
2,3,University of Oxford,98.9,100.0,100.0,100.0,90.6,98.2,98.2,100.0,100.0,97.8,51.75467,-1.25489
3,4,Harvard University,98.3,100.0,100.0,98.3,100.0,84.6,66.8,100.0,100.0,96.7,42.37588,-71.1234
4,5,Stanford University,98.1,100.0,100.0,100.0,99.9,99.9,51.2,95.8,100.0,94.4,37.42907,-122.16978


In [3]:
from plotly.express import histogram
for column in df.columns[2:-2]:
    histogram(data_frame=df, x=column).show()

In [4]:
from plotly.express import scatter_geo
scatter_geo(data_frame=df, lat='latitude', lon='longitude', hover_name='university', color='overall_score')

In [5]:
print('{} of the top 100 universities are north of the equator.'.format(len(df[df['latitude'] > 0])))
print('{} of the top 100 universities are east of the Greenwich Meridian'.format(len(df[df['longitude'] > 0])))

91 of the top 100 universities are north of the equator.
51 of the top 100 universities are east of the Greenwich Meridian


In [6]:
from arrow import now
from umap import UMAP
from plotly.express import scatter
umap_start = now()
umap_model = UMAP(n_components=2, random_state=2024, verbose=False, n_jobs=1)
factors_df = pd.DataFrame(data=umap_model.fit_transform(X=df[df.columns[3:12]]), columns=['u0', 'u1'])
factors_df[['overall_score', 'university']] = df[['overall_score', 'university']]
print('UMAP time: {}'.format(now() - umap_start))
scatter(data_frame=factors_df, x='u0', y='u1', color='overall_score', hover_name='university').update_xaxes(showticklabels=False).update_yaxes(showticklabels=False).update_xaxes(visible=False).update_yaxes(visible=False)

UMAP time: 0:00:11.177099


What does this plot tell us? What are our priors? We would expect that all the high-scoring universities would cluster together when we project the higher-dimensional space of factors into two dimensions, and the lower-scoring universities would be dispersed in different ways because while they have lower component scores, they have lower scores in different ways. Instead we see that some universities with overall scores in the eighties cluster near the top-tier schools, while others are closer to schools in the lower tiers. 

This probably just suggests that not all component scores are weighted evenly.

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score


TEST_SIZE = 0.25
RANDOM_STATE = 2024

X_train, X_test, y_train, y_test = train_test_split(df[df.columns[3:12]].copy(), df[df.columns[2]].values,
                                                    test_size=TEST_SIZE, random_state=RANDOM_STATE)
model = LinearRegression()
model.fit(X=X_train, y=y_train)
print('score: {:9.6f}, intercept: {:5.2f}'.format(model.score(X=X_train, y=y_train), model.intercept_))
print('R2: {:8.6f}'.format(r2_score(y_true=y_test, y_pred=model.predict(X=X_test))))
scatter(x=y_test, y=y_test - model.predict(X=X_test),).show()

score:  0.999994, intercept: -0.45
R2: 0.999993


Our errors seem pretty random especially if we look at multiple cases, varying both the test fraction and the random state.

In [8]:
# now let's look at the coefficients our little model gives us
coefficients_df = pd.DataFrame(model.coef_, columns=['coefficient'], index=X_train.columns)
histogram(data_frame=coefficients_df.reset_index(), x='index', y='coefficient')

We can eyeball these and they all look like multiples of five percent. But if we try to reproduce the overall score with these weights we still see small errors.

In [9]:
from plotly.express import line
weight_df = pd.DataFrame(pd.Series([0.3, 0.15, 0.1, 0.2, 0.05, 0.05, 0.05, 0.05, 0.05], index=df.columns[3:12], name=0))

post_hoc_df = df[df.columns[3:12]].dot(weight_df).rename(columns={0: 'post hoc'})
post_hoc_df['overall_score'] = df['overall_score'].copy()
post_hoc_df['model'] = df[df.columns[3:12]].dot(coefficients_df.rename(columns={'coefficient': 0}))
line(data_frame=post_hoc_df, y=post_hoc_df.columns, )

Our post hoc estimates give bigger errors for the top schools than for the small schools.

In [10]:
post_hoc_df.head()

Unnamed: 0,post hoc,overall_score,model
0,98.885,100.0,100.444977
1,98.11,99.2,99.658721
2,97.83,98.9,99.374914
3,97.235,98.3,98.767616
4,97.045,98.1,98.570726


We see two things here: one is that the top score is exactly 100; this may suggest that the factor scores are weighted and then normalized to make the top score 100. The other is that the overall score is reported to one decimal place, as are the component scores; this introduces some rounding error. We're going to let that little mystery stay a mystery.