In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [13]:
# Load the university dataset from the CSV in the working directory, then
# show its (rows, columns) shape and the first few records as a sanity check.
csvPath = "university_data.csv"
df = pd.read_csv(csvPath)
print(df.shape)
df.head()

(1534, 10)


Unnamed: 0,Name,Ranking,Ranking display,Applicants total,Admissions total,Enrolled total,Tuition and fees,Control of institution,Total price for in-state students living on campus,Total price for out-of-state students living on campus
0,Alabama A & M University,1501+,1500+,6142.0,5521.0,1104.0,7182.0,Public,21849.0,27441.0
1,University of Alabama at Birmingham,1501+,1500+,5689.0,4934.0,1773.0,7206.0,Public,22495.0,31687.0
2,Amridge University,1501+,1500+,,,,6870.0,Private not-for-profit,,
3,University of Alabama in Huntsville,1501+,1500+,2054.0,1656.0,651.0,9192.0,Public,23466.0,35780.0
4,Alabama State University,1501+,1500+,10245.0,5251.0,1479.0,8720.0,Public,18286.0,25222.0


In [14]:
# Keep only fully populated rows: a row is discarded as soon as any one of
# its columns is NaN. Per the printed shapes this removes 208 rows
# (1534 -> 1326); the original note says most of them are ranked 1500+.
df = df.dropna(axis="index", how="any")
print(df.shape)

(1326, 10)


### Correlation Matrix for top 1500 universities

In [15]:
# Select the universities labelled 'top 1500' and drop the name, control and
# ranking columns so that only the numeric columns feed the correlation.
labelColumns = ['Name', 'Control of institution', 'Ranking', 'Ranking display']
isTopUni = df['Ranking display'] == 'top 1500'
topUniDf = df.loc[isTopUni].drop(columns=labelColumns)

# Pairwise Pearson correlation, rendered as a heat-mapped table.
corrMatrixTopUni = topUniDf.corr(method='pearson')
corrMatrixTopUni.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Applicants total,Admissions total,Enrolled total,Tuition and fees,Total price for in-state students living on campus,Total price for out-of-state students living on campus
Applicants total,1.0,0.494714,0.437996,0.203781,0.226531,0.33494
Admissions total,0.494714,1.0,0.776986,-0.409575,-0.395452,-0.279795
Enrolled total,0.437996,0.776986,1.0,-0.578793,-0.576473,-0.420493
Tuition and fees,0.203781,-0.409575,-0.578793,1.0,0.994984,0.939693
Total price for in-state students living on campus,0.226531,-0.395452,-0.576473,0.994984,1.0,0.947792
Total price for out-of-state students living on campus,0.33494,-0.279795,-0.420493,0.939693,0.947792,1.0


### Correlation Matrix for universities ranked 1500+

In [16]:
# Select the universities labelled '1500+' and drop the name, control and
# ranking columns so that only the numeric columns feed the correlation.
droppedColumns = ['Name', 'Control of institution', 'Ranking', 'Ranking display']
isNonTopUni = df['Ranking display'].eq('1500+')
nonTopUniDf = df.loc[isNonTopUni].drop(columns=droppedColumns)

# Pairwise Pearson correlation, rendered as a heat-mapped table.
corrMatrixNonTopUni = nonTopUniDf.corr(method='pearson')
corrMatrixNonTopUni.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Applicants total,Admissions total,Enrolled total,Tuition and fees,Total price for in-state students living on campus,Total price for out-of-state students living on campus
Applicants total,1.0,0.893984,0.795451,-0.127511,-0.058593,0.184762
Admissions total,0.893984,1.0,0.881048,-0.193764,-0.128237,0.123352
Enrolled total,0.795451,0.881048,1.0,-0.364408,-0.311752,-0.041665
Tuition and fees,-0.127511,-0.193764,-0.364408,1.0,0.983303,0.858547
Total price for in-state students living on campus,-0.058593,-0.128237,-0.311752,0.983303,1.0,0.907902
Total price for out-of-state students living on campus,0.184762,0.123352,-0.041665,0.858547,0.907902,1.0


### Correlation Matrix for all universities

In [17]:
def modifyRanking(ranking):
    """Convert a ranking label into a single integer rank.

    Label shapes handled:
      * a range such as '601–800'  -> floor of the midpoint (700)
      * an open bucket such as '1501+' -> the lower bound (1501)
      * a plain rank such as '12'  -> that rank (12)

    Every string branch returns an ``int``, so callers receive one consistent
    type rather than a mix of ints and strings. Ranges may be written with an
    en dash or an ASCII hyphen, and surrounding whitespace is ignored. A value
    that is not a string is returned unchanged.

    Args:
        ranking: the ranking label (normally a string).

    Returns:
        The rank as an ``int`` for string input; the input itself otherwise.

    Raises:
        ValueError: if a string label does not contain parseable integers.
    """
    if not isinstance(ranking, str):
        return ranking

    label = ranking.strip()

    # A range is two non-empty bounds around a dash; requiring both sides
    # keeps a leading minus sign from being mistaken for a separator.
    for dash in ('–', '-'):
        bounds = label.split(dash)
        if len(bounds) > 1 and bounds[0].strip() and bounds[1].strip():
            return (int(bounds[0]) + int(bounds[1])) // 2

    # Open-ended bucket ('1501+') or plain rank: drop any trailing '+'.
    return int(label.rstrip('+'))

# Build the all-universities frame: drop the text-only columns but keep
# 'Ranking', converting each label to a number with modifyRanking.
# Series.map runs on the single column that is needed instead of applying a
# lambda across every full row of the frame.
allUniWithRankDf = df.drop(columns=['Name', 'Control of institution', 'Ranking display'])
allUniWithRankDf['Ranking'] = pd.to_numeric(df['Ranking'].map(modifyRanking))

# Pairwise Pearson correlation (now including Ranking), shown as a heat map.
corrMatrixAllUniWithRank = allUniWithRankDf.corr(method='pearson')
corrMatrixAllUniWithRank.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Ranking,Applicants total,Admissions total,Enrolled total,Tuition and fees,Total price for in-state students living on campus,Total price for out-of-state students living on campus
Ranking,1.0,-0.491359,-0.369911,-0.429239,-0.101574,-0.132223,-0.26777
Applicants total,-0.491359,1.0,0.853196,0.783967,-0.045747,0.02372,0.275459
Admissions total,-0.369911,0.853196,1.0,0.883784,-0.194212,-0.129273,0.142301
Enrolled total,-0.429239,0.783967,0.883784,1.0,-0.340887,-0.28701,0.008601
Tuition and fees,-0.101574,-0.045747,-0.194212,-0.340887,1.0,0.984668,0.855954
Total price for in-state students living on campus,-0.132223,0.02372,-0.129273,-0.28701,0.984668,1.0,0.903778
Total price for out-of-state students living on campus,-0.26777,0.275459,0.142301,0.008601,0.855954,0.903778,1.0
