In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the university dataset from CSV, print its (rows, columns) shape,
# and show the first five rows as a preview.
df = pd.read_csv(filepath_or_buffer="university_data.csv")
print(df.shape)
df.head(n=5)

(1534, 10)


Unnamed: 0,Name,Ranking,Ranking display,Applicants total,Admissions total,Enrolled total,Tuition and fees,Control of institution,Total price for in-state students living on campus,Total price for out-of-state students living on campus
0,Alabama A & M University,1501+,1500+,6142.0,5521.0,1104.0,7182.0,Public,21849.0,27441.0
1,University of Alabama at Birmingham,1501+,1500+,5689.0,4934.0,1773.0,7206.0,Public,22495.0,31687.0
2,Amridge University,1501+,1500+,,,,6870.0,Private not-for-profit,,
3,University of Alabama in Huntsville,1501+,1500+,2054.0,1656.0,651.0,9192.0,Public,23466.0,35780.0
4,Alabama State University,1501+,1500+,10245.0,5251.0,1479.0,8720.0,Public,18286.0,25222.0


In [4]:
# Discard every row that contains at least one NaN value
# (removes ~210 rows; per the original note, mostly from Ranking 1500+).
df = df.dropna(axis="index", how="any")
print(df.shape)

(1326, 10)


In [3]:
# Ratio of enrolled students to total applicants, stored as a fraction
# (e.g. 0.18) rather than a 0-100 percentage.
# NOTE(review): the column is called "Admission Percentage" but is computed
# from "Enrolled total", not "Admissions total" - confirm which ratio is intended.
df["Admission Percentage"] = df["Enrolled total"].div(df["Applicants total"])

In [4]:
# Preview the first five rows, now including "Admission Percentage".
# NOTE(review): the saved output still lists a row full of NaN values, so it
# appears to predate the dropna() cell - refresh via Restart & Run All.
df.head(n=5)

Unnamed: 0,Name,Ranking,Ranking display,Applicants total,Admissions total,Enrolled total,Tuition and fees,Control of institution,Total price for in-state students living on campus,Total price for out-of-state students living on campus,Admission Percentage
0,Alabama A & M University,1501+,1500+,6142.0,5521.0,1104.0,7182.0,Public,21849.0,27441.0,0.179746
1,University of Alabama at Birmingham,1501+,1500+,5689.0,4934.0,1773.0,7206.0,Public,22495.0,31687.0,0.311654
2,Amridge University,1501+,1500+,,,,6870.0,Private not-for-profit,,,
3,University of Alabama in Huntsville,1501+,1500+,2054.0,1656.0,651.0,9192.0,Public,23466.0,35780.0,0.316943
4,Alabama State University,1501+,1500+,10245.0,5251.0,1479.0,8720.0,Public,18286.0,25222.0,0.144363


In [5]:
def modifyRanking(ranking):
    """Convert a ranking label into a single integer rank.

    Recognized label shapes:

    * a range such as ``'601–800'`` (en dash; an ASCII hyphen is accepted
      as well) becomes the floor of the midpoint of its bounds, e.g. ``700``;
    * an open-ended label such as ``'1501+'`` becomes its lower bound, ``1501``;
    * a plain number such as ``'42'`` becomes ``42``.

    Surrounding whitespace is ignored.

    Parameters
    ----------
    ranking : str or any
        The raw ranking label. Non-string values (for example NaN or an
        already-numeric rank) are accepted and passed through.

    Returns
    -------
    int
        The numeric rank for every recognized label. The input is returned
        unchanged when it is not a string or does not match any recognized
        shape, so a later ``pd.to_numeric`` call decides how to treat it.
    """
    # Missing values and already-numeric ranks have nothing to parse; the
    # substring checks below would raise TypeError on them.
    if not isinstance(ranking, str):
        return ranking

    text = ranking.strip()

    # Range label: only treat it as a range when both sides are whole numbers,
    # so unrelated strings that merely contain a dash are left alone.
    for dash in ('–', '-'):
        low, sep, high = text.partition(dash)
        low, high = low.strip(), high.strip()
        if sep and low.isdecimal() and high.isdecimal():
            return (int(low) + int(high)) // 2

    # Open-ended label: keep the lower bound as a number (not a string) so
    # every branch yields the same type.
    if text.endswith('+'):
        lower_bound = text[:-1].strip()
        if lower_bound.isdecimal():
            return int(lower_bound)

    if text.isdecimal():
        return int(text)

    return ranking

### Correlation Matrix for top 1500 universities

In [16]:
# Keep only institutions displayed as "top 1500" and drop the columns that
# are not used in the correlation matrix.
topUniDf = df[df['Ranking display'] == 'top 1500'] \
                .drop(columns=['Name', 
                               'Control of institution', 
                               'Ranking display', 
                               'Applicants total', 
                               'Admissions total', 
                               'Enrolled total', 
                               'Total price for in-state students living on campus'])
# Convert the rank labels to numbers by mapping over this subset's own
# 'Ranking' column. The previous row-wise apply ran over every row of `df`
# and relied on index alignment to discard the rows filtered out above.
topUniDf['Ranking'] = pd.to_numeric(topUniDf['Ranking'].map(modifyRanking))

# Pairwise Pearson correlations, rendered as a colour-graded table.
corrMatrixTopUni = topUniDf.corr(method='pearson')
corrMatrixTopUni.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Ranking,Tuition and fees,Total price for out-of-state students living on campus,Admission Percentage
Ranking,1.0,-0.443013,-0.589115,0.386431
Tuition and fees,-0.443013,1.0,0.939373,-0.781979
Total price for out-of-state students living on campus,-0.589115,0.939373,1.0,-0.777495
Admission Percentage,0.386431,-0.781979,-0.777495,1.0


### Correlation Matrix for universities ranked 1500+

In [32]:
# Institutions displayed as "1500+". 'Ranking' is dropped along with the
# other unused columns, so it does not appear in this correlation matrix.
nonTopUniDf = (
    df.loc[df['Ranking display'] == '1500+']
      .drop(
          [
              'Name',
              'Control of institution',
              'Ranking',
              'Ranking display',
              'Applicants total',
              'Admissions total',
              'Enrolled total',
              'Total price for in-state students living on campus',
          ],
          axis='columns',
      )
)

# Pairwise Pearson correlations, rendered as a colour-graded table.
corrMatrixNonTopUni = nonTopUniDf.corr(method='pearson')
corrMatrixNonTopUni.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Tuition and fees,Total price for out-of-state students living on campus,Admission Percentage
Tuition and fees,1.0,0.861053,-0.445597
Total price for out-of-state students living on campus,0.861053,1.0,-0.484636
Admission Percentage,-0.445597,-0.484636,1.0


### Correlation Matrix for all universities

In [33]:
# All institutions, with the columns not used in the correlation matrix removed.
allUniWithRankDf = df.drop(columns=['Name',
                                    'Control of institution',
                                    'Ranking display',
                                    'Applicants total', 
                                    'Admissions total', 
                                    'Enrolled total', 
                                    'Total price for in-state students living on campus'])

# Convert the rank labels to numbers with a column-wise map; a row-wise
# DataFrame.apply is unnecessary for a single-column transform and returns a
# DataFrame instead of a Series when the frame is empty.
# NOTE(review): every '1501+' label becomes the same value (1501) once the
# '+' is stripped, so correlations involving 'Ranking' treat all of those
# institutions as tied at one rank - interpret them with care.
allUniWithRankDf['Ranking'] = pd.to_numeric(allUniWithRankDf['Ranking'].map(modifyRanking))

# Pairwise Pearson correlations, rendered as a colour-graded table.
corrMatrixAllUniWithRank = allUniWithRankDf.corr(method='pearson')
corrMatrixAllUniWithRank.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Ranking,Tuition and fees,Total price for out-of-state students living on campus,Admission Percentage
Ranking,1.0,-0.119804,-0.268746,0.127065
Tuition and fees,-0.119804,1.0,0.858242,-0.472126
Total price for out-of-state students living on campus,-0.268746,0.858242,1.0,-0.514047
Admission Percentage,0.127065,-0.472126,-0.514047,1.0
