In [30]:
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt

In [31]:
# Read usnews_dataset.csv (path is relative to the working directory) into `dataset`
dataset = pd.read_csv("usnews_dataset.csv")
print("Raw data")
# Preview the first five rows; head() defaults to n=5
dataset.head()

Raw data


Unnamed: 0,Name,Ranking,usnews.com link,Tuition and Fees,Room and Board,Total Enrollment,School Type,Year Founded,Religiious Affiliation,Academic Calendar,...,Total undergraduate enrollment,Undergraduates who are first generation,Out-of-state students,International students,Registered clubs and organizations,Number of sports,Unnamed: 76,Peer_assessment,First_year_top_ten,Social Mobility rank
0,Williams College,1,https://www.usnews.com/best-colleges/williams-...,"$57,280",14990,2127.0,"Private, Coed",1793.0,,04-01-04,...,2073.0,22%,86%,7.80%,154.0,17.0,,4.7,89%,90.0
1,Amherst College,2,https://www.usnews.com/best-colleges/amherst-c...,"$58,640",15310,1855.0,"Private, Coed",1821.0,,Semester,...,1855.0,21%,87%,8.10%,177.0,12.0,,4.6,88%,99.0
2,Swarthmore College,3,https://www.usnews.com/best-colleges/swarthmor...,"$54,656",16088,1559.0,"Private, Coed",1864.0,,Semester,...,1559.0,27%,87%,13.20%,154.0,8.0,,4.6,90%,148.0
3,Wellesley College,3,https://www.usnews.com/best-colleges/wellesley...,"$56,052",17096,2534.0,"Private, Women's college",1870.0,,Semester,...,2534.0,17%,86%,13.60%,153.0,8.0,,4.5,83%,113.0
4,Pomona College,5,https://www.usnews.com/best-colleges/pomona-co...,"$54,762",17218,1679.0,"Private, Coed",1887.0,,Semester,...,1679.0,30%,74%,11.40%,227.0,9.0,,4.5,91%,113.0


**Filter the columns of interest and drop rows that contain null values (the row count drops from 216 to 168)**
- Name
- Ranking
- Peer_assessment
- Total Enrollment
- 2018 Endowment
- Student_faculty_ratio
- Freshmen_retention
- First_year_top_ten
- Acceptance_rate


In [32]:
def filter_dataset(dataset):
    """Keep only the modelling columns and discard incomplete rows.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw table; it must contain every column listed below.

    Returns
    -------
    pandas.DataFrame
        The selected columns, restricted to rows that have no missing
        value in any of them.
    """
    wanted_columns = [
        'Name',
        'Ranking',
        'Peer_assessment',
        'Total Enrollment',
        '2018 Endowment',
        'Student_faculty_ratio',
        'Freshmen_retention',
        'First_year_top_ten',
        'Acceptance_rate',
    ]
    selected = dataset[wanted_columns]
    # A single missing value in any kept column removes the whole row
    return selected.dropna()

**Data cleaning and engineering**

In [33]:
# Parsers for three kinds of text column: endowment amounts, student-faculty ratios and percentages
def endowment_format(s):
    """Convert an endowment string to a float holding its whole-number part.

    Commas are removed, everything from the first '.' onward is discarded,
    and the first remaining character is skipped (presumably a currency
    symbol such as '$' -- confirm against the data).
    """
    without_commas = s.replace(',', '')
    whole_part, _, _ = without_commas.partition('.')
    return float(whole_part[1:])
def ratio_format(s):
    """Return the part of a ratio string before the first ':' as an int, e.g. "7:1" -> 7."""
    leading, _, _ = s.partition(":")
    return int(leading)
def percentage_format(s):
    """Convert a percentage string such as "89%" or "7.80%" to a fraction.

    Surrounding whitespace is ignored and a single trailing '%' is removed
    when present; the remaining number is divided by 100.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    text = s.strip()
    if text.endswith("%"):
        text = text[:-1]
    # float() rather than int(): some columns hold decimal percentages ("7.80%")
    return float(text) / 100

def clean_data(usnews):
    """Parse the text columns of the filtered table into numbers.

    The input frame is left untouched; a converted copy is returned.

    Parameters
    ----------
    usnews : pandas.DataFrame
        Frame whose "2018 Endowment", "Student_faculty_ratio",
        "First_year_top_ten", "Freshmen_retention" and "Acceptance_rate"
        columns still hold strings.

    Returns
    -------
    pandas.DataFrame
        Copy of ``usnews`` with those five columns converted by
        ``endowment_format``, ``ratio_format`` and ``percentage_format``.
    """
    # Work on a copy so the caller's frame is never modified in place
    cleaned = usnews.copy()

    cleaned["2018 Endowment"] = cleaned["2018 Endowment"].apply(endowment_format)
    cleaned["Student_faculty_ratio"] = cleaned["Student_faculty_ratio"].apply(ratio_format)
    for column in ("First_year_top_ten", "Freshmen_retention", "Acceptance_rate"):
        cleaned[column] = cleaned[column].apply(percentage_format)

    return cleaned

def new_variables(usnews):
    """Add the endowment-per-student features used by the regressions.

    The input frame is left untouched; an extended copy is returned.

    Parameters
    ----------
    usnews : pandas.DataFrame
        Frame with numeric "2018 Endowment" and "Total Enrollment" columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``usnews`` with two extra columns:
        "endowment_per_capita" (endowment divided by total enrollment) and
        "log_endowment_per_capita" (its natural logarithm).
    """
    # Work on a copy so the caller's frame is never modified in place
    enriched = usnews.copy()
    enriched["endowment_per_capita"] = enriched["2018 Endowment"] / enriched['Total Enrollment']
    enriched["log_endowment_per_capita"] = np.log(enriched["endowment_per_capita"])
    return enriched

**Regression**

In [34]:
def regressions(data=None):
    """Fit the four OLS models compared in this notebook.

    Two responses (``Ranking`` and ``Peer_assessment``) are each regressed
    on the full predictor set and on ``log_endowment_per_capita`` alone.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame holding the response and predictor columns. When omitted,
        the notebook-level ``usnews`` frame is used.

    Returns
    -------
    tuple
        ``(ranking, peer, ranking_endowment, peer_endowment)`` -- the
        fitted results, in that order.
    """
    if data is None:
        # Fall back to the notebook-level frame so `regressions()` keeps working
        data = usnews

    # Single source of truth for the full-model right-hand side
    full_predictors = ' + '.join([
        'log_endowment_per_capita',
        'Student_faculty_ratio',
        'First_year_top_ten',
        'Acceptance_rate',
        'Freshmen_retention',
    ])
    endowment_only = 'log_endowment_per_capita'

    def fit(response, predictors):
        # Build "<response> ~ <predictors>" and fit it on `data`
        return ols(formula=response + ' ~ ' + predictors, data=data).fit()

    ranking = fit('Ranking', full_predictors)
    peer = fit('Peer_assessment', full_predictors)
    ranking_endowment = fit('Ranking', endowment_only)
    peer_endowment = fit('Peer_assessment', endowment_only)
    return ranking, peer, ranking_endowment, peer_endowment

**Predict**

In [35]:
def predict(regressions, data=None):
    """Compute in-sample predictions from the four fitted models.

    Parameters
    ----------
    regressions : sequence
        Fitted results indexed as ``[ranking, peer, ranking_endowment,
        peer_endowment]``; each must provide ``predict(data)``.
    data : pandas.DataFrame, optional
        Rows to predict for. When omitted, the notebook-level ``usnews``
        frame is used.

    Returns
    -------
    tuple of list
        ``(predict_ranking, predict_peer_score, predict_ranking_endowment,
        predict_peer_endowment)``. Ranking predictions are rounded to the
        nearest integer; peer-assessment predictions to two decimals.
    """
    if data is None:
        # Fall back to the notebook-level frame so `predict(models)` keeps working
        data = usnews

    ranking = regressions[0]
    peer = regressions[1]
    ranking_endowment = regressions[2]
    peer_endowment = regressions[3]

    def rounded_predictions(model, ndigits=None):
        # Delegating to the fitted model uses every term in its formula.
        # The previous hand-written sum left out Acceptance_rate.
        return [round(float(value), ndigits) for value in model.predict(data)]

    predict_ranking = rounded_predictions(ranking)
    predict_peer_score = rounded_predictions(peer, 2)
    predict_ranking_endowment = rounded_predictions(ranking_endowment)
    predict_peer_endowment = rounded_predictions(peer_endowment, 2)
    return predict_ranking, predict_peer_score, predict_ranking_endowment, predict_peer_endowment

**Save the comparison between actual and predicted values to a .csv file**

In [36]:
def save(usnews, predict_ranking, predict_peer_score, predict_ranking_endowment, predict_peer_endowment,
         path="model_predicted_modified.csv"):
    """Write actual and predicted values side by side to a CSV file.

    Parameters
    ----------
    usnews : pandas.DataFrame
        Frame providing the "Name", "Ranking" and "Peer_assessment" columns.
    predict_ranking, predict_peer_score : sequence
        Full-model predictions, one entry per row of ``usnews``.
    predict_ranking_endowment, predict_peer_endowment : sequence
        Endowment-only model predictions, one entry per row of ``usnews``.
    path : str, optional
        Destination file; defaults to "model_predicted_modified.csv".

    The index of ``usnews`` is written as the first CSV column.
    """
    comparison = pd.DataFrame({
        "Name": usnews["Name"],
        "Actual Ranking": usnews['Ranking'],
        "Model predicted ranking": predict_ranking,
        "Model predicted ranking by log_endowment_per_capita": predict_ranking_endowment,
        "Actual Peer Assessment": usnews["Peer_assessment"],
        "Model predicted peer assessment": predict_peer_score,
        "Model predicted peer_assessment by log_endowment_per_capita": predict_peer_endowment,
    })
    comparison.to_csv(path)

**Pipeline**

In [37]:
# Run the three preparation stages in order: select columns, parse text fields, derive features
usnews = (
    dataset
    .pipe(filter_dataset)
    .pipe(clean_data)
    .pipe(new_variables)
)

**Predict and save the comparison**

In [38]:
# Fit the models in this cell so it runs on a fresh kernel (Restart & Run All)
# instead of relying on variables assigned further down the notebook.
ranking_regression, peer_regression, ranking_endowment_regression, peer_endowment_regression = regressions()

predict_ranking, predict_peer_score, predict_ranking_endowment, predict_peer_endowment = \
                predict([ranking_regression, peer_regression, ranking_endowment_regression, peer_endowment_regression])

# Write actual vs. predicted values to the CSV file
save(usnews, predict_ranking, predict_peer_score, predict_ranking_endowment, predict_peer_endowment)
print("Saved")

Saved


**Descriptive statistics**

In [39]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of the prepared frame
usnews.describe()

Unnamed: 0,Ranking,Peer_assessment,Total Enrollment,2018 Endowment,Student_faculty_ratio,Freshmen_retention,First_year_top_ten,Acceptance_rate,endowment_per_capita,log_endowment_per_capita
count,168.0,168.0,168.0,168.0,168.0,168.0,168.0,168.0,168.0,168.0
mean,91.75,3.160119,1747.744048,366430400.0,10.505952,0.836548,0.366071,0.576429,219488.5,11.602095
std,57.402325,0.653057,797.258371,496916600.0,2.044408,0.102121,0.230119,0.23087,330965.7,1.346933
min,1.0,1.8,337.0,109820.0,5.0,0.49,0.02,0.08,58.82164,4.07451
25%,45.75,2.7,1259.5,86025000.0,9.0,0.78,0.1975,0.405,64025.69,11.066986
50%,89.0,3.1,1688.5,168950000.0,10.0,0.85,0.3,0.635,120251.4,11.697339
75%,132.0,3.6,2219.25,392750000.0,11.0,0.92,0.53,0.74,231593.5,12.352738
max,216.0,4.7,4512.0,2600000000.0,19.0,0.98,1.0,1.0,2941176.0,14.89432


In [40]:
# Fit all four models, then display the Ranking ~ log_endowment_per_capita fit
(
    ranking_regression,
    peer_regression,
    ranking_endowment_regression,
    peer_endowment_regression,
) = regressions()
print('Ranking vs. log endowment result')
ranking_endowment_regression.summary()

Ranking vs. log endowment result


0,1,2,3
Dep. Variable:,Ranking,R-squared:,0.522
Model:,OLS,Adj. R-squared:,0.519
Method:,Least Squares,F-statistic:,180.9
Date:,"Wed, 17 Jun 2020",Prob (F-statistic):,2.29e-28
Time:,14:18:51,Log-Likelihood:,-856.37
No. Observations:,168,AIC:,1717.0
Df Residuals:,166,BIC:,1723.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,448.8214,26.723,16.795,0.000,396.060,501.582
log_endowment_per_capita,-30.7765,2.288,-13.451,0.000,-35.294,-26.259

0,1,2,3
Omnibus:,16.828,Durbin-Watson:,1.025
Prob(Omnibus):,0.0,Jarque-Bera (JB):,44.386
Skew:,-0.323,Prob(JB):,2.3e-10
Kurtosis:,5.434,Cond. No.,102.0


In [41]:
# Display the fit of Peer_assessment on log_endowment_per_capita alone
print('Peer assessment vs. log endowment result')
peer_endowment_regression.summary()

Peer assessment vs. log endowment result


0,1,2,3
Dep. Variable:,Peer_assessment,R-squared:,0.433
Model:,OLS,Adj. R-squared:,0.429
Method:,Least Squares,F-statistic:,126.7
Date:,"Wed, 17 Jun 2020",Prob (F-statistic):,3.33e-22
Time:,14:18:51,Log-Likelihood:,-118.65
No. Observations:,168,AIC:,241.3
Df Residuals:,166,BIC:,247.5
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.5410,0.331,-1.635,0.104,-1.195,0.112
log_endowment_per_capita,0.3190,0.028,11.257,0.000,0.263,0.375

0,1,2,3
Omnibus:,7.599,Durbin-Watson:,1.162
Prob(Omnibus):,0.022,Jarque-Bera (JB):,13.634
Skew:,-0.079,Prob(JB):,0.00109
Kurtosis:,4.387,Cond. No.,102.0


In [42]:
# Display the fit of Ranking on the full predictor set (endowment, ratio, top-ten share, acceptance, retention)
print('Ranking vs. other factors result')
ranking_regression.summary()

Ranking vs. other factors result


0,1,2,3
Dep. Variable:,Ranking,R-squared:,0.861
Model:,OLS,Adj. R-squared:,0.857
Method:,Least Squares,F-statistic:,201.1
Date:,"Wed, 17 Jun 2020",Prob (F-statistic):,1.4999999999999998e-67
Time:,14:18:51,Log-Likelihood:,-752.39
No. Observations:,168,AIC:,1517.0
Df Residuals:,162,BIC:,1536.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,379.3777,34.847,10.887,0.000,310.564,448.191
log_endowment_per_capita,-8.1229,1.818,-4.469,0.000,-11.713,-4.533
Student_faculty_ratio,2.9572,1.134,2.607,0.010,0.718,5.197
First_year_top_ten,-34.9256,13.836,-2.524,0.013,-62.247,-7.604
Acceptance_rate,38.8100,11.550,3.360,0.001,16.002,61.618
Freshmen_retention,-279.7671,25.795,-10.846,0.000,-330.706,-228.829

0,1,2,3
Omnibus:,8.932,Durbin-Watson:,1.713
Prob(Omnibus):,0.011,Jarque-Bera (JB):,10.064
Skew:,0.41,Prob(JB):,0.00653
Kurtosis:,3.874,Cond. No.,361.0


In [43]:
# Display the fit of Peer_assessment on the full predictor set (endowment, ratio, top-ten share, acceptance, retention)
print('Peer assessment vs. other factors result')
peer_regression.summary()

Peer assessment vs. other factors result


0,1,2,3
Dep. Variable:,Peer_assessment,R-squared:,0.824
Model:,OLS,Adj. R-squared:,0.818
Method:,Least Squares,F-statistic:,151.4
Date:,"Wed, 17 Jun 2020",Prob (F-statistic):,3.7000000000000004e-59
Time:,14:18:51,Log-Likelihood:,-20.503
No. Observations:,168,AIC:,53.01
Df Residuals:,162,BIC:,71.75
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.8644,0.447,1.934,0.055,-0.018,1.747
log_endowment_per_capita,0.0633,0.023,2.716,0.007,0.017,0.109
Student_faculty_ratio,0.0015,0.015,0.107,0.915,-0.027,0.030
First_year_top_ten,0.8653,0.177,4.877,0.000,0.515,1.216
Acceptance_rate,-0.8286,0.148,-5.595,0.000,-1.121,-0.536
Freshmen_retention,2.0391,0.331,6.164,0.000,1.386,2.692

0,1,2,3
Omnibus:,22.357,Durbin-Watson:,1.94
Prob(Omnibus):,0.0,Jarque-Bera (JB):,33.561
Skew:,-0.732,Prob(JB):,5.15e-08
Kurtosis:,4.628,Cond. No.,361.0
