In [4]:
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt

In [5]:
# Load the raw US News table (path is relative to the current working directory)
dataset = pd.read_csv("usnews_dataset.csv")
print("Raw data")
# Display first 5 rows
dataset.head(5)

Raw data


Unnamed: 0,Name,Ranking,usnews.com link,Tuition and Fees,Room and Board,Total Enrollment,School Type,Year Founded,Religiious Affiliation,Academic Calendar,...,Total undergraduate enrollment,Undergraduates who are first generation,Out-of-state students,International students,Registered clubs and organizations,Number of sports,Unnamed: 76,Peer_assessment,First_year_top_ten,Social Mobility rank
0,Williams College,1,https://www.usnews.com/best-colleges/williams-...,"$57,280",14990,2127.0,"Private, Coed",1793.0,,04-01-04,...,2073.0,22%,86%,7.80%,154.0,17.0,,4.7,89%,90.0
1,Amherst College,2,https://www.usnews.com/best-colleges/amherst-c...,"$58,640",15310,1855.0,"Private, Coed",1821.0,,Semester,...,1855.0,21%,87%,8.10%,177.0,12.0,,4.6,88%,99.0
2,Swarthmore College,3,https://www.usnews.com/best-colleges/swarthmor...,"$54,656",16088,1559.0,"Private, Coed",1864.0,,Semester,...,1559.0,27%,87%,13.20%,154.0,8.0,,4.6,90%,148.0
3,Wellesley College,3,https://www.usnews.com/best-colleges/wellesley...,"$56,052",17096,2534.0,"Private, Women's college",1870.0,,Semester,...,2534.0,17%,86%,13.60%,153.0,8.0,,4.5,83%,113.0
4,Pomona College,5,https://www.usnews.com/best-colleges/pomona-co...,"$54,762",17218,1679.0,"Private, Coed",1887.0,,Semester,...,1679.0,30%,74%,11.40%,227.0,9.0,,4.5,91%,113.0


**Filter the columns of interest and drop rows that contain null values (the dataset shrinks from 216 to 168 rows)**
- Name
- Ranking 
- Peer_assessment 
- Total Enrollment 
- 2018 Endowment
- Student_faculty_ratio
- Freshmen_retention
- First_year_top_ten
- Registered clubs and organizations *(not selected by `filter_dataset` below)*
- Number of sports *(not selected by `filter_dataset` below)*


In [27]:
def filter_dataset(dataset):
    """Select the modelling columns and discard incomplete rows.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw table; it must contain every column named in the list below.

    Returns
    -------
    pandas.DataFrame
        A new frame restricted to the selected columns, without any row that
        has a missing value in one of them. Row labels of the surviving rows
        are kept as they were in ``dataset``.
    """
    columns_of_interest = [
        'Name',
        'Ranking',
        'Peer_assessment',
        'Total Enrollment',
        '2018 Endowment',
        'Student_faculty_ratio',
        'Freshmen_retention',
        'First_year_top_ten',
    ]
    selected = dataset[columns_of_interest]
    # A row is removed as soon as any of the selected columns is N/A
    return selected.dropna(axis=0, how='any')

**Data cleaning and engineering**

In [28]:
# Format 4 columns: endowment, student-faculty ratio, first-year students from the top ten of their high-school class, and freshmen retention
def endowment_format(s):
    """Convert a currency string such as ``"$1,234,567.89"`` to whole dollars.

    Thousands separators are removed and everything after the first ``.`` is
    truncated (not rounded).

    Parameters
    ----------
    s : str
        Amount, optionally prefixed with ``$`` and surrounded by whitespace.

    Returns
    -------
    float
        The whole-dollar amount.

    Raises
    ------
    ValueError
        If what remains after cleaning is not a number.
    """
    whole_part = s.strip().replace(',', '').split('.')[0]
    # Strip the currency sign explicitly rather than slicing off the first
    # character: a value written without "$" would otherwise silently lose
    # its most significant digit.
    return float(whole_part.lstrip('$'))
def ratio_format(s):
    """Return the part of a ratio string such as ``"7:1"`` that precedes the first colon, as an int."""
    leading, _colon, _rest = s.partition(":")
    return int(leading)
def _percent_to_fraction(s):
    """Parse a percentage string such as ``"89%"`` or ``"7.80%"`` into a fraction (0.89, 0.078)."""
    # float() rather than int(): int("7.80") raises ValueError, so a single
    # fractional percentage would abort the whole cleaning step.
    return float(s.strip().rstrip('%')) / 100


def top10_format(s):
    """Convert the ``First_year_top_ten`` percentage string to a fraction.

    Parameters
    ----------
    s : str
        Percentage, e.g. ``"89%"``.

    Returns
    -------
    float
        The percentage divided by 100.
    """
    return _percent_to_fraction(s)


def retention_format(s):
    """Convert the ``Freshmen_retention`` percentage string to a fraction.

    Parameters
    ----------
    s : str
        Percentage, e.g. ``"98%"``.

    Returns
    -------
    float
        The percentage divided by 100.
    """
    return _percent_to_fraction(s)

def clean_data(usnews):
    """Turn the four text-encoded columns into numbers.

    Parameters
    ----------
    usnews : pandas.DataFrame
        Frame holding the string columns ``2018 Endowment``,
        ``Student_faculty_ratio``, ``First_year_top_ten`` and
        ``Freshmen_retention``.

    Returns
    -------
    pandas.DataFrame
        A new frame with those four columns converted by the matching
        ``*_format`` helper; all other columns are carried over unchanged.
    """
    # Work on a copy: assigning columns on the caller's frame mutates it
    # behind the caller's back and, when that frame is itself a filtered
    # slice, is what triggers pandas' SettingWithCopyWarning.
    cleaned = usnews.copy()
    converters = {
        "2018 Endowment": endowment_format,
        "Student_faculty_ratio": ratio_format,
        "First_year_top_ten": top10_format,
        "Freshmen_retention": retention_format,
    }
    for column, converter in converters.items():
        cleaned[column] = cleaned[column].apply(converter)

    return cleaned

def new_variables(usnews):
    """Add the derived regressor ``log_endowment_per_capita``.

    Parameters
    ----------
    usnews : pandas.DataFrame
        Frame with numeric ``2018 Endowment`` and ``Total Enrollment`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra column
        ``log(2018 Endowment / Total Enrollment)``; the input frame is left
        untouched.
    """
    endowment_per_capita = usnews["2018 Endowment"] / usnews['Total Enrollment']
    # .assign returns a new frame instead of writing into the caller's object
    return usnews.assign(log_endowment_per_capita=np.log(endowment_per_capita))

**Regression**

In [39]:
def regressions(data=None):
    """Fit the three OLS models used in this notebook.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame containing ``Ranking``, ``Peer_assessment`` and the four
        regressors listed below. When omitted, the notebook-level ``usnews``
        frame is used, which keeps the original zero-argument call working.

    Returns
    -------
    tuple
        ``(ranking, peer, ranking_endowment)`` fitted results:
        ``Ranking`` on all four regressors, ``Peer_assessment`` on the same
        four regressors, and ``Ranking`` on ``log_endowment_per_capita`` only.
    """
    if data is None:
        # Backward-compatible fallback to the global defined by the pipeline cell
        data = usnews

    # One shared right-hand side so the two full models cannot drift apart
    predictors = ['log_endowment_per_capita',
                  'Student_faculty_ratio',
                  'First_year_top_ten',
                  'Freshmen_retention']
    right_hand_side = ' + '.join(predictors)

    ranking = ols(formula='Ranking ~ ' + right_hand_side, data=data).fit()
    peer = ols(formula='Peer_assessment ~ ' + right_hand_side, data=data).fit()
    ranking_endowment = ols(formula='Ranking ~ log_endowment_per_capita', data=data).fit()
    return ranking, peer, ranking_endowment

**Predict**

In [51]:
def _linear_fit(model, data):
    """Evaluate ``Intercept + sum(coef * column)`` for every row of ``data``."""
    params = model.params
    fitted = np.full(len(data), params['Intercept'], dtype=float)
    # Terms are added in the order they appear in ``params`` so the
    # floating-point sum is accumulated term by term, left to right.
    for term, coefficient in params.items():
        if term != 'Intercept':
            fitted = fitted + coefficient * data[term].to_numpy()
    return fitted


def predict(regressions, data=None):
    """Compute in-sample predictions for the three fitted models.

    Parameters
    ----------
    regressions : sequence
        ``[ranking, peer, ranking_endowment]`` fitted results; each must
        expose ``params`` with an ``'Intercept'`` entry plus one entry per
        regressor column. (The name shadows the ``regressions`` function
        inside this body only; it is kept for caller compatibility.)
    data : pandas.DataFrame, optional
        Frame holding the regressor columns. When omitted, the notebook-level
        ``usnews`` frame is used, which keeps the original one-argument call
        working.

    Returns
    -------
    tuple of list
        ``(predict_ranking, predict_peer_score, predict_ranking_endowment)``:
        rankings rounded to whole numbers, peer scores rounded to two
        decimals, and endowment-only rankings rounded to whole numbers, one
        entry per row of ``data`` in row order.
    """
    if data is None:
        # Backward-compatible fallback to the global defined by the pipeline cell
        data = usnews

    ranking = regressions[0]
    peer = regressions[1]
    ranking_endowment = regressions[2]

    # Vectorised over all rows instead of a per-row Python loop
    predict_ranking = [int(value) for value in np.round(_linear_fit(ranking, data))]
    predict_peer_score = np.round(_linear_fit(peer, data), 2).tolist()
    predict_ranking_endowment = [int(value) for value in np.round(_linear_fit(ranking_endowment, data))]
    return predict_ranking, predict_peer_score, predict_ranking_endowment

**Save the comparison between actual data and predicted data to a .csv file**

In [47]:
def save(usnews, predict_ranking, predict_peer_score, predict_ranking_endowment,
         path="model_predicted_modified.csv"):
    """Write actual versus predicted values side by side to a CSV file.

    Parameters
    ----------
    usnews : pandas.DataFrame
        Frame providing the ``Name``, ``Ranking`` and ``Peer_assessment`` columns.
    predict_ranking, predict_peer_score, predict_ranking_endowment : sequence
        Predicted values, one per row of ``usnews`` in row order.
    path : str, optional
        Destination file. Defaults to the file name the notebook has always
        written, so existing four-argument calls behave as before.

    Returns
    -------
    None
        The table (including its row index) is written to ``path``.
    """
    comparison = pd.DataFrame({
        "Name": usnews["Name"],
        "Ranking": usnews['Ranking'],
        "Model predicted ranking": predict_ranking,
        "Model predicted ranking-endowment": predict_ranking_endowment,
        "Peer Assessment": usnews["Peer_assessment"],
        "Model predicted peer assessment": predict_peer_score,
    })
    comparison.to_csv(path)

**Pipeline**

In [31]:
# Raw table -> selected complete rows -> numeric columns -> derived regressor
usnews = (
    dataset
    .pipe(filter_dataset)
    .pipe(clean_data)
    .pipe(new_variables)
)

**Descriptive statistics**

In [32]:
# Summary statistics of the cleaned frame (sanity check on ranges and row count)
usnews.describe()

Unnamed: 0,Ranking,Peer_assessment,Total Enrollment,2018 Endowment,Student_faculty_ratio,Freshmen_retention,First_year_top_ten,log_endowment_per_capita
count,168.0,168.0,168.0,168.0,168.0,168.0,168.0,168.0
mean,91.75,3.160119,1747.744048,366430400.0,10.505952,0.836548,0.366071,11.602095
std,57.402325,0.653057,797.258371,496916600.0,2.044408,0.102121,0.230119,1.346933
min,1.0,1.8,337.0,109820.0,5.0,0.49,0.02,4.07451
25%,45.75,2.7,1259.5,86025000.0,9.0,0.78,0.1975,11.066986
50%,89.0,3.1,1688.5,168950000.0,10.0,0.85,0.3,11.697339
75%,132.0,3.6,2219.25,392750000.0,11.0,0.92,0.53,12.352738
max,216.0,4.7,4512.0,2600000000.0,19.0,0.98,1.0,14.89432


In [54]:
# Fit all three models once, then show the endowment-only ranking model
ranking_regression, peer_regression, ranking_endowment_regression = regressions()
ranking_endowment_regression.summary()

0,1,2,3
Dep. Variable:,Ranking,R-squared:,0.522
Model:,OLS,Adj. R-squared:,0.519
Method:,Least Squares,F-statistic:,180.9
Date:,"Sat, 13 Jun 2020",Prob (F-statistic):,2.29e-28
Time:,09:37:58,Log-Likelihood:,-856.37
No. Observations:,168,AIC:,1717.0
Df Residuals:,166,BIC:,1723.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,448.8214,26.723,16.795,0.000,396.060,501.582
log_endowment_per_capita,-30.7765,2.288,-13.451,0.000,-35.294,-26.259

0,1,2,3
Omnibus:,16.828,Durbin-Watson:,1.025
Prob(Omnibus):,0.0,Jarque-Bera (JB):,44.386
Skew:,-0.323,Prob(JB):,2.3e-10
Kurtosis:,5.434,Cond. No.,102.0


In [55]:
# Ranking regressed on all four regressors
ranking_regression.summary()

0,1,2,3
Dep. Variable:,Ranking,R-squared:,0.852
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,233.8
Date:,"Sat, 13 Jun 2020",Prob (F-statistic):,2.1200000000000001e-66
Time:,09:38:02,Log-Likelihood:,-758.05
No. Observations:,168,AIC:,1526.0
Df Residuals:,163,BIC:,1542.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,422.4986,33.405,12.648,0.000,356.536,488.461
log_endowment_per_capita,-9.0680,1.852,-4.897,0.000,-12.724,-5.412
Student_faculty_ratio,3.0216,1.169,2.584,0.011,0.713,5.330
First_year_top_ten,-60.7982,11.853,-5.130,0.000,-84.203,-37.394
Freshmen_retention,-280.9516,26.595,-10.564,0.000,-333.466,-228.437

0,1,2,3
Omnibus:,11.69,Durbin-Watson:,1.742
Prob(Omnibus):,0.003,Jarque-Bera (JB):,14.653
Skew:,0.468,Prob(JB):,0.000658
Kurtosis:,4.103,Cond. No.,346.0


In [56]:
# Peer assessment regressed on the same four regressors
peer_regression.summary()

0,1,2,3
Dep. Variable:,Peer_assessment,R-squared:,0.79
Model:,OLS,Adj. R-squared:,0.784
Method:,Least Squares,F-statistic:,153.0
Date:,"Sat, 13 Jun 2020",Prob (F-statistic):,4.31e-54
Time:,09:38:21,Log-Likelihood:,-35.341
No. Observations:,168,AIC:,80.68
Df Residuals:,163,BIC:,96.3
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0562,0.452,-0.124,0.901,-0.950,0.837
log_endowment_per_capita,0.0835,0.025,3.329,0.001,0.034,0.133
Student_faculty_ratio,0.0002,0.016,0.011,0.991,-0.031,0.031
First_year_top_ten,1.4177,0.161,8.832,0.000,1.101,1.735
Freshmen_retention,2.0643,0.360,5.731,0.000,1.353,2.776

0,1,2,3
Omnibus:,24.364,Durbin-Watson:,1.838
Prob(Omnibus):,0.0,Jarque-Bera (JB):,38.559
Skew:,-0.767,Prob(JB):,4.24e-09
Kurtosis:,4.777,Cond. No.,346.0


In [52]:
# In-sample predictions from the three fitted models (order matters: ranking, peer, endowment-only)
predict_ranking, predict_peer_score, predict_ranking_endowment = \
                predict([ranking_regression, peer_regression, ranking_endowment_regression])

# Write actual vs. predicted values to model_predicted_modified.csv
save(usnews, predict_ranking, predict_peer_score, predict_ranking_endowment)