In [18]:
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt

In [19]:
# Load the raw US News table from the CSV that sits next to the notebook
dataset = pd.read_csv("usnews_dataset.csv")
print("Raw data")
# Preview the first five rows (head() defaults to n=5)
dataset.head()

Raw data


Unnamed: 0,Name,Ranking,usnews.com link,Tuition and Fees,Room and Board,Total Enrollment,School Type,Year Founded,Religiious Affiliation,Academic Calendar,...,Total undergraduate enrollment,Undergraduates who are first generation,Out-of-state students,International students,Registered clubs and organizations,Number of sports,Unnamed: 76,Peer_assessment,First_year_top_ten,Social Mobility rank
0,Williams College,1,https://www.usnews.com/best-colleges/williams-...,"$57,280",14990,2127.0,"Private, Coed",1793.0,,04-01-04,...,2073.0,22%,86%,7.80%,154.0,17.0,,4.7,89%,90.0
1,Amherst College,2,https://www.usnews.com/best-colleges/amherst-c...,"$58,640",15310,1855.0,"Private, Coed",1821.0,,Semester,...,1855.0,21%,87%,8.10%,177.0,12.0,,4.6,88%,99.0
2,Swarthmore College,3,https://www.usnews.com/best-colleges/swarthmor...,"$54,656",16088,1559.0,"Private, Coed",1864.0,,Semester,...,1559.0,27%,87%,13.20%,154.0,8.0,,4.6,90%,148.0
3,Wellesley College,3,https://www.usnews.com/best-colleges/wellesley...,"$56,052",17096,2534.0,"Private, Women's college",1870.0,,Semester,...,2534.0,17%,86%,13.60%,153.0,8.0,,4.5,83%,113.0
4,Pomona College,5,https://www.usnews.com/best-colleges/pomona-co...,"$54,762",17218,1679.0,"Private, Coed",1887.0,,Semester,...,1679.0,30%,74%,11.40%,227.0,9.0,,4.5,91%,113.0


**Filter the columns of interest and drop rows that contain null values (the row count drops from 216 to 168)**
- Name
- Ranking 
- Peer_assessment 
- Total Enrollment 
- 2018 Endowment
- Student_faculty_ratio
- Registered clubs and organizations
- Number of sports
- First_year_top_ten


In [20]:
# Choose columns of interest
columns_of_interest = [
    'Name',
    'Ranking',
    'Peer_assessment',
    'Total Enrollment',
    '2018 Endowment',
    'Student_faculty_ratio',
    'Registered clubs and organizations',
    'Number of sports',
    'First_year_top_ten',
]
# Drop rows which contain N/A values in any of the selected columns.
# The explicit .copy() makes `usnews` an independent frame, so the column
# assignments in later cells modify it directly instead of raising
# SettingWithCopyWarning on a slice of `dataset`.
usnews = dataset[columns_of_interest].dropna().copy()
# Display first 5 rows
usnews.head(5)

Unnamed: 0,Name,Ranking,Peer_assessment,Total Enrollment,2018 Endowment,Student_faculty_ratio,Registered clubs and organizations,Number of sports,First_year_top_ten
0,Williams College,1,4.7,2127.0,"$2,600,000,000.00",7:01,154.0,17.0,89%
1,Amherst College,2,4.6,1855.0,"$2,400,000,000.00",7:01,177.0,12.0,88%
2,Swarthmore College,3,4.6,1559.0,"$2,100,000,000.00",8:01,154.0,8.0,90%
3,Wellesley College,3,4.5,2534.0,"$2,100,000,000.00",8:01,153.0,8.0,83%
4,Pomona College,5,4.5,1679.0,"$2,300,000,000.00",8:01,227.0,9.0,91%


**Data cleaning and engineering**

In [21]:
# Format 3 columns: endowment, student-faculty ratio and students from top 10 in high school
def endowment_format(s):
    """Convert a currency string into a whole-dollar digit string.

    Example: "$2,600,000,000.00" -> "2600000000".

    Parameters
    ----------
    s : str
        Amount text; may carry surrounding whitespace, a leading "$",
        thousands separators (",") and a decimal part.

    Returns
    -------
    str
        The digits before the decimal point, without separators or the
        currency symbol. The caller is expected to cast it to a number.
    """
    # Remove thousands separators and discard everything after the decimal point
    s = s.strip().replace(',', '').split('.')[0]
    # Strip the currency symbol only when it is present, so a value written
    # without "$" keeps its leading digit
    return s[1:] if s.startswith('$') else s
def ratio_format(s):
    """Return the part of a ratio string before its first colon.

    Example: "7:01" -> "7". A string without a colon is returned unchanged.
    """
    before_colon, _, _ = s.partition(":")
    return before_colon
def top10_format(s):
    """Return the string without its final character.

    Example: "89%" -> "89" (the trailing character is dropped whatever it is).
    """
    last_kept = len(s) - 1
    return s[:last_kept]

# Run each raw text column through its matching clean-up helper
text_column_formatters = {
    "2018 Endowment": endowment_format,
    "Student_faculty_ratio": ratio_format,
    "First_year_top_ten": top10_format,
}
for column_name, formatter in text_column_formatters.items():
    usnews[column_name] = usnews[column_name].apply(formatter)

# The helpers return strings, so cast the three columns from object to float
numeric_columns = list(text_column_formatters)
usnews[numeric_columns] = usnews[numeric_columns].astype('float64')

In [22]:
# Derive per-student features (endowment per student is additionally
# log-transformed) and rescale the top-ten share from percent to a 0-1 fraction.
# NOTE(review): the rescale overwrites First_year_top_ten, so running this cell
# a second time divides by 100 again; re-run from the column-selection cell instead.
enrollment = usnews['Total Enrollment']
usnews["log_endowment_per_capita"] = np.log(usnews["2018 Endowment"] / enrollment)
usnews["clubs_per_capita"] = usnews["Registered clubs and organizations"] / enrollment
usnews["sports_per_capita"] = usnews["Number of sports"] / enrollment
usnews["First_year_top_ten"] = usnews["First_year_top_ten"] / 100

In [31]:
# Data after cleaning, display 5 random rows.
# A fixed random_state keeps the displayed rows the same on every
# Restart & Run All, so the saved output stays reproducible.
usnews.sample(5, random_state=0)

Unnamed: 0,Name,Ranking,Peer_assessment,Total Enrollment,2018 Endowment,Student_faculty_ratio,Registered clubs and organizations,Number of sports,First_year_top_ten,log_endowment_per_capita,clubs_per_capita,sports_per_capita
211,University of Wisconsin Superior,213,2.0,2601.0,17200000.0,16.0,59.0,10.0,0.08,8.796769,0.022684,0.003845
179,Gordon College,181,2.4,1955.0,56800000.0,11.0,120.0,4.0,0.23,10.276901,0.061381,0.002046
31,Scripps College,33,3.9,1067.0,362000000.0,10.0,300.0,8.0,0.69,12.734549,0.281162,0.007498
55,Agnes Scott College,58,3.4,1030.0,229400000.0,10.0,79.0,1.0,0.28,12.313664,0.076699,0.000971
122,Millsaps College,124,3.0,864.0,101500000.0,9.0,60.0,1.0,1.0,11.673997,0.069444,0.001157


**Descriptive statistics**

In [24]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
usnews.describe()

Unnamed: 0,Ranking,Peer_assessment,Total Enrollment,2018 Endowment,Student_faculty_ratio,Registered clubs and organizations,Number of sports,First_year_top_ten,log_endowment_per_capita,clubs_per_capita,sports_per_capita
count,168.0,168.0,168.0,168.0,168.0,168.0,168.0,168.0,168.0,168.0,168.0
mean,91.75,3.160119,1747.744048,366430400.0,10.505952,99.797619,7.047619,0.366071,11.602095,0.061202,0.004297
std,57.402325,0.653057,797.258371,496916600.0,2.044408,58.937149,5.574546,0.230119,1.346933,0.034007,0.004967
min,1.0,1.8,337.0,109820.0,5.0,3.0,0.0,0.02,4.07451,0.007371,0.0
25%,45.75,2.7,1259.5,86025000.0,9.0,61.5,3.0,0.1975,11.066986,0.045039,0.00171
50%,89.0,3.1,1688.5,168950000.0,10.0,87.0,6.0,0.3,11.697339,0.056887,0.003469
75%,132.0,3.6,2219.25,392750000.0,11.0,120.0,10.0,0.53,12.352738,0.07117,0.005849
max,216.0,4.7,4512.0,2600000000.0,19.0,347.0,25.0,1.0,14.89432,0.281162,0.053908


In [25]:
# Column dtypes and non-null counts, to check the result of the cleaning and casting steps
usnews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168 entries, 0 to 214
Data columns (total 12 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Name                                168 non-null    object 
 1   Ranking                             168 non-null    int64  
 2   Peer_assessment                     168 non-null    float64
 3   Total Enrollment                    168 non-null    float64
 4   2018 Endowment                      168 non-null    float64
 5   Student_faculty_ratio               168 non-null    float64
 6   Registered clubs and organizations  168 non-null    float64
 7   Number of sports                    168 non-null    float64
 8   First_year_top_ten                  168 non-null    float64
 9   log_endowment_per_capita            168 non-null    float64
 10  clubs_per_capita                    168 non-null    float64
 11  sports_per_capita                   168 non-n

**Regression**

In [26]:
# OLS of the US News ranking on the five engineered predictors
ranking_formula = (
    'Ranking ~ log_endowment_per_capita + Student_faculty_ratio + '
    'First_year_top_ten + clubs_per_capita + sports_per_capita'
)
ranking_model = ols(formula=ranking_formula, data=usnews)
ranking = ranking_model.fit()
ranking.summary()

0,1,2,3
Dep. Variable:,Ranking,R-squared:,0.755
Model:,OLS,Adj. R-squared:,0.747
Method:,Least Squares,F-statistic:,99.65
Date:,"Thu, 11 Jun 2020",Prob (F-statistic):,1.39e-47
Time:,23:03:14,Log-Likelihood:,-800.27
No. Observations:,168,AIC:,1613.0
Df Residuals:,162,BIC:,1631.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,249.7210,37.701,6.624,0.000,175.272,324.170
log_endowment_per_capita,-13.7081,2.397,-5.719,0.000,-18.442,-8.975
Student_faculty_ratio,4.3301,1.568,2.762,0.006,1.235,7.426
First_year_top_ten,-134.5447,12.626,-10.656,0.000,-159.477,-109.612
clubs_per_capita,123.1705,78.651,1.566,0.119,-32.143,278.484
sports_per_capita,-629.5501,512.972,-1.227,0.222,-1642.523,383.423

0,1,2,3
Omnibus:,24.134,Durbin-Watson:,1.138
Prob(Omnibus):,0.0,Jarque-Bera (JB):,40.591
Skew:,0.733,Prob(JB):,1.53e-09
Kurtosis:,4.911,Cond. No.,3630.0


In [27]:
# OLS of the peer-assessment score on the same five predictors as the ranking model
peer_formula = (
    'Peer_assessment ~ log_endowment_per_capita + Student_faculty_ratio + '
    'First_year_top_ten + clubs_per_capita + sports_per_capita'
)
peer_model = ols(formula=peer_formula, data=usnews)
peer = peer_model.fit()
peer.summary()

0,1,2,3
Dep. Variable:,Peer_assessment,R-squared:,0.75
Model:,OLS,Adj. R-squared:,0.742
Method:,Least Squares,F-statistic:,97.15
Date:,"Thu, 11 Jun 2020",Prob (F-statistic):,6.460000000000001e-47
Time:,23:03:14,Log-Likelihood:,-49.88
No. Observations:,168,AIC:,111.8
Df Residuals:,162,BIC:,130.5
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.2354,0.433,2.853,0.005,0.380,2.091
log_endowment_per_capita,0.1192,0.028,4.330,0.000,0.065,0.174
Student_faculty_ratio,-0.0115,0.018,-0.638,0.525,-0.047,0.024
First_year_top_ten,1.9659,0.145,13.556,0.000,1.680,2.252
clubs_per_capita,-1.1707,0.903,-1.296,0.197,-2.955,0.613
sports_per_capita,3.2627,5.892,0.554,0.581,-8.372,14.898

0,1,2,3
Omnibus:,27.719,Durbin-Watson:,1.659
Prob(Omnibus):,0.0,Jarque-Bera (JB):,59.397
Skew:,-0.733,Prob(JB):,1.27e-13
Kurtosis:,5.518,Cond. No.,3630.0


**Compare the model with actual values**

In [28]:
# In-sample predictions taken directly from the fitted models.
# predict() applies each model's own formula to `usnews`, so the terms and
# coefficients can never drift out of sync with the regressions above (the
# previous hand-written sum hard-coded every predictor name and looped row by row).
#
# predict_ranking    : list of int   - predicted rank, rounded to the nearest integer
# predict_peer_score : list of float - predicted peer score, rounded to 2 decimals
# Both lists follow the row order of `usnews`.
predict_ranking = ranking.predict(usnews).round().astype(int).tolist()
predict_peer_score = peer.predict(usnews).round(2).tolist()
    

**Save the comparison between actual data and predicted data to a .csv file**

In [29]:
# Put actual and model-predicted values side by side, one row per college,
# then write the table (including the row index) to disk
comparison = pd.DataFrame({
    "Name": usnews["Name"],
    "Ranking": usnews['Ranking'],
    "Model predicted ranking": predict_ranking,
    "Peer Assessment": usnews["Peer_assessment"],
    "Model predicted peer assessment": predict_peer_score,
})
comparison.to_csv("model_predicted.csv")