In [1]:
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt

In [15]:
# Read the raw CSV export into a DataFrame and preview its first rows.
dataset = pd.read_csv(filepath_or_buffer="usnews_dataset.csv")
dataset.head(n=5)

Unnamed: 0,Name,Ranking,usnews.com link,Tuition and Fees,Room and Board,Total Enrollment,School Type,Year Founded,Religiious Affiliation,Academic Calendar,...,Total undergraduate enrollment,Undergraduates who are first generation,Out-of-state students,International students,Registered clubs and organizations,Number of sports,Unnamed: 76,Peer assessment score (/5),First-year student in top 10 percent of high school class,Social Mobility rank
0,Williams College,1,https://www.usnews.com/best-colleges/williams-...,"$57,280",14990,2127.0,"Private, Coed",1793.0,,04-01-04,...,2073.0,22%,86%,7.80%,154.0,17.0,,4.7,89%,90.0
1,Amherst College,2,https://www.usnews.com/best-colleges/amherst-c...,"$58,640",15310,1855.0,"Private, Coed",1821.0,,Semester,...,1855.0,21%,87%,8.10%,177.0,12.0,,4.6,88%,99.0
2,Swarthmore College,3,https://www.usnews.com/best-colleges/swarthmor...,"$54,656",16088,1559.0,"Private, Coed",1864.0,,Semester,...,1559.0,27%,87%,13.20%,154.0,8.0,,4.6,90%,148.0
3,Wellesley College,3,https://www.usnews.com/best-colleges/wellesley...,"$56,052",17096,2534.0,"Private, Women's college",1870.0,,Semester,...,2534.0,17%,86%,13.60%,153.0,8.0,,4.5,83%,113.0
4,Pomona College,5,https://www.usnews.com/best-colleges/pomona-co...,"$54,762",17218,1679.0,"Private, Coed",1887.0,,Semester,...,1679.0,30%,74%,11.40%,227.0,9.0,,4.5,91%,113.0


Select the columns of interest and drop rows with missing values:



In [76]:
# Keep only the columns used in the analysis and drop every row that is
# missing any of them. The trailing .copy() makes `usnews` an independent
# frame: later cells assign new values to its columns, and assigning into the
# result of a chained selection can raise pandas' SettingWithCopyWarning.
usnews = dataset[
    [
        'Name',
        'Ranking',
        'Peer assessment score (/5)',
        'Total Enrollment',
        '2018 Endowment',
        'Student-faculty ratio',
        'Registered clubs and organizations',
        'Number of sports',
        'First-year student in top 10 percent of high school class',
    ]
].dropna().copy()
usnews.head()

Unnamed: 0,Name,Ranking,Peer assessment score (/5),Total Enrollment,2018 Endowment,Student-faculty ratio,Registered clubs and organizations,Number of sports,First-year student in top 10 percent of high school class
0,Williams College,1,4.7,2127.0,"$2,600,000,000.00",7:01,154.0,17.0,89%
1,Amherst College,2,4.6,1855.0,"$2,400,000,000.00",7:01,177.0,12.0,88%
2,Swarthmore College,3,4.6,1559.0,"$2,100,000,000.00",8:01,154.0,8.0,90%
3,Wellesley College,3,4.5,2534.0,"$2,100,000,000.00",8:01,153.0,8.0,83%
4,Pomona College,5,4.5,1679.0,"$2,300,000,000.00",8:01,227.0,9.0,91%


**Clean data**

In [77]:
def endowment_format(s):
    """Reduce a currency string to its whole-number digits.

    Thousands separators are removed, anything from the first decimal point
    onward is discarded, and a leading "$" is stripped if present.
    For example, "$2,600,000,000.00" becomes "2600000000".

    Parameters
    ----------
    s : str
        Currency text, with or without a leading dollar sign.

    Returns
    -------
    str
        The whole-number part as a digit string (still a str; the caller
        converts it to a numeric dtype).
    """
    whole_part = s.strip().replace(',', '').split('.')[0]
    # Only drop the first character when it really is the currency symbol;
    # slicing unconditionally would silently eat the leading digit of a value
    # that has no "$" prefix.
    if whole_part.startswith('$'):
        return whole_part[1:]
    return whole_part
def ratio_format(s):
    """Return the portion of ``s`` that precedes the first colon.

    For example, "7:01" yields "7". A string without a colon is returned
    unchanged.
    """
    leading, _, _ = s.partition(":")
    return leading
def top10_format(s):
    """Strip a trailing percent sign from a percentage string.

    For example, "89%" becomes "89". A value that does not end in "%" is
    returned as-is (apart from surrounding whitespace being trimmed).

    Parameters
    ----------
    s : str
        Percentage text such as "89%".

    Returns
    -------
    str
        The text without the percent sign (still a str; the caller converts
        it to a numeric dtype).
    """
    text = s.strip()
    # Only drop the last character when it really is a percent sign; slicing
    # unconditionally would silently truncate a value that has no "%" suffix.
    if text.endswith('%'):
        return text[:-1]
    return text

# Pair each text-formatted column with the helper that strips its formatting.
column_cleaners = {
    "2018 Endowment": endowment_format,
    "Student-faculty ratio": ratio_format,
    "First-year student in top 10 percent of high school class": top10_format,
}
for column, cleaner in column_cleaners.items():
    usnews[column] = usnews[column].apply(cleaner)

# The helpers return strings, so convert the cleaned columns to floats together.
cleaned_columns = list(column_cleaners)
usnews[cleaned_columns] = usnews[cleaned_columns].astype('float64')

In [78]:
# Preview the first rows of the frame now that the text columns are numeric.
usnews.head(n=5)

Unnamed: 0,Name,Ranking,Peer assessment score (/5),Total Enrollment,2018 Endowment,Student-faculty ratio,Registered clubs and organizations,Number of sports,First-year student in top 10 percent of high school class
0,Williams College,1,4.7,2127.0,2600000000.0,7.0,154.0,17.0,89.0
1,Amherst College,2,4.6,1855.0,2400000000.0,7.0,177.0,12.0,88.0
2,Swarthmore College,3,4.6,1559.0,2100000000.0,8.0,154.0,8.0,90.0
3,Wellesley College,3,4.5,2534.0,2100000000.0,8.0,153.0,8.0,83.0
4,Pomona College,5,4.5,1679.0,2300000000.0,8.0,227.0,9.0,91.0


**Descriptive statistics**

In [38]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
# NOTE(review): the saved output lists only five columns, while the cleaned
# endowment / ratio / top-10% columns are float64 in the info() output further
# down — this output appears to predate the cleaning cell; re-run to refresh.
usnews.describe()

Unnamed: 0,Ranking,Peer assessment score (/5),Total Enrollment,Registered clubs and organizations,Number of sports
count,168.0,168.0,168.0,168.0,168.0
mean,91.75,3.160119,1747.744048,99.797619,7.047619
std,57.402325,0.653057,797.258371,58.937149,5.574546
min,1.0,1.8,337.0,3.0,0.0
25%,45.75,2.7,1259.5,61.5,3.0
50%,89.0,3.1,1688.5,87.0,6.0
75%,132.0,3.6,2219.25,120.0,10.0
max,216.0,4.7,4512.0,347.0,25.0


In [73]:
# Show each column's dtype and non-null count to confirm the cleaned columns
# are numeric and no missing values remain.
usnews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168 entries, 0 to 214
Data columns (total 9 columns):
 #   Column                                                     Non-Null Count  Dtype  
---  ------                                                     --------------  -----  
 0   Name                                                       168 non-null    object 
 1   Ranking                                                    168 non-null    int64  
 2   Peer assessment score (/5)                                 168 non-null    float64
 3   Total Enrollment                                           168 non-null    float64
 4   2018 Endowment                                             168 non-null    float64
 5   Student-faculty ratio                                      168 non-null    float64
 6   Registered clubs and organizations                         168 non-null    float64
 7   Number of sports                                           168 non-null    float64
 8   First-year 

**Create new variables**

In [79]:
# Derive per-student measures by dividing each quantity by total enrollment.
# The endowment figure is additionally put on a natural-log scale.
usnews["log(endowment per capita)"] = np.log(
    usnews["2018 Endowment"].div(usnews['Total Enrollment'])
)
usnews["# clubs per capita"] = usnews["Registered clubs and organizations"].div(
    usnews['Total Enrollment']
)
usnews["# sports per capita"] = usnews["Number of sports"].div(
    usnews['Total Enrollment']
)

**Regression**