In [4]:
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt

In [5]:
dataset = pd.read_csv("usnews_dataset.csv")
print("Raw data")
# Display first 5 rows
dataset.head(5)

Raw data


Unnamed: 0,Name,Ranking,usnews.com link,Tuition and Fees,Room and Board,Total Enrollment,School Type,Year Founded,Religiious Affiliation,Academic Calendar,...,Total undergraduate enrollment,Undergraduates who are first generation,Out-of-state students,International students,Registered clubs and organizations,Number of sports,Unnamed: 76,Peer_assessment,First_year_top_ten,Social Mobility rank
0,Williams College,1,https://www.usnews.com/best-colleges/williams-...,"$57,280",14990,2127.0,"Private, Coed",1793.0,,04-01-04,...,2073.0,22%,86%,7.80%,154.0,17.0,,4.7,89%,90.0
1,Amherst College,2,https://www.usnews.com/best-colleges/amherst-c...,"$58,640",15310,1855.0,"Private, Coed",1821.0,,Semester,...,1855.0,21%,87%,8.10%,177.0,12.0,,4.6,88%,99.0
2,Swarthmore College,3,https://www.usnews.com/best-colleges/swarthmor...,"$54,656",16088,1559.0,"Private, Coed",1864.0,,Semester,...,1559.0,27%,87%,13.20%,154.0,8.0,,4.6,90%,148.0
3,Wellesley College,3,https://www.usnews.com/best-colleges/wellesley...,"$56,052",17096,2534.0,"Private, Women's college",1870.0,,Semester,...,2534.0,17%,86%,13.60%,153.0,8.0,,4.5,83%,113.0
4,Pomona College,5,https://www.usnews.com/best-colleges/pomona-co...,"$54,762",17218,1679.0,"Private, Coed",1887.0,,Semester,...,1679.0,30%,74%,11.40%,227.0,9.0,,4.5,91%,113.0


**Filter the columns of interest + drop rows that contains null values (it drops from 216 down to 168)**
- Name
- Ranking 
- Peer_assessment 
- Total Enrollment 
- 2018 Endowment
- Student_faculty_ratio
- Registered clubs and organizations
- Number of sports
- First_year_top_ten


In [7]:
def filter_dataset(dataset):
    usnews = dataset[['Name', \
                      'Ranking', \
                      'Peer_assessment', \
                      'Total Enrollment', \
                      '2018 Endowment', \
                      'Student_faculty_ratio', \
                      'Registered clubs and organizations', \
                      'Number of sports','First_year_top_ten']]
    # Drop rows which contains N/A values
    return usnews.dropna()

**Data cleaning and engineering**

In [8]:
# Format 3 rows: endowment, student-faculty ratio and students from top 10 in high school
def endowment_format(s):
    s = s.replace(',','').split('.')[0]
    return float(s[1:])
def ratio_format(s):
    return int(s.split(":")[0])
def top10_format(s):
    return int(s[:-1])/100
def retention_format(s):
    return int(s[:-1])/100

def clean_data(usnews)
    usnews["2018 Endowment"] = usnews["2018 Endowment"].apply(endowment_format)
    usnews["Student_faculty_ratio"] = usnews["Student_faculty_ratio"].apply(ratio_format)
    usnews["First_year_top_ten"] = usnews["First_year_top_ten"].apply(top10_format)

    # Change their type from object (string) to float
#     usnews[["2018 Endowment", "Student_faculty_ratio", "First_year_top_ten"]] = usnews[["2018 Endowment", "Student_faculty_ratio", "First_year_top_ten"]].astype('float64')
    return usnews

SyntaxError: invalid syntax (<ipython-input-8-86fc6c0b4257>, line 12)

In [None]:
# Create new data: log_endowment_per_capita, clubs_per_capita, sports_per_capita
def new_variables(usnews):
    usnews["log_endowment_per_capita"] = np.log(usnews["2018 Endowment"]/usnews['Total Enrollment'])
    usnews["clubs_per_capita"] = (usnews["Registered clubs and organizations"]/usnews['Total Enrollment'])
    usnews["sports_per_capita"] = (usnews["Number of sports"]/usnews['Total Enrollment'])
#     usnews["First_year_top_ten"] = (usnews["First_year_top_ten"]/100)
    return usnews

In [None]:
# Data after cleaning, display 5 random rows
usnews.sample(5)

**Descriptive statistics**

In [None]:
usnews.describe()

In [None]:
usnews.info()

**Regression**

In [None]:
def regressions():
    ranking = ols(formula = 'Ranking ~ log_endowment_per_capita + Student_faculty_ratio + First_year_top_ten + clubs_per_capita + sports_per_capita', data = usnews).fit()
    peer = ols(formula = 'Peer_assessment ~ log_endowment_per_capita + Student_faculty_ratio + First_year_top_ten + clubs_per_capita + sports_per_capita', data = usnews).fit()
    return ranking, peer

**Compare the model with actual values**

In [9]:
def compare():
    predict_ranking = []
    predict_peer_score = []
    variables = ['log_endowment_per_capita', \
                 'Student_faculty_ratio', \
                 'First_year_top_ten', \
                 'clubs_per_capita', \
                 'sports_per_capita']
    for i in range(len(usnews)):
        predict_ranking.append(round(
                               ranking.params['Intercept']+ \
                               ranking.params['log_endowment_per_capita']*usnews['log_endowment_per_capita'].values[i] + \
                               ranking.params['Student_faculty_ratio']*usnews['Student_faculty_ratio'].values[i] + \
                               ranking.params['First_year_top_ten']*usnews['First_year_top_ten'].values[i] + \
                               ranking.params['clubs_per_capita']*usnews['clubs_per_capita'].values[i] + \
                               ranking.params['sports_per_capita']*usnews['sports_per_capita'].values[i]))
        predict_peer_score.append(round(
                                  peer.params['Intercept']+ \
                                  peer.params['log_endowment_per_capita']*usnews['log_endowment_per_capita'].values[i] + \
                                  peer.params['Student_faculty_ratio']*usnews['Student_faculty_ratio'].values[i] + \
                                  peer.params['First_year_top_ten']*usnews['First_year_top_ten'].values[i] + \
                                  peer.params['clubs_per_capita']*usnews['clubs_per_capita'].values[i] + \
                                  peer.params['sports_per_capita']*usnews['sports_per_capita'].values[i],2))
    return predict_ranking, predict_peer_score

**Save the comparasion between actual data and predicted data to .csv file**

In [10]:
def save():
    (pd.DataFrame({"Name": usnews["Name"], \
                   "Ranking":usnews['Ranking'], \
                   "Model predicted ranking":predict_ranking, \
                   "Peer Assessment":usnews["Peer_assessment"], \
                   "Model predicted peer assessment":predict_peer_score})).to_csv("model_predicted.csv")

In [11]:
usnews = filter_dataset(datase)
usnews = clean_data(usnews)
usnews = new_variable(usnews)
ranking, peer = regressions()
ranking.summary()


NameError: name 'usnews' is not defined