In [None]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd

# Comparing Models to Predict March Madness Rankings


March Madness, also known as the NCAA Division I Men's Basketball Tournament happens annually in the month of March. Depending on which teams performed the best in the season, the top 32 are selected to compete in the tournament and play each other in a bracket for the winners trophy. Although March Madness 2018 is already over, our team wanted to see which model would do a better job at predicting team rankings/winners. We decided to look at the Elo Model as well as use linear regression with features we extracted in order to find a trend. 

### Scraping the Data

In order to scrape data, we used www.sports-reference.com/cbb (cbb = college basketball). We utilized the BeautifulSoup library to extract the features that we thought we would need for both models. In the following code the features extracted are in the _featuresWanted_ set. A typical page that we would scrape from looks like the following: 


<img src="files/cbbstatsex.png">

This data displays Villanova's game history for the year 2018 [found here](https://www.sports-reference.com/cbb/schools/villanova/2018-schedule.html). We used Beautiful Soup to gather all the table data and format it in a data frame. Because the scraping usually takes ~10 minutes, the code was run once and put into a csv file, which we later used to do our data analysis. 


In [None]:
def getSchools():
    url = "https://www.sports-reference.com/cbb/seasons/2018-school-stats.html"
    page = urlopen(url).read()
    soup = BeautifulSoup(page)
    count  = 0
    table = soup.find("tbody")
    school_dict = dict()
    for row in table.findAll('td', {"data-stat": "school_name"}):
        school_name = row.getText()
        for a in row.find_all('a', href=True):
            link = a['href'].strip()
            name = link[13:].split("/")[0]
            school_dict[name] = school_name
            
    return school_dict

def getDfs():
    school_set = getSchools()
    dfs = []
    final_df=pd.DataFrame()
    for school in school_set: 
        url = "https://www.sports-reference.com/cbb/schools/" + school + "/2018-schedule.html"
        page = urlopen(url).read()
        soup = BeautifulSoup(page)
        count = 0 
        pre_df = dict()
        school_set = getSchools()
        table = soup.find("tbody")
        featuresWanted =  {'opp_name', 'pts', 'opp_pts', 
                           'game_location','game_result','overtimes','wins','losses', 'date_game'} #add more features here!!

        rows = table.findChildren(['tr'])
        for row in rows:
            if (row.find('th', {"scope":"row"}) != None):

                for f in featuresWanted:
                    cell = row.find("td",{"data-stat": f})

                    a = cell.text.strip().encode()
                    text=a.decode("utf-8")
                    if f in pre_df:
                        pre_df[f].append(text)
                    else:
                        pre_df[f]=[text]
            
        df = pd.DataFrame.from_dict(pre_df)
        df["opp_name"]= df["opp_name"].apply(lambda row: (row.split("(")[0]).rstrip())
        df["school_name"]=school_set[school]
        df["school_name"] = df["school_name"].apply(lambda row: (row.split("(")[0]).rstrip())
        final_df=pd.concat([final_df,df])
    return final_df


def csvDump():
    df=getDfs()
    df.to_csv("scraped_data.csv")
    
    
csvDump()

After creating the csv, our csv (in this same folder, called scraped_data.csv contained data about all games that were played in the 2017-2018 season.

## The Elo Model 

The Elo Model is a way of creating a rating system for zero-sum games - games that only have one winner and one loser (e.g. basketball, hockey, football, tennis, etc.) The system uses the following method:

The algorithm works in the following way: 

Each team begins with the same ranking. The standard across most sports is ~1000-1500. We started out with *1200*, which was a common trend amongst others across the internet who had also used Elo Rankings for other sports. We then calculate the probability of each team winning with the following equation:

**Team1 Probability = (1.0 / (1.0 + 10^((Team1_Rating – Team2_Rating) / 400)))**

**Team2 Probability  = (1.0 / (1.0 + 10^((Team2_Rating – Team1_Rating) / 400)))**


We can see that Team1 Probabilty + Team2 Probabiilty = 1.0. The '400' is a standardized constant in Elo Rankings[(1)](https://en.wikipedia.org/wiki/Elo_rating_system)

When a game is played, we can update the rankings of both teams using the following equation: 

**Team1_Rating = Team1_Rating + K*(Team1_Score – Team1_Probability)**

**Team2_Rating = Team2_Rating + K*(Team2_Score – Team2_Probability)**

Here, the scores are determined by the outcome of the game:

win = 1.0
draw = 0.5
loss = 0.0

The K factor is a numerical value that "determines how much the Elo rating should change following a match result"[(2)](www.betfair.com.au). Across literature and the internet, a common k-factor for basketball has been 20 (Used by FiveThirtyEight and others). We can actually create a K=factor that depeonds on the nubmer of matches played. (More on this later). 


The following Elo class creates an Elo ranking for each team and updates it everytime a game is played. It will be used for data analysis on the data we scraped earlier.  

In [2]:
'''
WIN = 1.
DRAW = 0.5
LOSS = 0.

https://www.geeksforgeeks.org/elo-rating-algorithm/

'''
#: Default K-factor.
K_FACTOR = 25
#: Default rating class.
RATING_CLASS = float
#: Default initial rating.
INITIAL_RATING = 1200
#: Default Beta value.
BETA = 200


class Elo(object):
    #initialize object
    def __init__(self, teamName, kFactor = K_FACTOR, rating = INITIAL_RATING, beta = BETA):
        self.teamName = teamName
        self.kFactor = kFactor
        self.rating = rating 
        self.pWin = None
        self.beta = 2*BETA
        self.matches = 0 

    def calcPWin(self, oppRating): #expected
        pwin = 1/(1+1000.00**((self.rating - oppRating)/self.beta))
        self.pWin = pwin
        return pwin

    def game(self, outcome, oppRating): #1 for win, 0 for loss, 
        pwin =self.calcPWin(oppRating)
        self.rating = self.rating - self.kFactor*(outcome - pwin)
        self.matches+=1
        return True

    def getPWin(self):
        return self.pWin

    def getRating(self):
        return self.rating

    def setKFactor(self, k):
        self.kFactor = k 

## Computing the ELOs ## 

We computed the ELOs for every team by instantiating ELO class objects for all the major NCAA schools in the main function. Then we iterate through the data frame of games that we scraped, and updated the ELO value for all of the major schools for all regular season games. After sorting based on the ELO values, we updated the rankings and used this dictionary as our predicted outcomes of the March Madness tournament. 


The following includes high level steps of the main function: 
	1. We first clean the school name to remove, necessary characters
	2. Iterate through every game
        a. Update the home and opp team based their respective team ratings and their W/Ls 
	4. After we have the final ELOs, we sort the list based on their ELO ranking 

In [4]:
import pandas as pd
from datetime import datetime

def removeNCAA(x):
    #removes unnecessary characters
    if("NCAA" in x):
        return x[:-5]
    else:
        return x

def ranking(schoolDictionary): 
    outputs = []
    for key in schoolDictionary: 
        eloObject = schoolDictionary[key]
        item = (eloObject.teamName, eloObject.rating)
        outputs.append(item)
    return sorted(outputs,  key=lambda x: x[1])
     
    
def runELO(): 
    df = pd.read_csv("scraped_data.csv")
    df.drop(['Unnamed: 0'], axis = 1, inplace=True)
   # df['date_game'] =pd.to_datetime(df.date_game)
    df["school_name"].apply(removeNCAA)
    schoolDict = {} 
    schools = set(df['school_name'])


    for school in schools: 
        if school not in schoolDict: 
            schoolDict[school] = Elo(school)
    for index, row in df.iterrows(): 
        homeSchool = row["school_name"]
        oppSchool = row["opp_name"]
        if oppSchool not in schoolDict:
            #even if the opponent was not in the schools dictionary, then the opponent is not a major school
            #assigned this team a rating of 100, so that this win is not impact our ELO rating as a much as a major school would 
            oppRating = 100
            oppObj = None 
        else: 
            oppObj = schoolDict[oppSchool]
            oppRating = oppObj.getRating()
        #getting the rating before we update with the outcome, so that we can use the before game ELO to correctly 
        #reflect the outcome for the opponent after changing home team win
        
        schoolObj = schoolDict[homeSchool]
        schoolRating = schoolObj.getRating() 
    
        result = row["game_result"]
        if result == 'W': 
            schoolObj.game(1, oppRating)
            if oppObj != None: 
                oppObj.game(0, schoolRating)
        else: 
            schoolObj.game(0, oppRating)
            if oppObj != None: 
                oppObj.game(1, schoolRating)
        schoolDict[homeSchool] = schoolObj
    ranks = ranking(schoolDict)
    d = dict()
    for i in range(len(ranks)):
        d[ranks[i]]= i+1
    return d
        
    
d1 = runELO()


## Evaluating the Quality of ELO ## 

We used the output from our runELO function as the ELO's predicted data and computed its MSE against the actual March Madness data which we got from the get_original_MM_rankings() function. To determine the quality of this model, we decided to compute the MSE between the Associated Press’s ranking predictions and the actual. We got Associated Press's ranking predictions by running get_original_AP_rankings(). 

We then determined the quality of the model by finding the mean squared errors between the predicted and actual rankings. The mean squared algorithm follows the following steps to determine how well the predicted rankings fit the actual rankings. 

    1. Go through the teams in predicted data  
    2. Computed the difference between the actual and the predicted data, squared this value, and added it to a list 
    3. After all the squared errors have been computed, take the average of the list 

In [5]:
def get_original_MM_rankings():
    MM = {1: "Villanova", 2:"Michigan", 3: "Loyola (IL)", 4: "Kansas", 5: "Duke", 6: "Florida State",
         7: "Kansas State", 8: "Texas Tech", 9: "Nevada", 10: "Kentucky", 11: "Clemson", 12: "Syracuse",
         13: "Texas Tech", 14: "Gonzaga", 15: "West Virginia", 16: "Texas A&M", 17:"Houston", 18: "Tennessee",19: "Cincinnati",
        20: "Michigan",21: "Butler", 22: "Florida", 23: "Seton Hall",24: "Xavier",25: "Ohio State",
          26: "Maryland-Baltimore County", 27: "Rhode Island",28: "Buffalo", 29: "North Carolina",30: "Marshall", 31: "Alabama",32:"Auburn" } 

    res = {val:key for (key, val) in MM.items()}
    return res

def get_original_AP_rankings():
    AP = {1: "Virginia", 2:"Villanova", 3: "Xavier", 4: "Kansas", 5: "Michigan State", 6: "Cincinnati",
         7: "Michigan", 8: "Gonzaga", 9: "Duke", 10: "North Carolina", 11: "Purdue", 12: "Arizona",
         13: "Tennessee", 14: "Texas Tech", 15: "West Virginia", 16: "Wichita State", 
          17:"Ohio State", 18: "Kentucky",19: "Auburn",
        20: "Clemson",21: "Houston", 22: "Miami (FL)", 23: "Florida",24: "Nevada",25: "Saint Mary's (CA)"}
    {val:key for (key, val) in AP.items()}
    return AP 

In [6]:
def mseELO(n):
    #predicted data from the main function 
    predicted = {val:key for (key, val) in d1.items()}
    #actual data from the dictionary returned 
    actual = get_original_MM_rankings()
    ranks = []
    for i in range(n):
        rank = i+1 
        (school,score) = predicted[rank]
        school = (removeNCAA(school))
        actualRankings = {val:key for (key, val) in actual.items()}
        if school not in actual: 
            #since the school is not in the top 25 March Madness teams after the tournament 
            #assign the actual rank as n+1, which is the maximum (worst) rank 
            actualRank = n+1 
        else: 
            actualRank = actual[school]        
        rank_diff = abs(rank-actualRank)**2 
        ranks.append(rank_diff)
    mse = sum(ranks) / len(ranks)
    return (mse) 

def mseAP(n):
    predicted = get_original_AP_rankings()
    actual = get_original_MM_rankings()

    ranks = []
    for i in range(n):
        rank = i+1 
        (school) = predicted[rank]
        school = (removeNCAA(school))
        actualRankings = {val:key for (key, val) in actual.items()}
        if school not in actual: 
            #since the school is not in the top 25 March Madness teams after the tournament 
            #assign the actual rank as n+1, which is the maximum (worst) rank 
            actualRank = n+1
        else: 
            actualRank = actual[school]        
        rank_diff = abs(rank-actualRank)**2 
        ranks.append(rank_diff)
    mse = sum(ranks) / len(ranks)
    return (mse) 

In [7]:
print("The ELO's means squared error is: ", mseELO(25))
print("The AP's means squared error is: ", mseAP(25))

The ELO's means squared error is:  110.36
The AP's means squared error is:  137.72


## Analysis: ELO ## 
As we can see from above, our ELO predictions had a better MSE than the AP's predictions, thus proving that our predictions fit the results better than the AP's predictions. We do see that this number is still quite high, but we will see that using a linear regression and training on certain features will produce better results than the ELO model. 

![](elo.png)

### Model 2: Linear Regression

In practice with Machine Learning fitting a model to data is an iterative process. It takes multiple models and attempts to find a Machine Learning algorithm that achieves the desired performance. Therefore, in our project we decided to also implement a Linear Regression Model and compare the results produced with those from the ELO Rating.

As we learned in class, Linear Regression is estimating the vector of parameters Beta, from the equation

y=X*b+e. Where y is a vector of outputs, e is noise, and X is a design matrix. A design matrix consists of rows of examples where each row vector contains all the features for that example.


#### When to use Linear Regression?

Linear Regression is used when the outcome (dependent variable) is continuous and when your input features are
continuous.

#### Coefficient interpretation

In linear regression, the coefficient interpretation of independent variables (features) is as follows:
       
           --->Holding all other variables constant, with a unit increase in this variable, the dependent variable 
           is expected to increase/decrease by this much.


#### Error minimization technique

Linear regression uses ordinary least squares method to minimize the errors and arrive at a best possible fit

### Predicting March Madness Rankings through Linear Regression

**Inputs** are continious integer or float values that represent things like number of wins, losses, and free point percentage 

**Output** is a rating per team denominated as points below/above average.

In [8]:
import pandas as pd
import numpy as np
import scipy.linalg as la
import math
from collections import Counter

## Scraping Data for Linear Regression 

We will be scraping our data from sports-reference.com.

The features we identified as potentially significantly significant:

    1.) Games Played
    2.) Wins
    3.) Losses
    4.) Win-Loss %
    5.) Points
    6.) Field Goal%
    7.) Free Throw%
    8.) Offensive Rebounds
    9.) Assists
    10.) Steals
    11.) Blocks
    12.) Turnovers
    13.) Personal Fouls

For the 351 schools from the website, we must gather all of these features for each school and create an input feature matrix.

We must gather 2 sets of X data. One from 2016-2017 season data to train on and one from 2017-2018 season data to predict on.

#### 2016-2017 Training Feature Matrix

In [9]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd


def getData():
    url = "https://www.sports-reference.com/cbb/seasons/2017-school-stats.html"
    page = urlopen(url).read()
    soup = BeautifulSoup(page)
    count  = 0
    table = soup.find("tbody")
    featuresWanted =  {'school_name', 'g', 'wins', 'losses', 
                       'win_loss_pct', 'pts', 'fg_pct', 'fg3_pct', 'ft_pct', 'orb','trb', 'ast', 'stl', 'blk'
                       , 'tov', 'pf'}
    #find all rows-schools
    rows = table.findChildren(['tr'])
    pre_df={}
    for row in rows:
        if (row.find('th', {"scope":"row"}) != None):
            #iterate through all features in set
            for f in featuresWanted:
                cell = row.find("td",{"data-stat": f})
                a = cell.text.strip().encode()
                #remove any non-ascii characters
                text=a.decode("utf-8")
                #append each school's data into a dictionary of features
                if f in pre_df:
                    pre_df[f].append(text)
                else:
                    pre_df[f]=[text]
        #convert dictionary into a pandas dataframe
        df = pd.DataFrame.from_dict(pre_df)
        
    return df



def csvDump():
    df=getData()
    df.to_csv("scraped_data_lr.csv")
    
csvDump()



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


#### 2017-2018 Testing Feature Matrix

In [10]:
def getData2():
    url = "https://www.sports-reference.com/cbb/seasons/2018-school-stats.html"
    page = urlopen(url).read()
    soup = BeautifulSoup(page)
    count  = 0
    table = soup.find("tbody")
    featuresWanted =  {'school_name', 'g', 'wins', 'losses', 
                       'win_loss_pct', 'pts', 'fg_pct', 'fg3_pct', 'ft_pct', 'orb','trb', 'ast', 'stl', 'blk'
                       , 'tov', 'pf'}
    rows = table.findChildren(['tr'])
    pre_df={}
    #find all rows-schools
    for row in rows:
        if (row.find('th', {"scope":"row"}) != None):
            #iterate through all features in set
            for f in featuresWanted:
                cell = row.find("td",{"data-stat": f})
                a = cell.text.strip().encode()
                #remove any non-ascii characters
                text=a.decode("utf-8")
                #append each school's data into a dictionary of features
                if f in pre_df:
                    pre_df[f].append(text)
                else:
                    pre_df[f]=[text]
        #convert dictionary into a pandas dataframe
        df = pd.DataFrame.from_dict(pre_df)
        
    return df



def csvDump2():
    df=getData2()
    df.to_csv("scraped_data_lr_test_X.csv")
    
csvDump2()




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


### Feature Engineering and Data Validation

We also want to feature engineer our data according to the following guidelines to ensure that the data is in the correct format for the Linear Regression Model

**1.)** Linear models typically have a constant bias term. We will encode this as a column of 1s in the dataframe. Call this column 'bias'.

**2.)** We will keep most of the columns as is, since they are already numerical

**3.)** Normalize Data

**4.)** Check types of all data: 

    ast, blk, g, losses, orb, pf, pts,stl, tov, trb, wins (INTS)
   
    fg3_pct, fg_pct, ft_pct, win_loss_pct (FLOAT)
   
After scraping the data we want to ensure at the very least, that all of the schools data is present in our feature matrix. We can easily do this by checking the dimensions of our matrix.

In [11]:
def process_data():
    #CLEAN DATA
    df = pd.read_csv("scraped_data_lr.csv")
    df.drop(['Unnamed: 0'], axis = 1, inplace=True)
    #insert bias term
    df['bias']=1.0
    df.drop(['school_name'], axis = 1, inplace=True)
    return df
    
X_train=process_data()
X_train.head()

Unnamed: 0,ast,blk,fg3_pct,fg_pct,ft_pct,g,losses,orb,pf,pts,stl,tov,trb,win_loss_pct,wins,bias
0,408,83,0.376,0.464,0.663,29,16,222,631,2009,210,412,904,0.448,13,1.0
1,494,57,0.363,0.436,0.717,33,21,304,594,2411,197,390,1149,0.364,12,1.0
2,517,118,0.372,0.466,0.683,36,9,357,653,2772,198,422,1246,0.75,27,1.0
3,326,50,0.28,0.405,0.669,29,27,299,526,1794,100,384,914,0.069,2,1.0
4,503,160,0.348,0.462,0.736,33,16,321,527,2369,157,444,1188,0.515,17,1.0


In [12]:
def process_data_test_X():
    #CLEAN DATA
    df = pd.read_csv("scraped_data_lr_test_X.csv")
    df.drop(['Unnamed: 0'], axis = 1, inplace=True)
    #insert bias term
    df['bias']=1.0
    df.drop(['school_name'], axis = 1, inplace=True)
    return df
    
X_test=process_data_test_X()
X_test.head()

Unnamed: 0,ast,blk,fg3_pct,fg_pct,ft_pct,g,losses,orb,pf,pts,stl,tov,trb,win_loss_pct,wins,bias
0,482,128,0.325,0.464,0.701,32,16,305,671,2359,260,461,1093,0.5,16,1.0
1,445,78,0.331,0.419,0.734,31,19,302,557,2124,201,391,1014,0.387,12,1.0
2,417,81,0.36,0.435,0.696,32,18,300,637,2296,189,440,1066,0.438,14,1.0
3,340,49,0.303,0.397,0.647,31,28,328,506,1873,123,514,1046,0.097,3,1.0
4,560,127,0.345,0.488,0.75,33,13,334,490,2536,178,437,1259,0.606,20,1.0


In [13]:
print(X_train.shape[0])
print(X_test.shape[0])

351
351


## Scraping The Output Data

We also need to scrape the sports-reference website to get ratings to train on also test the accuracy of our model on.

**2017 Ratings** for each school, provided by the website, will be used to train our model on

**2018 Ratings** for each school, provided by the website, will be used to test the accuracy of our model on (by comparing it with the models predicted values for 2018).


#### 2017 Rankings- Training Outputs

In [14]:
def getY():
    url = "https://www.sports-reference.com/cbb/seasons/2017-school-stats.html"
    page = urlopen(url).read()
    soup = BeautifulSoup(page)
    count  = 0
    table = soup.find("tbody")
    featuresWanted =  {'school_name', 'srs'}
    rows = table.findChildren(['tr'])
    pre_df={}
    rows
    for row in rows:
        if (row.find('th', {"scope":"row"}) != None):
            for f in featuresWanted:
                cell = row.find("td",{"data-stat": f})
                a = cell.text.strip().encode()
                text=a.decode("utf-8")
                if f in pre_df:
                    pre_df[f].append(text)
                else:
                    pre_df[f]=[text]
            df = pd.DataFrame.from_dict(pre_df)
        
    return df
def csvDumpTrainY():
    df=getY()
    df.to_csv("scraped_data_lr_test.csv")
csvDumpTrainY()





 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


#### 2018 Rankings- Testing Outputs


In [15]:
def getY2():
    url = "https://www.sports-reference.com/cbb/seasons/2018-school-stats.html"
    page = urlopen(url).read()
    soup = BeautifulSoup(page)
    count  = 0
    table = soup.find("tbody")
    featuresWanted =  {'school_name', 'srs'}
    rows = table.findChildren(['tr'])
    pre_df={}
    rows
    for row in rows:
        if (row.find('th', {"scope":"row"}) != None):
            for f in featuresWanted:
                cell = row.find("td",{"data-stat": f})
                a = cell.text.strip().encode()
                text=a.decode("utf-8")
                if f in pre_df:
                    pre_df[f].append(text)
                else:
                    pre_df[f]=[text]
            
        df = pd.DataFrame.from_dict(pre_df)
        
    return df

def csvDumpTestY():
    df=getY2()
    df.to_csv("scraped_data_lr_test_Y.csv")
csvDumpTestY()




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


### Process and Validate Output Data

Let's clean up and validate the output data by removing unwanted columns and making sure all schools have entries.

In [16]:

def process_train_Y():
    #CLEAN DATA
    df = pd.read_csv("scraped_data_lr_test.csv")
    df.drop(['Unnamed: 0'], axis = 1, inplace=True)
    df.drop(['school_name'], axis = 1, inplace=True)

    return df
y_train=process_train_Y()
def process_test_Y():
    #CLEAN DATA
    df = pd.read_csv("scraped_data_lr_test_Y.csv")
    df.drop(['Unnamed: 0'], axis = 1, inplace=True)
    df.drop(['school_name'], axis = 1, inplace=True)

    return df
y_test=process_test_Y()
print(y_train.shape[0])
print(y_test.shape[0])


351
351


## Instantiating and Running the Linear Regression Model

1.) Instantiate a linear regression model using the sklearn library

2.) Fit the model to the training set (2016-2017 season data and rankings)

3.) Predict 2018 rankings using 2017-2018 season data


In [17]:
from sklearn import datasets, linear_model

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = regr.predict(X_test)

In [18]:
print(y_pred)

[[ -1.09145317]
 [ -5.95802877]
 [ -8.49269767]
 [-25.34304605]
 [  9.21925457]
 [-11.4941322 ]
 [  3.52412716]
 [  6.77605817]
 [ -9.13922422]
 [-13.35458591]
 [ -5.21331213]
 [  4.42626806]
 [ 15.68032268]
 [-14.14654717]
 [ -6.84019499]
 [ -8.26361591]
 [  9.85210845]
 [ -5.49807188]
 [ 11.35167106]
 [ -2.96448602]
 [  0.41556419]
 [  7.63803276]
 [  5.18995381]
 [ -3.84241557]
 [ -4.42008136]
 [  1.23663647]
 [  2.48517896]
 [ -3.65112866]
 [ -3.51563778]
 [  3.90264421]
 [ 10.77361847]
 [ -8.51729958]
 [-16.86553095]
 [  7.09303318]
 [ 10.61424246]
 [  6.79123599]
 [ -3.58630956]
 [ -7.78699831]
 [ -0.20774513]
 [ -9.27419088]
 [  0.07102235]
 [  4.31523565]
 [-10.18860112]
 [  9.83032832]
 [-10.70378091]
 [  5.71221975]
 [  1.72465963]
 [  3.96910872]
 [ -4.99775767]
 [ -4.17465712]
 [  6.18023904]
 [ -2.27236221]
 [-15.20434276]
 [-11.56826205]
 [-18.25929224]
 [ 22.55229686]
 [-11.51438168]
 [ 12.76656157]
 [ -8.98542462]
 [ -3.9514532 ]
 [  4.18297834]
 [  9.59043855]
 [ -6.18

## How Well is Our Model Performing?

Let's compute the Mean Squared Error between the predicted rankings of all 351 schools from the LR model and
the actual 2018 rankings provided by the website.

In [19]:
def MSE(y_train,y_test):
    return (np.square(y_train-y_test)).mean()

print(MSE(y_pred, y_test))


srs    30.254876
dtype: float64


### Conclusions and March Madness

A MSE of 30.25 is comparable and even slightly better than the MSE that the ELO model produced!

Now that we have verified the significance of the features we used in our model and the performance of our model,
lets compare the rankings we produced for 2018 with actual March Madness rankings.



In the code below, we will take the top 15 ranked teams after their 2017-2018 season and compare these top 15 teams with the top 15 teams from after March Madness.

In [20]:
from itertools import *

#get a list of all schools whose indices map to their rankings
def get_all_schools():
    #CLEAN DATA
    df = pd.read_csv("scraped_data_lr_test.csv")
    
    return df['school_name']

schools=get_all_schools()
school=schools.tolist()

#Rankings produced by our Model

sub=list(chain.from_iterable(y_pred))
#zip together schools with their rankings
new=list(zip(school,sub))
#sort list based on rankings
new2=sorted(new, key=lambda x: x[1])

new3=new2[::-1]
#produce top 15 teams
new4=new3[:16]

x_pred_new= [x[0] for x in new4]
y_pred_new=[x[1] for x in new4]
final=list(zip(x_pred_new,y_pred_new))
print(final)

[('Michigan State\xa0NCAA', 27.835198061156461), ('Villanova\xa0NCAA', 23.995928255291943), ('Cincinnati\xa0NCAA', 22.552296855649431), ('Virginia\xa0NCAA', 22.150852600143878), ('Purdue\xa0NCAA', 21.314685745185486), ('Duke\xa0NCAA', 21.006966947664694), ("Saint Mary's (CA)\xa0NCAA", 19.802683153474376), ('Gonzaga\xa0NCAA', 18.290936880472472), ('Nevada\xa0NCAA', 17.735195888947914), ('Kansas\xa0NCAA', 17.722079192694849), ('North Carolina\xa0NCAA', 17.576207482948874), ('Michigan\xa0NCAA', 17.555943336738395), ('West Virginia\xa0NCAA', 17.525263082362443), ('Penn State', 15.937876721879945), ('Arizona\xa0NCAA', 15.680322679394024), ('Louisville\xa0NCAA', 14.747995986727062)]


![](lr.png)