In [1]:
#pragma cling add_include_path("/home/patrick/march_madness_jupyter/include_files/SimLib/")
#pragma cling add_include_path("/home/patrick/march_madness_jupyter/include_files/eigen/")

#include "xtensor-blas/xlinalg.hpp"

#include <iostream>
#include <fstream>
#include <map>
#include "Eigen/Core"
#include "Eigen/Dense"
#include <algorithm>
#include <iterator>
#include "Simulator.h"
#include "Game.h"
#include "Team.h"

#include <chrono>
using namespace std::chrono;

In [2]:
using namespace std;
using namespace Eigen;

vector<string> labels;
vector<Team*> teamCollection;
vector<Team> resultVector;
vector<int> x_axis;
vector<int> y_axis;
map<int, Team*> teamMap;
Matrix<double, Dynamic, Dynamic> gameMatrix;
VectorXd scores;
VectorXd solutionVector;
vector<int> correct;
int numGamesPlayed;
int numTeams;
const int HIGH_MARGIN = 21;
const double HIGH_MARGIN_SCALE = .8;
double pSum;

In [3]:
Team::Team(string name, int id)
{
    this->id = id;
    this->name = name;
}

In [4]:
void Team::addGame(Game* newGame)
{
    this->gameCollection.push_back(newGame);
}

In [5]:
std::vector<Game*> Team::getAllGames()
{
    return this->gameCollection;
}

In [6]:
int Team::getId()
{
    return this->id;
}

In [7]:
string Team::getName()
{
    return this->name;
}

In [8]:
int Team::getNumGamesPlayed()
{
    return this->Team::gameCollection.size();
}

In [9]:
string Team::toString()
{
    return name;
}

In [10]:
void Team::addRank(double rank)
{
    this->rank = rank;
}

In [11]:
double Team::getRank() const 
{
    return this->rank;
}

In [12]:
void printVector(std::vector<int> vect)
{
    for(int i = 0; i < vect.size(); i++)
{
    cout << vect.at(i) << '\n';
}
}

In [13]:
void printVector(std::vector<Team> vect)
{
    for(int i = vect.size()-1; i >= vect.size()-64; i--)
{
    cout << vect.at(i).getId() << '\n';
}
}

In [14]:
#define LESS_THAN operator< //workaround for bug in cling
bool LESS_THAN(const Team &team1, const Team &team2)
{
    return team1.getRank() < team2.getRank();
}
#undef LESS_THAN

In [15]:
Game::Game(int day, Team* team1, Team* team2, int score1, int score2)
{
    this->day = day;
    this->teams.push_back(team1);
    this->teams.push_back(team2);
    this->score1 = score1;
    this->score2 = score2;
}

In [16]:
void createTeams(string teamData) {
    string id, name;
    ifstream teams (teamData);
    if(teams.is_open()){
        while (getline(teams, id, ',')){
            getline(teams, name, '\n');
            int idNum = stoi(id);
            Team* tempTeam = new Team(name, idNum);
            teamCollection.push_back(tempTeam);
            teamMap.insert(pair<int, Team*>(idNum, tempTeam));
            numTeams++;
        }
        teams.close();
    }
    else cout << "Unable to open file";
}

In [17]:
void loadGames(string gameData, int home_field_advantage, bool apply_scaling) {
    gameMatrix = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>::Zero(numTeams , numTeams);
    scores = Eigen::VectorXd::Zero(numTeams);

    string days, date, team1, field1, score1, team2, field2, score2;
    fstream games(gameData);
    if(games.is_open()){
        while(getline(games, days, ',')){
            getline(games, date, ',');
            getline(games, team1, ',');
            getline(games, field1, ',');
            getline(games, score1, ',');
            getline(games, team2, ',');
            getline(games, field2, ',');
            getline(games, score2, ' ');
            getline(games, score2, ' ');

            int daysInt = stoi(days);
            int team_1_Id = stoi(team1);
            int team_2_Id = stoi(team2);
            int team_1_score = stoi(score1);
            int team_2_score = stoi(score2);
            Game tempGame(daysInt, teamMap.at(team_1_Id), teamMap.at(team_2_Id), team_1_score, team_2_score);
            teamMap.at(team_1_Id)->addGame(&tempGame);
            teamMap.at(team_2_Id)->addGame(&tempGame);
            numGamesPlayed++;

            /*E
             * Edit game score based on flag.
             * flag = 1: 4 is added to home team score
             */
            if(stoi(field1) == 1){
                team_1_score += home_field_advantage;
            }
            if(stoi(field2) == 1){
                team_2_score += home_field_advantage;
            }

            populateMatrix(team_1_Id, team_2_Id, team_1_score, team_2_score, apply_scaling);
            for(int i = 0; i < numTeams; i++){
                gameMatrix.row(numTeams - 1).col(i) << 1;
            }
            scores.row(numTeams - 1) << 0;
        }
    }
    else cout << "Unable to open file";
}

In [18]:
void populateMatrix(int team1Index, int team2Index, int team_1_score, int team_2_score, bool apply_scaling) {
    gameMatrix.row(team1Index - 1 ).col(team2Index - 1) << -1;
    gameMatrix.row(team2Index -1 ).col(team1Index -1) <<  -1;
    gameMatrix.row(team1Index -1 ).col(team1Index -1) <<  teamMap.at(team1Index)->getNumGamesPlayed();
    gameMatrix.row(team2Index -1 ).col(team2Index -1) <<  teamMap.at(team2Index)->getNumGamesPlayed();
    //implementation of scaling if predicted margin is greater then 21
    if(abs(team_1_score - team_2_score) < HIGH_MARGIN || apply_scaling == false) {
        if (team_1_score > team_2_score) {
            int temp = scores.row(team1Index - 1).value();
            scores.row(team1Index - 1) << temp + (team_1_score - team_2_score);
            temp = scores.row(team2Index - 1).value();
            scores.row(team2Index - 1) << temp + (team_2_score - team_1_score);
        } else {
            int temp = scores.row(team2Index - 1).value();
            scores.row(team2Index - 1) << temp + (team_2_score - team_1_score);
            temp = scores.row(team1Index - 1).value();
            scores.row(team1Index - 1) << temp + (team_1_score - team_2_score);
        }
    }
    else{

        if(team_1_score > team_2_score)
        {
            int pre_margin = team_1_score - team_2_score;
            int post_margin = HIGH_MARGIN + (1/HIGH_MARGIN_SCALE) * (pow((pre_margin - 20),HIGH_MARGIN_SCALE)-1);
            int temp = scores.row(team1Index - 1).value();
            scores.row(team1Index - 1) << temp + post_margin;
            temp = scores.row(team2Index - 1).value();
            scores.row(team2Index - 1) << temp - post_margin;
        } else{
            int pre_margin = team_2_score - team_1_score;
            int post_margin = HIGH_MARGIN + (1/HIGH_MARGIN_SCALE) * (pow((pre_margin - 20),HIGH_MARGIN_SCALE)-1);
            int temp = scores.row(team2Index - 1).value();
            scores.row(team2Index - 1) << temp + post_margin;
            temp = scores.row(team1Index - 1).value();
            scores.row(team1Index - 1) << temp - post_margin;
        }

    }
}

In [19]:
void constructCorrectVector(string filename)
{
string id;
fstream rankings(filename);
    if(rankings.is_open()){
        while(getline(rankings,id,'\n')){
            int id_num = stoi(id);
            correct.push_back(id_num);
        }
    }
}

In [20]:
void createLabels()
{
for(int i = resultVector.size()-1; i >= 288; i--)
{
    labels.push_back(resultVector.at(i).getName());
}
}

In [21]:
void constructAxes()
{
int count = 0;
double sum = 0;
for(int i = 1; i < 65; i++)
{
    x_axis.push_back(i);
}
for(int i = resultVector.size()-1; i >= 0; i--)
{
    int current_value = resultVector.at(i).getId();
    for(int j = 1; j<65; j++)
    {
        if(current_value == correct.at(j-1))
        {
            y_axis.push_back(j);
            sum += pow(j - x_axis.at(count),2);
            count += 1;
        }
    }
}
    pSum = sum;
}

In [22]:
double sse()
{
    return pSum;
}

In [23]:
double mse()
{
    return pSum/64.0;
}

In [24]:
double se()
{
    return pow((pSum/64.0), .5);
}

In [25]:
void run(int home_field_advantage, bool apply_scaling){
    createTeams(string("/home/patrick/march_madness_jupyter/Data/NCAABasketballTeams.txt"));
    loadGames(string("/home/patrick/march_madness_jupyter/Data/NCAABasketballGames.txt"), home_field_advantage, apply_scaling);
    solutionVector = gameMatrix.lu().solve(scores);

    map<int, Team*>::iterator itr;
    for(itr = teamMap.begin(); itr != teamMap.end(); ++itr){
        itr->second->addRank(solutionVector.row(itr->second->getId() -1).value());
        resultVector.push_back(*(itr->second));
    }

     sort(resultVector.begin(), resultVector.end());
     vector<Team>::iterator itr2;
     for(itr2 = resultVector.end() - 1; itr2 != resultVector.begin() - 1 ; --itr2){
        cout << itr2->getId() << itr2->getName() << endl;
     }
}

# Parameters
## Home Field Advantage
Home field advandtage is used to account for the fact that the home team is generally at an advantage over the away team. For example, if Duke beat Gonzaga by 2 points and Gonzaga was the home team, a home field advantage of 4 would cause Gonzaga to take the win by 2 points.
## Scale Large Margins
Scale large margins is used to account for large score margins negatively affecting rankings. When a highly ranked team defeats a low ranked team, the higher ranked team will usually put in back up players and play conservatively, so even if they win, their rating will almost always go down, and the opposite is true for the lower ranked team. To compensate for this, games with win margins >= 21 will be scaled down to have less effect on the rankings.


In [26]:
#include "xwidgets/xslider.hpp"
xw::slider<int> slider;
slider.min = 0;
slider.max = 15;
slider.value = 0;
slider.description = "Home Field Advantage";
slider.display();

#include "xwidgets/xcheckbox.hpp"
xw::checkbox checkbox;
checkbox.value = false;
checkbox.description = "Scale Large Margins?";
checkbox.indent = false;
checkbox.display();

A Jupyter widget

A Jupyter widget

## Results

In [27]:
int home_field_advantage = slider.value;
bool apply_scaling = checkbox.value;
auto start = high_resolution_clock::now();
run(home_field_advantage, apply_scaling);
auto stop = high_resolution_clock::now();
auto duration = duration_cast<microseconds>(stop - start);
std::cout << duration.count() << std::endl;

104 Gonzaga
75 Duke
329 Virginia
205 North Carolina
295 Texas Tech
169 Michigan St
114 Houston
137 Kentucky
289 Tennessee
330 Virginia Tech
17 Auburn
236 Purdue
168 Michigan
353 Youngstown St
195 Nevada
32 Buffalo
348 Wofford
92 Florida St
153 LSU
198 New Mexico St
149 Louisville
320 Utah St
326 VCU
184 Murray St
192 NC State
172 Mississippi St
127 Iowa St
133 Kansas
143 Lipscomb
346 Wisconsin
21 Belmont
47 Cincinnati
222 Oregon
158 Marquette
308 UCF
49 Clemson
279 St Mary's CA
160 Maryland
327 Vermont
328 Villanova
246 S Dakota St
292 Texas
285 Syracuse
89 Florida
337 Washington
67 Dayton
134 Kansas St
95 Furman
112 Hofstra
164 Memphis
218 Oklahoma
94 Fresno St
126 Iowa
305 UC Irvine
297 Toledo
142 Liberty
217 Ohio St
253 San Francisco
171 Mississippi
20 Baylor
194 Nebraska
188 N Kentucky
60 Creighton
287 TCU
123 Indiana
14 Arkansas
54 Colorado
352 Yale
11 Arizona St
177 Montana
66 Davidson
226 Penn St
83 ETSU
209 Northeastern
349 Wright St
288 Temple
166 Miami FL
321 Utah Valley
343 

In [28]:
#include "xplot/xfigure.hpp"
#include "xplot/xmarks.hpp"
#include "xplot/xaxes.hpp"

In [29]:
constructCorrectVector("/home/patrick/march_madness_jupyter/Data/Correct.txt");
createLabels();
constructAxes();
double SSE = sse();
double MSE = mse();
double SE = se();

$$SSE = \sum \limits _{i=1} ^m(Y_i-\hat{Y_i})^2$$

In [30]:
xpl::linear_scale sx, sy;
xpl::lines line(sx, sy);
line.x = y_axis;
line.y = y_axis;
auto ax_x = xpl::axis::initialize(sx)
    .label("predicted")
    .finalize();
auto ax_y = xpl::axis::initialize(sy)
    .label("actual")
    .orientation("vertical")
    .side("left")
    .finalize();

In [31]:
#include "xplot/xtooltip.hpp"
xpl::tooltip def_tt,test;
def_tt.fields = std::vector<xtl::xoptional<std::string>>{"x","y"};
def_tt.labels = std::vector<xtl::xoptional<std::string>>{"Predicted Ranking","Actual Ranking"};

In [32]:
auto scatter1 = xpl::scatter::initialize(sx, sy)
   .x(x_axis)
   .y(y_axis)
   .unhovered_style(::xeus::xjson::parse(R"({"opacity": "0.5"})"))
   .tooltip(def_tt)
   .finalize();
scatter1.names = labels;

In [33]:
auto fig1 = xpl::figure::initialize()
    .padding_x(0.1)
    .padding_y(0.025)
    .finalize();
fig1.add_mark(scatter1);
fig1.add_axis(ax_x);
fig1.add_axis(ax_y);
fig1.add_mark(line);

# Actual vs. Predicted
This plot displays the top 64 teams with any particular teams x value representing their predicted ranking and y value representing their actual rankikng. A team(point) close to the line represents a good/accurate prediction while teams far away from the line represent inaccurate predictions. You can hover over a point on the plot and it will tell you the actual ranking and the ranking the model predicted.

In [34]:
fig1

A Jupyter widget

## Sum of Squared Errors
The method used to predict the ranking of the teams uses a least squares linear regression. For any team, we want to express the margin of victory as a linear function of the teams who played that game (Massey, 1997). Each equation will have an error term, which is the actual value - the predicted value. The best model will minimize the sum of squared error terms. This is a metric we can use to determine how well our predicted points fit the curve of the actual points. The lower the SSE, the better our model does. This is the equation:

$$SSE = \sum \limits _{i=1} ^n(y_i-\hat{y_i})^2$$

Where y is the actual ranking and $\hat{y}$ is the predicted ranking.

In [35]:
cout << "SSE = " << SSE << "\n" << "With Home Field Advantage: " << home_field_advantage
<< "\n" << "And Margin of Victory: " << std::boolalpha << apply_scaling << endl;

SSE = 17658
With Home Field Advantage: 0
And Margin of Victory: false


## Mean Squared Error
A more interpretable metric for a regression is the mean squared error. It is essentially the same as SSE, but gives us an average squared error for any point. The benefit to this is that the metric can go *down* as the number of points goes up. This is better because with SSE as you add points, the number will always go up, even if you add points that fit the model well. This is the equation for Mean Squared Errors:

$$MSE = \frac{1}{n} \sum \limits _{i=1} ^n(y_i-\hat{y_i})^2$$

In [36]:
cout << "MSE = " << MSE << "\n" << "With Home Field Advantage: " << home_field_advantage
<< "\n" << "And Margin of Victory: " << std::boolalpha << apply_scaling << endl;

MSE = 275.906
With Home Field Advantage: 0
And Margin of Victory: false


## Standard Error
Perhaps the most readable metric for a regression is the standard error. There are many different derivations of this equations, but the most simple one is: 

$$\sigma = \sqrt{\frac{\sum \limits _{i=1} ^n(y_i-\hat{y_i})^2}{n}}$$

This gives us the *standard deviation of the errors of predections*

In [37]:
cout << "SE = " << SE << "\n" << "With Home Field Advantage: " << home_field_advantage
<< "\n" << "And Margin of Victory: " << std::boolalpha << apply_scaling << endl;

SE = 16.6104
With Home Field Advantage: 0
And Margin of Victory: false
