# How to build a logistic regression model

### 0. Import required modules

In [27]:
import pandas as pd #import pandas library for data manipulation (https://pandas.pydata.org/)
import numpy as np #import the 'NumPy' module for scientific computing with Python (https://numpy.org/)

In [28]:
%run -i 'SportsAnalytics.py' #source required functions for building a logistic regression and plotting the results

### 1. Create generic dataset

In [29]:
# Independent variables: one list per predictor, 18 observations each.
# None marks a missing observation; pandas stores it as NaN in the dataframe.
var1 = [2.7, 0.4, 2.4, 0.4, -0.6, -1.6, -0.7, -3.9, 0.7, 3.2, -1.0, 1.5, -2.1, 0.0, -6.2, 2.1, -2.1, 2.7]
var2 = [-1.2, -4.0, None, 0.6, -3.0, -4.2, -4.3, -4.4, -4.3, -3.8, -4.3, -4.2, 1.8, -0.5, -4.3, -4.1, -3.4, 0.9]
var3 = [8.0, 13.0, None, 2.3, 15.5, 4.2, 11.5, -3.0, 1.6, -6.3, 7.0, -5.8, -8.7, -2.3, 14.6, 14.3, 16.1, -10.5]

# Dependent variable: binary outcome (0 or 1) for each observation.
y = [0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0]

# Combine predictors and outcome into a single dataframe, one column per variable.
data = pd.DataFrame(dict(var1=var1, var2=var2, var3=var3, y=y))

data  # display dataframe

Unnamed: 0,var1,var2,var3,y
0,2.7,-1.2,8.0,0
1,0.4,-4.0,13.0,0
2,2.4,,,1
3,0.4,0.6,2.3,0
4,-0.6,-3.0,15.5,1
5,-1.6,-4.2,4.2,1
6,-0.7,-4.3,11.5,1
7,-3.9,-4.4,-3.0,1
8,0.7,-4.3,1.6,1
9,3.2,-3.8,-6.3,1


### 2. Fitting a logistic regression

Logistic regression aims to solve classification problems.   
It does this by predicting categorical outcomes, unlike linear regression, which predicts a continuous outcome.  

In the simplest case there are only two possible outcomes; this is called binomial logistic regression.  
When there are more than two outcomes to classify, it is called multinomial logistic regression.  

Here we will be using basic logistic regression to predict a binomial variable.   
This means that the actual result of the dependent variable (y) has only two possible outcomes (0, 1).  

With a trained logistic regression model we are able to predict the probability (a continuous value between 0 and 1) that the dependent variable equals 1.

In [30]:
# Build the logistic regression model with LogisticRegTrain() from 'SportsAnalytics.py':
# the three predictor columns go in as X and the binary outcome column as Y.
model = LogisticRegTrain(
    X=data[['var1', 'var2', 'var3']],
    Y=data['y'],
)

Optimization terminated successfully.
         Current function value: 0.362319
         Iterations 7
                   const   var1    var2    var3
y                                              
Coefficients     -2.0467 0.3006 -1.4322 -0.2072
Std error         1.2998 0.3217  0.6925  0.1413
p-value           0.1153 0.3502  0.0386  0.1427
Log-likelihood   -6.1594                       
Number valid obs 13.0000                       
Total obs        17.0000                       


### 3. Predicting results using logistic regression model

In [31]:
# Predict y for the known (in-sample) values of x with LogisticRegPredict() from 'SportsAnalytics.py',
# using the same three predictor columns the model was trained on.
model_predict = LogisticRegPredict(
    model,
    X=data[['var1', 'var2', 'var3']],
)
model_predict  # display the predictions

Unnamed: 0,var1,var2,var3,prediction
0,2.7,-1.2,8.0,0.24
1,0.4,-4.0,13.0,0.75
2,2.4,,,
3,0.4,0.6,2.3,0.04
4,-0.6,-3.0,15.5,0.24
5,-1.6,-4.2,4.2,0.93
6,-0.7,-4.3,11.5,0.82
7,-3.9,-4.4,-3.0,0.98
8,0.7,-4.3,1.6,0.98
9,3.2,-3.8,-6.3,1.0


### 4. Predicting out-of-sample (OOS) results using logistic regression model

In [32]:
# Out-of-sample independent variables: six new observations, one list per predictor.
oos_var1 = [0.7, -4.8, -3.4, -0.3, 4.5, 2.6]
oos_var2 = [-3.4, -1.9, -0.3, -3.2, -2.4, -0.6]
oos_var3 = [2.3, 11.0, 9.8, 2.5, 3.9, 10.3]

# Combine the new predictors into a dataframe with the same column names used for training.
oos_data = pd.DataFrame(dict(var1=oos_var1, var2=oos_var2, var3=oos_var3))

# Predict values of y for the new out-of-sample values of x with the already-trained model.
oos_model_predict = LogisticRegPredict(
    model,
    X=oos_data[['var1', 'var2', 'var3']],
)

oos_model_predict  # display dataframe

Unnamed: 0,var1,var2,var3,prediction
0,0.7,-3.4,2.3,0.93
1,-4.8,-1.9,11.0,0.05
2,-3.4,-0.3,9.8,0.01
3,-0.3,-3.2,2.5,0.87
4,4.5,-2.4,3.9,0.87
5,2.6,-0.6,10.3,0.07
