In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

<p>Import libraries.</p>

In [2]:
class linearRegression:
    def __init__(self, learningRate=0.01, iterations=1000):
        self.learningRate = learningRate
        self.iterations = iterations
        self.weights = None
        self.bias = None

    def fit(self, features, target):
        nSamples, nFeatures = features.shape
        self.weights = np.zeros(nFeatures)
        self.bias = 0

        for _ in range(self.iterations):
            yPredicted = np.dot(features, self.weights) + self.bias
            dw = (1 / nSamples) * np.dot(features.T, (yPredicted - target))
            db = (1 / nSamples) * np.sum(yPredicted - target)
            self.weights -= self.learningRate * dw
            self.bias -= self.learningRate * db

    def predict(self, features):
        return np.dot(features, self.weights) + self.bias

<p>__init__ function: Initialize class variables.</p>
<p>fit function: fit the model, create feature matrix and target variable. Use gradient descent to determine weights and bias. Update weights and bias at bottom of loop. </p>
<p>predict function: predict values using the trained weights and bias.</p>

In [3]:
plays = pd.read_csv("data/plays.csv")

Load data

In [4]:
def playType(description):
    if "pass" in description.lower():
        return "Pass"
    elif "run" in description.lower():
        return "Run"
    return "Unknown"

def side(description):
    if "right" in description.lower():
        return "Right"
    elif "left" in description.lower():
        return "Left"
    return "Center"

<p>Extract play type and side data.</p>

In [5]:
plays["playType"] = plays["playDescription"].apply(playType)
plays["playSide"] = plays["playDescription"].apply(side)

<p>Use functions to create new columns.</p>

In [6]:
plays["yardsGained"] = plays["expectedPointsAdded"]

<p>Yards gained is target variable.</p>

In [7]:
filterPlays = plays[
   (plays["playType"].isin(["Pass", "Run"])) &
   (plays["playSide"].isin(["Right", "Left"])) 
]

<p>Filter out Unkown and Center side plays.</p>

In [8]:
filterPlays["encodePlayType"] = filterPlays["playType"].map({"Run": 0, "Pass": 1})
filterPlays["encodeSideType"] = filterPlays["playSide"].map({"Left": 0, "Right": 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filterPlays["encodePlayType"] = filterPlays["playType"].map({"Run": 0, "Pass": 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filterPlays["encodeSideType"] = filterPlays["playSide"].map({"Left": 0, "Right": 1})


In [9]:
X = filterPlays[["encodePlayType", "encodeSideType"]].values
y = filterPlays["yardsGained"].values

<p>Create feature matrix and target variable.</p>

In [10]:
XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = 0.2, random_state=42)

<p>Create training and test data splits.</p>

In [12]:
learningRates = [0.001, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0]
iterations = 1000
results = []

for lRate in learningRates:
    linReg = linearRegression(learningRate=lRate, iterations=iterations)
    linReg.fit(XTrain, yTrain)
    yPred = linReg.predict(XTest)
    errorMeasure = np.mean((yTest - yPred) ** 2)
    results.append({"Learning Rate": lRate, "MSE": errorMeasure})
results2 = pd.DataFrame(results)
best = results2.sort_values(by="MSE", ascending=True).iloc[0]

print("Best Learning Rate: ", best["Learning Rate"])
print("Lowest MSE: ", best["MSE"])

Best Learning Rate:  0.5
Lowest MSE:  1.7774829296306685


<p>Performs a parameter sweep over different learning rates for a linear regression to find which is best. Predictions are made on XTest and MSE is calculatd. Results are stored in a dataframe and the learning rate with lowest MSE is printed</p>