In [22]:
# Import libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression

In [23]:
# Load the CSV (path is relative to the working directory) and preview its last five rows
data = Path('Data/model_data2.csv')
df = pd.read_csv(filepath_or_buffer=data)
df.tail(n=5)

Unnamed: 0,Open,Close,Diff.,Diff. Dec.,Diff. %,Result
2457,-115,-117,-2,0.017391,1.73913,1
2458,-115,-120,-5,0.043478,4.347826,1
2459,-105,-104,1,-0.009524,0.952381,0
2460,-115,-121,-6,0.052174,5.217391,1
2461,-120,-124,-4,0.033333,3.333333,0


In [24]:
# Remove the columns the model does not use, leaving "Diff. %" and "Result"
unused_columns = ["Open", "Close", "Diff.", "Diff. Dec."]
new_df = df.drop(columns=unused_columns)
new_df.tail()

Unnamed: 0,Diff. %,Result
2457,1.73913,1
2458,4.347826,1
2459,0.952381,0
2460,5.217391,1
2461,3.333333,0


In [25]:
# Scatter the binary "Result" against "Diff. %" to see the raw relationship
scatter_options = {
    "x": "Diff. %",
    "y": "Result",
    "title": "Public Wins",
}
win_plot = new_df.hvplot.scatter(**scatter_options)
win_plot

In [26]:
# Build the feature matrix X: take the "Diff. %" values and reshape them
# into a single-column 2-D array (one row per observation), then preview the first rows
diff_pct_values = new_df["Diff. %"].to_numpy()
X = diff_pct_values.reshape(len(diff_pct_values), 1)
X[:5]

array([[ 2.43902439],
       [20.95238095],
       [ 9.52380952],
       [ 2.38095238],
       [ 6.08695652]])

In [27]:
# The dependent variable y is the "Result" column (kept as a pandas Series)
y = new_df.loc[:, "Result"]

In [28]:
# Instantiate an ordinary least-squares model; the intercept is fitted (the default)
# and is read back from the model in later cells
model = LinearRegression(fit_intercept=True)

In [29]:
# Train the model on the single feature X against the target y
model.fit(X=X, y=y)

LinearRegression()

In [30]:
# Show the fitted slope; coef_ is an array holding one coefficient per feature
coefficients = model.coef_
print(f"Model's slope: {coefficients}")

Model's slope: [0.00272671]


In [31]:
# Show the fitted y-intercept (the predicted value when "Diff. %" is 0)
y_intercept = model.intercept_
print(f"Model's y-intercept: {y_intercept}")

Model's y-intercept: 0.4872773167187989


In [32]:
# Write the fitted line out as an equation: intercept plus slope times X
intercept, slope = model.intercept_, model.coef_[0]
print(f"Model's formula: y = {intercept} + {slope}X")

Model's formula: y = 0.4872773167187989 + 0.0027267124416939406X


In [33]:
# Score the same X the model was fitted on to get the in-sample predictions
predicted_y_values = model.predict(X=X)

In [34]:
# Build a new frame from new_df with the model's predictions attached as
# "win_predicted"; assign() returns a new object, so new_df itself is left unchanged
win_df_predicted = new_df.assign(win_predicted=predicted_y_values)

# Display
win_df_predicted

Unnamed: 0,Diff. %,Result,win_predicted
0,2.439024,0,0.493928
1,20.952381,1,0.544408
2,9.523810,1,0.513246
3,2.380952,0,0.493769
4,6.086957,0,0.503875
...,...,...,...
2457,1.739130,1,0.492019
2458,4.347826,1,0.499133
2459,0.952381,0,0.489874
2460,5.217391,1,0.501504


In [35]:
# Draw the predicted values against "Diff. %" as a red line (the fitted regression line)
line_options = dict(x="Diff. %", y="win_predicted", color="red")
best_fit_line = win_df_predicted.hvplot.line(**line_options)
best_fit_line

In [36]:
# Superimpose the original data and the best fit line:
# the `*` operator combines the scatter plot (win_plot) and the red
# prediction line (best_fit_line) into one figure, which is displayed
# as the cell's output
win_plot * best_fit_line

In [43]:
# Diff. (in percent) at which the fitted line is evaluated. Defined once so the
# printed formula, the arithmetic and the result message cannot drift apart.
diff_level = 20

# Display the formula to predict the wins at the chosen diff.
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * {diff_level}")

# Predict the wins by evaluating intercept + slope * diff. by hand.
# The name y_20 is kept for compatibility and reflects the value of diff_level set above.
y_20 = model.intercept_ + model.coef_[0] * diff_level

# Display the prediction
print(f"Predicted win with {diff_level}% diff.: {y_20:.2f}")

Model's formula: y = 0.4872773167187989 + 0.0027267124416939406 * 20
Predicted win with 20% diff.: 0.54


In [38]:
# Diff. percentages to score with the fitted model: 10, 15, 20 and 25
diff_levels = [10, 15, 20, 25]

# Arrange them as a single-column 2-D array, the same layout used for X
X_diff = np.array(diff_levels).reshape(len(diff_levels), 1)

# Display the data
X_diff

array([[10],
       [15],
       [20],
       [25]])

In [39]:
# Run the fitted model on the 10, 15, 20 and 25% diff. inputs
predicted_wins = model.predict(X=X_diff)

In [40]:
# Pair each diff. level with its predicted value in a small summary table.
# X_diff is a single-column 2-D array, so flatten it to 1-D for use as a column.
diff_column = X_diff.ravel()
df_predicted_wins = pd.DataFrame(
    {"diff": diff_column, "predicted_wins": predicted_wins}
)

# Display the data
df_predicted_wins

Unnamed: 0,diff,predicted_wins
0,10,0.514544
1,15,0.528178
2,20,0.541812
3,25,0.555445


In [41]:
# Import metrics from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [42]:
# Evaluate the fit on the same data the model was trained on.
# Error metrics first: MSE and its square root (RMSE, in the units of y)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)

# Goodness of fit, computed two ways (model.score and r2_score print the same value here)
r2 = r2_score(y, predicted_y_values)
score = model.score(X, y)  # sample_weight is left at its default of None

# Standard deviation of the target, printed next to the RMSE for comparison
std = np.std(y)

# Print metrics
print(f"The score is {score}")
print(f"The r2 is {r2}")
print(f"The mean squared error is {mse}")
print(f"The root mean squared error is {rmse}")
print(f"The standard deviation is {std}")

The score is 0.0014479158323682517
The r2 is 0.0014479158323682517
The mean squared error is 0.2496101802663066
The root mean squared error is 0.4996100281882927
The standard deviation is 0.4999721180774431
