# Personal Contribution
### Synthetic Feature
In this notebook i create a synthetic feature called 'HOT_HAND' which is a measure of the palyer's confidence.  
We are then using the feature withthe machine learning model used previously in our inference task and hope to get better results

In [10]:
import pandas as pd
import plotly.express as px

import numpy as np # linear algebra
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('fivethirtyeight')

import seaborn as sns

#Inference task imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#Logistic reg
from sklearn.linear_model import LogisticRegression

#decision tree
from sklearn.tree import DecisionTreeClassifier, export_text

#Neural net
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error


In [11]:
shots = pd.read_csv('shot_logs.csv', header=0)
print(shots.shape)

(128069, 21)


This function should return a series of the game cumulative points of the player at the moment of the shot.  
I will be using this for my feature

In [12]:
def cumulative_points(df):
    cumul = np.zeros(128069)
    for i in range(128069):
        num_shot = df.iloc[i,5]
        for j in range(num_shot):
            cumul[i] += df.iloc[i-j,17]
    return cumul

In [13]:
shots['PTS_CUMUL'] = cumulative_points(shots)

The feature

In [14]:
confidence = (shots['PTS_CUMUL']/(shots['SHOT_NUMBER']*3))
shots['HOT_HAND'] = confidence

Next is the inference task withthe feature after some data cleaning

In [15]:
#Cleaning data

df = shots

df = df.drop(['GAME_ID', 'MATCHUP', 'CLOSEST_DEFENDER_PLAYER_ID', 'FGM', 'PTS', 'player_id', 'NEXT_SHOT_RESULT'], axis=1)

df.columns = ['Location', 'W', 'Margin', 'Shot_No', 'Period', 'Game_Clock', 'Shot_Clock', 'Dribbles', 'Touch_Time', 'Shot_Dist', 'Pts_Type', 'Shot_Result', 'Closest_Defender', 'Close_Def_Dist', 'Player_Name', 'PTS_CUMUL', 'HOT_HAND']

df['Shot_Result'] = np.where(df['Shot_Result'] == 'made', 1, 0)

df = df.dropna()

df['W'] = df['W'].map({'W': 1, 'L': 0})

df['Location'] = df['Location'].map({'H': 1, 'A': 0})

df=df[df['Touch_Time']>0]

df['Game_Clock'] = df['Game_Clock'].apply(lambda x: 60*int(x.split(':')[0]) + int(x.split(':')[1]))

df['Game_Clock'] = (720-df['Game_Clock']) + (df['Period']-1)*720

df = df.drop(['Period'], axis=1)

df.to_csv("cleaned_data.csv", index=False)

KeyError: "['NEXT_SHOT_RESULT'] not found in axis"

In [None]:
# Create X and y
X = df.drop(['Margin', 'Shot_Result', 'Player_Name', 'Closest_Defender'], axis=1)
y = df['Shot_Result']

In [None]:
#Logistic Regression

# Split the data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Choose a model and train it (Logistic Regression)
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Make predictions on the testing dataset
y_pred = lr.predict(X_test)

# Evaluate the performance of the model (Accuracy)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Obtain the formula and weights used
intercept = lr.intercept_[0]
coefficients = lr.coef_[0]
formula = f"y = {intercept} + "
for feature, coef in zip(X.columns, coefficients):
    formula += f"({coef} * {feature}) + "
formula = formula[:-3]  # Remove the last '+'
print("Formula:", formula)
print("Weights:", coefficients)

# Feature Importance
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': lr.coef_[0]})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)


In [None]:
#Decision Tree

# Split the data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Choose a model and train it (Decision Trees)
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)


# Print the formula and weights used
tree_rules = export_text(dt, feature_names=X_train.columns.tolist())
#print(tree_rules)

# Make predictions on the testing dataset
y_pred = dt.predict(X_test)

# Evaluate the performance of the model (Accuracy)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Get feature importances
importances = dt.feature_importances_
feature_names = X.columns
feature_importances = pd.DataFrame({'feature': feature_names, 'importance': importances}).sort_values('importance', ascending=False)
print(feature_importances)

# Get feature sensitivities
sensitivities = []
for feature in feature_names:
    X_test_copy = X_test.copy()
    X_test_copy[feature] = X_test_copy[feature].mean()
    y_pred = dt.predict(X_test_copy)
    sensitivity = accuracy_score(y_test, y_pred)
    sensitivities.append(sensitivity)
feature_sensitivities = pd.DataFrame({'feature': feature_names, 'sensitivity': sensitivities}).sort_values('sensitivity', ascending=False)
print(feature_sensitivities)

In [None]:
#Neural Net

# Split the data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create a neural network regressor
regressor = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=100, random_state=42)

# Train the regressor on the training data and track the lowest MSE
lowest_mse = float('inf')
best_regressor = None

mse_scores = []
for i in range(1, 20):
    regressor.partial_fit(X_train_scaled, y_train)
    y_pred = regressor.predict(X_train_scaled)
    mse = mean_squared_error(y_train, y_pred)
    mse_scores.append(mse)
    
    if mse < lowest_mse:
        lowest_mse = mse
        best_regressor = regressor

# Make predictions on the testing data using the best regressor
y_pred_test = best_regressor.predict(X_test_scaled)
mse_test = mean_squared_error(y_test, y_pred_test)

# Plot the MSE scores over iterations
plt.plot(range(1, 20), mse_scores)
plt.xlabel('Iterations')
plt.ylabel('Mean Squared Error')
plt.title('MSE Scores over Iterations')
plt.show()

# Print the lowest MSE and the best regressor's features
print('Lowest MSE:', lowest_mse)
print('Features:', X.columns)
print('Best Regressor:', best_regressor)

The numbers were all better than the initial ones. Values of previous work and comparison are in presentation