In [30]:
# Shapley Feature Explanations 

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Weekly TSLA data: 2018 rows are the training set, 2019 rows the test set.
df = pd.read_csv("TSLA_weekly_return_volatility.csv")
df_18, df_19 = (
    df[df["Year"] == year].reset_index() for year in (2018, 2019)
)

# Both features together (train on 2018, test on 2019)
X_train, X_test = (
    yearly[["mean_return", "volatility"]].values for yearly in (df_18, df_19)
)

# Class labels for the two years
Y_train, Y_test = (yearly["label"].values for yearly in (df_18, df_19))

# Mean return only; the one-element column list keeps the array 2-D
x_train_mean, x_test_mean = (
    yearly[["mean_return"]].values for yearly in (df_18, df_19)
)

# Volatility only, again as a 2-D single-column array
x_train_volatility, x_test_volatility = (
    yearly[["volatility"]].values for yearly in (df_18, df_19)
)

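# Note: single-feature inputs must be 2-D, i.e. shape (n_samples, 1). Selecting with a
# one-element column list (df[["col"]]) already gives that shape, so no reshape(-1, 1) is needed.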

### Compute the contributions of mean and volatility for Logistic regression

In [31]:
def log_reg_pred(x_train, y_train, x_test):
    """Fit a default LogisticRegression and predict labels for the test set.

    Parameters
    ----------
    x_train : feature matrix the classifier is fitted on.
    y_train : labels matching the rows of ``x_train``.
    x_test : feature matrix to predict labels for.

    Returns
    -------
    The labels predicted by the fitted classifier for ``x_test``.
    """
    # fit() returns the fitted estimator, so the prediction can be chained.
    return LogisticRegression().fit(x_train, y_train).predict(x_test)

# Predictions for three feature sets, in order:
# all features, mean only (volatility removed), volatility only (mean removed).
log_pred_all, log_pred_1, log_pred_2 = (
    log_reg_pred(train_features, Y_train, test_features)
    for train_features, test_features in (
        (X_train, X_test),
        (x_train_mean, x_test_mean),
        (x_train_volatility, x_test_volatility),
    )
)

# Test accuracy for each of the three prediction sets, in the same order
log_all_accuracy, log_accuracy_1, log_accuracy_2 = (
    accuracy_score(Y_test, predicted)
    for predicted in (log_pred_all, log_pred_1, log_pred_2)
)

# Marginal contribution of a feature = accuracy with all features
# minus accuracy with that feature removed.
log_mrgnl_contribution_volatility = log_all_accuracy - log_accuracy_1
log_mrgnl_contribution_mean = log_all_accuracy - log_accuracy_2

### Compute the contributions of mean and volatility for Euclidean kNN

In [32]:
def knn_pred(k, x_train, y_train, x_test):
    """Predict test labels with a Euclidean k-nearest-neighbours classifier.

    Features are standardized with a StandardScaler fitted on ``x_train``
    only; the same fitted scaler is then applied to ``x_test``.

    Parameters
    ----------
    k : number of neighbours passed to ``KNeighborsClassifier``.
    x_train : feature matrix the scaler and classifier are fitted on.
    y_train : labels matching the rows of ``x_train``.
    x_test : feature matrix to predict labels for.

    Returns
    -------
    The labels predicted by the fitted classifier for the scaled ``x_test``.
    """
    scaler = StandardScaler().fit(x_train)
    train_scaled = scaler.transform(x_train)
    test_scaled = scaler.transform(x_test)

    neighbours = KNeighborsClassifier(n_neighbors=k, metric="euclidean")
    return neighbours.fit(train_scaled, y_train).predict(test_scaled)

# Predictions (k = 5) for three feature sets, in order:
# all features, mean only (volatility removed), volatility only (mean removed).
knn_pred_all, knn_pred_1, knn_pred_2 = (
    knn_pred(5, train_features, Y_train, test_features)
    for train_features, test_features in (
        (X_train, X_test),
        (x_train_mean, x_test_mean),
        (x_train_volatility, x_test_volatility),
    )
)

# Test accuracy for each of the three prediction sets, in the same order
knn_all_accuracy, knn_accuracy_1, knn_accuracy_2 = (
    accuracy_score(Y_test, predicted)
    for predicted in (knn_pred_all, knn_pred_1, knn_pred_2)
)

# Marginal contribution of a feature = accuracy with all features
# minus accuracy with that feature removed.
knn_mrgnl_contribution_volatility = knn_all_accuracy - knn_accuracy_1
knn_mrgnl_contribution_mean = knn_all_accuracy - knn_accuracy_2

### Tables of findings

In [33]:
# Accuracy of each classifier when trained on all features, on mean return
# only, and on volatility only.
columns = ["All features", "Mean for training", "Volatility for training"]
index = ["Logistic regression", "kNN"]
# Build the frame from the values in one step so the columns are numeric,
# rather than starting from an empty (object-dtype) placeholder frame and
# filling it row by row with .loc.
table = pd.DataFrame(
    [
        [log_all_accuracy, log_accuracy_1, log_accuracy_2],
        [knn_all_accuracy, knn_accuracy_1, knn_accuracy_2],
    ],
    index=index,
    columns=columns,
)
table.style.set_caption("Accuracy Contributions")

Unnamed: 0,All festures,Mean for training,Volatility for training
Logistic regression,0.943396,0.962264,0.54717
kNN,0.924528,0.943396,0.415094


In [35]:
# Marginal contribution of each feature (accuracy with all features minus
# accuracy with that feature removed), one row per classifier.
columns = ["Mean", "Volatility"]
index = ["Logistic regression", "kNN"]
# Build the frame from the values in one step so the columns are numeric,
# rather than starting from an empty (object-dtype) placeholder frame and
# filling it row by row with .loc.
table_c = pd.DataFrame(
    [
        [log_mrgnl_contribution_mean, log_mrgnl_contribution_volatility],
        [knn_mrgnl_contribution_mean, knn_mrgnl_contribution_volatility],
    ],
    index=index,
    columns=columns,
)
table_c.style.set_caption("Marginal Contribution of Each Feature")

Unnamed: 0,Mean,Volatility
Logistic regression,0.396226,-0.018868
kNN,0.509434,-0.018868


From the tables above, training and testing on mean return alone gives a higher accuracy than using both features. The marginal contribution of mean is over 30% (about 40% for logistic regression and about 51% for kNN), while the marginal contribution of volatility is about -2%. In other words, when I remove volatility from training and testing, the accuracy increases by about 2%; in contrast, the accuracy drops by over 30% when I remove mean from training and testing.