In [279]:
import numpy as np
import matplotlib.pyplot as plt

shape = (100, 16)
n = shape[1]

np.random.seed(10)
X = np.random.randint(-5,6, size=shape) # Integers from -5 to 5
y = np.sum(X, axis=1) # Sum of each row
X = X + np.random.normal(0, .1, shape) # Add noise to input


0.015690186207032074
0.047576927853119115


In [236]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression().fit(X, y)
reg.score(X, y), reg.coef_

(0.9993316174977624,
 array([0.98231329, 1.00293103, 1.01089872, 0.99780244, 1.01754801,
        1.01399901, 1.01117047, 1.0116978 , 0.97996713, 1.0128416 ,
        1.00504136, 1.01933667, 0.98243754, 0.98514289, 1.0177332 ,
        1.00559587]))

In [237]:
import itertools
import math
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans

def divisive_shap_approx(X, y, model, beta, gamma_n, gamma_d):
    if X.shape[1] > np.emath.logn(beta, n):
        split_S = partition(X,y,model)
        gamma_n = gamma_n*value(X, y, model)
        gamma_d = gamma_d*sum([value(Xi, yi, model) for Xi, yi in split_S])
        return union([divisive_shap_approx(Xi, 
                                           yi, 
                                           model,
                                           beta,
                                           gamma_n, 
                                           gamma_d) for Xi, yi in split_S])
    else:
        shap = shapley_true(X, y, model)
        shap = shap*gamma_n/gamma_d
        return (X, y, shap)
def divisive_shap_approx_knn(X, y, model, beta, gamma_n, gamma_d,k):
    if X.shape[1] > np.emath.logn(beta, n):
        split_S = partition_knn(X,y,model,k)
        gamma_n = gamma_n*value(X, y, model)
        gamma_d = gamma_d*sum([value(Xi, yi, model) for Xi, yi in split_S])
        return union([divisive_shap_approx_knn(Xi, 
                                           yi, 
                                           model,
                                           beta,
                                           gamma_n, 
                                           gamma_d,k) for Xi, yi in split_S])
    else:
        shap = shapley_true(X, y, model)
        shap = shap*gamma_n/gamma_d
        return (X, y, shap)

def value(X, y, model):
    if X.size == 0:
        return 0
    else:
        reg = model.fit(X, y)
        return reg.score(X, y)
        

def findsubsets(s, n):
    return list(itertools.combinations(s, n))


def shapley_true(X, y, model):
    """Returns true shapley value of each feature (numpy array)"""
    N = X.shape[1]
    shap = np.zeros(N)
    for i in range(N):
        other_features = set(range(N))
        other_features.remove(i)
        subsets = [subset for j in range(N) for subset in findsubsets(other_features, j)]
        for subset in subsets:
            coeff = math.factorial(len(subset))*math.factorial(N - len(subset) - 1)/math.factorial(N)
            shap[i] += coeff*(value(X[:,list(subset) + [i]], y, model)-value(X[:, list(subset)], y, model))
    return np.array(shap)

def partition(X, y, model):
    """Returns list of tuples: [(X1, Y1), (X2, Y2), ...]"""
    N = X.shape[1]
    return [(X[:,:N//2], y), (X[:,N//2:], y)]
def partition_knn(X,y,model,k):

    transposed_X = X.T

    kmeans = KMeans(
        init="random",
        n_clusters=k,
        n_init=10,
        max_iter=300,
        random_state=42
    )
    kmeans.fit(transposed_X)
    labels = kmeans.labels_
    X = transposed_X.T
    partitions = []
    X = pd.DataFrame(X)
    for i in range(k):
        partitions.append((np.array(X.iloc[:,labels == i]),y))
      
    return partitions     
def partition_knn2(X,y,model,k):

    transposed_X = X.T

    kmeans = KMeans(
        init="random",
        n_clusters=k,
        n_init=10,
        max_iter=300,
        random_state=42
    )
    kmeans.fit(transposed_X)
    labels = kmeans.labels_
    X = transposed_X.T
    partitions = []
    X = pd.DataFrame(X)
    for i in range(k):
        partitions.append((np.array(X.iloc[:,labels == i]),y))
      
    return partitions 
def union(S_split):
    """Returns the union of several sets (Xi, Yi, Shapi)"""
    X = S_split[0][0]
    y = S_split[0][1]
    shap = S_split[0][2]
    for Xi, _, shapi in S_split[1:]:
        X = np.concatenate((X, Xi), axis=1)
        shap = np.concatenate((shap, shapi))
    return (X, y, shap)


In [238]:
def main(X, y, model, beta):
    X2, y2, shap = divisive_shap_approx(X, y, model, beta, 1, 1)
    w = sum(shap)
    vN = value(X, y, model)
    if w != vN:
        if w == 0:
            shap = np.array([vN/X.shape[1]]*X.shape[1])
        gamma = vN/w
        shap = shap*gamma
    return (X2, y2, shap)
def main2(X, y, model, beta,k):
    X2, y2, shap = divisive_shap_approx_knn(X, y, model, beta, 1, 1,k)
    w = sum(shap)
    vN = value(X, y, model)
    if w != vN:
        if w == 0:
            shap = np.array([vN/X.shape[1]]*X.shape[1])
        gamma = vN/w
        shap = shap*gamma
    return (X2, y2, shap)

In [239]:
shape = (10000, 64)
n = shape[1]

np.random.seed(10)
X = np.random.randint(-5,6, size=shape) # Integers from -5 to 5
y = np.sum(X, axis=1) # Sum of each row
X = X + np.random.normal(0, .1, shape) # Add noise to input

X, y, shap = main(X, y, LinearRegression(), n**(1/np.sqrt(n)))

plt.hist(shap)
plt.show()


KeyboardInterrupt: 

In [None]:
shape = (10000, 64)
n = shape[1]

np.random.seed(10)
X = np.random.randint(-5,6, size=shape) # Integers from -5 to 5
X[:,0] = X[:,0]*10
y = np.sum(X, axis=1) # Sum of each row
X = X + np.random.normal(0, .1, shape) # Add noise to input

X, y, shap = main(X, y, LinearRegression(), n**(1/np.sqrt(n)))

plt.hist(shap)
plt.show()

In [None]:
shapley_true(X, y, LinearRegression())

In [199]:
import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.read_csv('FIFA 2018 Statistics.csv')
y = (df['Man of the Match'] == "Yes")  # Convert from string "Yes"/"No" to binary
y = [1 if label == True else 0 for label in y]
feature_names = [i for i in df.columns if df[i].dtype in [np.int64, np.int64]]
X = df[feature_names]
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
X

Unnamed: 0,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,Free Kicks,Saves,Pass Accuracy %,Passes,Distance Covered (Kms),Fouls Committed,Yellow Card,Yellow & Red,Red,Goals in PSO
0,5,40,13,7,3,3,6,3,11,0,78,306,118,22,0,0,0,0
1,0,60,6,0,3,3,2,1,25,2,86,511,105,10,0,0,0,0
2,0,43,8,3,3,2,0,1,7,3,78,395,112,12,2,0,0,0
3,1,57,14,4,6,4,5,1,13,3,86,589,111,6,0,0,0,0
4,0,64,13,3,6,4,5,0,14,2,86,433,101,22,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,1,46,11,1,6,4,4,3,24,5,79,479,148,14,1,0,0,0
124,2,43,12,4,3,5,4,1,5,5,88,510,108,11,1,0,0,0
125,0,57,15,5,7,3,5,0,12,2,92,698,110,5,2,0,0,0
126,4,39,8,6,1,1,2,1,14,1,75,271,99,14,2,0,0,0


In [54]:
import csv
import numpy as np
madelon_train_labels = './madelon_train.labels'
with open(madelon_train_labels, newline='') as csvfile:
    reader = csv.reader(csvfile)
    labels = np.array(list(reader)).flatten()
labels = [0 if label == "-1" else 1 for label in labels]
len(labels)

2000

In [97]:
n = X.shape[1]
print("n:"+ str(n))

n:18


In [188]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression().fit(train_X, train_y)
reg.score(train_X, train_y), reg.coef_

(0.3378130885580428,
 array([ 1.76356991e-01,  1.20888667e-03, -5.02994518e-03,  2.03885167e-02,
         9.69129520e-03, -3.51097571e-02,  4.72775816e-02, -2.57022087e-02,
         1.07743030e-02,  1.45169901e-02,  1.88486335e-03, -1.13927992e-04,
        -6.44581519e-03, -1.07653172e-04, -4.59098258e-02, -2.78698857e-01,
        -3.12701995e-01,  1.33264856e-01]))

In [105]:
shapley_true(np.array(train_X), np.array(train_y), reg)

array([0.16999648, 0.00351305, 0.01231289, 0.03640622, 0.0052994 ,
       0.02015739, 0.01889732, 0.0034261 , 0.00753081, 0.00282818,
       0.00419999, 0.00374287, 0.00380925, 0.00176043, 0.01024351,
       0.00525661, 0.01237468, 0.01605793])

In [217]:
shap_true= [0.16999648, 0.00351305, 0.01231289, 0.03640622, 0.0052994 ,
       0.02015739, 0.01889732, 0.0034261 , 0.00753081, 0.00282818,
       0.00419999, 0.00374287, 0.00380925, 0.00176043, 0.01024351,
       0.00525661, 0.01237468, 0.01605793]


In [218]:
import numpy as np
from sklearn.metrics import mean_squared_error

X2, y2, shap = main(np.array(train_X), np.array(train_y), reg, n**(1/np.sqrt(n)))
# Compute the RMSE
mse = mean_squared_error(shap_true, shap)
rmse = np.sqrt(mse)
print(rmse)

0.006346472681021238


In [234]:
X2, y2, shap_knn = main2(np.array(train_X), np.array(train_y), reg, n**(1/np.sqrt(n)),2)
mse_knn = mean_squared_error(shap_true, shap_knn)
rmse_knn = np.sqrt(mse_knn)
print(rmse_knn)

0.04975204204700716


In [177]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans

# Load the dataset into a pandas dataframe
df = train_X

# Transpose the dataframe to cluster columns instead of rows
df = df.T
def normalize(col):
    return (col - col.mean()) / col.std()

df = df.apply(lambda col: normalize(col))
# Create a k-means clustering object with two clusters
kmeans = KMeans(n_clusters=2)

# Fit the k-means model to the data
kmeans.fit(df)

# Get the cluster labels for each column
labels = kmeans.labels_
df = df.T

# Assign each column to the corresponding cluster
df_cluster1 = df.iloc[:, labels == 0]
df_cluster2 = df.iloc[:, labels == 1]
df_cluster1 = df_cluster1.T
kmeans2 = KMeans(n_clusters=2)

# Fit the k-means model to the data
kmeans2.fit(df_cluster1)
labels2 = kmeans2.labels_
df_cluster1 = df_cluster1.T
df_cluster1.iloc[:,labels2==0]

Unnamed: 0,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,Free Kicks,Saves,Fouls Committed,Yellow Card,Yellow & Red,Red,Goals in PSO
88,-0.352370,0.061282,-0.275768,-0.329389,-0.321729,-0.329389,-0.314069,-0.344710,-0.291088,-0.337050,-0.268108,-0.344710,-0.352370,-0.352370,-0.352370
74,-0.370914,0.036280,-0.263353,-0.355548,-0.317133,-0.332499,-0.309450,-0.347865,-0.271036,-0.363231,-0.271036,-0.340182,-0.370914,-0.370914,-0.370914
120,-0.422764,0.052846,-0.203252,-0.373984,-0.337398,-0.361789,-0.386179,-0.422764,-0.227642,-0.398374,-0.361789,-0.410569,-0.434959,-0.434959,-0.434959
45,-0.396336,0.045296,-0.260449,-0.373688,-0.362364,-0.385012,-0.407660,-0.396336,-0.226478,-0.396336,-0.169858,-0.385012,-0.430307,-0.430307,-0.430307
77,-0.354284,0.128716,-0.309003,-0.339190,-0.346737,-0.361831,-0.331644,-0.354284,-0.218440,-0.354284,-0.256175,-0.346737,-0.369378,-0.369378,-0.369378
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9,-0.397673,0.115914,-0.365574,-0.397673,-0.386973,-0.397673,-0.397673,-0.408372,-0.237177,-0.365574,-0.205078,-0.376273,-0.408372,-0.408372,-0.408372
72,-0.420705,0.091328,-0.353333,-0.407231,-0.393756,-0.393756,-0.366807,-0.407231,-0.272485,-0.366807,-0.285960,-0.407231,-0.420705,-0.420705,-0.420705
12,-0.423457,0.132442,-0.231031,-0.359315,-0.348625,-0.370006,-0.391386,-0.370006,-0.198960,-0.402077,-0.316554,-0.412767,-0.423457,-0.423457,-0.423457
107,-0.369608,0.022291,-0.285630,-0.350946,-0.350946,-0.360277,-0.332284,-0.378939,-0.257637,-0.341615,-0.304291,-0.378939,-0.388270,-0.388270,-0.388270
