In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import collections

import itertools
import math
import random
import warnings
import time
warnings.filterwarnings('ignore')

In [2]:
# Toy league table: the same three clubs observed in two seasons. The 2018
# rows deliberately disagree with the 2019 rows on City/Country for some clubs.
season_2019 = [
    ["Barcelona F.C.", "Barcelona", "Spain", "La Liga", "2019", "1"],
    ["Athletico Madrid", "Madrid", "Spain", "La Liga", "2019", "2"],
    ["Real Madrid", "Madrid", "Spain", "La Liga", "2019", "3"],
]
season_2018 = [
    ["Barcelona F.C.", "Barcelona", "Catalonia", "La Liga", "2018", "1"],
    ["Athletico Madrid", "Capitol", "Espana", "La Liga", "2018", "2"],
    ["Real Madrid", "Madrid", "Spain", "La Liga", "2018", "3"],
]
data = season_2019 + season_2018

# Every column, including Year and Rank, is stored as a string.
df = pd.DataFrame(
    data,
    columns=["Team", "City", "Country", "League", "Year", "Rank"],
)

In [3]:
# Display the input table.
df

Unnamed: 0,Team,City,Country,League,Year,Rank
0,Barcelona F.C.,Barcelona,Spain,La Liga,2019,1
1,Athletico Madrid,Madrid,Spain,La Liga,2019,2
2,Real Madrid,Madrid,Spain,La Liga,2019,3
3,Barcelona F.C.,Barcelona,Catalonia,La Liga,2018,1
4,Athletico Madrid,Capitol,Espana,La Liga,2018,2
5,Real Madrid,Madrid,Spain,La Liga,2018,3


In [4]:
# IDs of the integrity constraints that algo() knows how to enforce:
#   1: Team -> City, 2: League -> Country, 3: City -> Country
constraints = [1, 2, 3]

In [5]:
# Functional dependencies that algo() can enforce, as
# (constraint id, determinant column, dependent column).
# They are always applied in this order, whatever order the caller lists them in.
_FUNCTIONAL_DEPENDENCIES = (
    (1, "Team", "City"),
    (2, "League", "Country"),
    (3, "City", "Country"),
)


def _repair_dependency(frame, lhs, rhs, null="NULL"):
    """Enforce the dependency ``lhs -> rhs`` on ``frame`` in place.

    Rows are visited in index order. A row is repaired when another row with
    the same ``lhs`` value carries a different, non-null ``rhs`` value; its
    ``rhs`` cell is then overwritten with the most common non-null ``rhs``
    value among the rows sharing that ``lhs`` value (ties go to the value
    that appears first). Rows whose ``lhs`` or ``rhs`` equals ``null`` are
    skipped. Because writes happen immediately, earlier repairs influence
    the vote for later rows.
    """
    for label, row in frame.iterrows():
        lhs_value, rhs_value = row[lhs], row[rhs]
        if lhs_value == null or rhs_value == null:
            continue

        # rhs values of every row that shares this row's lhs value.
        group = frame.loc[frame[lhs] == lhs_value, rhs]
        conflicts = (group != rhs_value) & (group != null)
        if not conflicts.any():
            continue

        votes = collections.Counter(value for value in group if value != null)
        if votes:
            # Label-based write: chained indexing (frame[rhs][label] = ...)
            # is not guaranteed to update the frame and is a silent no-op
            # when pandas Copy-on-Write is active.
            frame.loc[label, rhs] = votes.most_common(1)[0][0]


def algo(df, constraints):
    """Return a repaired copy of ``df`` that enforces the selected constraints.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns Team, City, League and Country.
        Missing values are represented by the string "NULL". The input frame
        is never modified.
    constraints : container of int
        Constraint IDs to enforce: 1 (Team -> City), 2 (League -> Country),
        3 (City -> Country). IDs not listed are skipped; unknown IDs are
        ignored.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with violating cells replaced by the majority value
        of their group.
    """
    df_c = df.copy()
    for constraint_id, lhs, rhs in _FUNCTIONAL_DEPENDENCIES:
        if constraint_id in constraints:
            _repair_dependency(df_c, lhs, rhs)
    return df_c

In [6]:
# Repair a copy of the table with all three constraints enabled.
algo(df, [1, 2, 3])

AttributeError: 'Series' object has no attribute 'County'

In [7]:
# The original frame is displayed again; algo() works on a copy of it.
df

Unnamed: 0,Team,City,Country,League,Year,Rank
0,Barcelona F.C.,Barcelona,Spain,La Liga,2019,1
1,Athletico Madrid,Madrid,Spain,La Liga,2019,2
2,Real Madrid,Madrid,Spain,La Liga,2019,3
3,Barcelona F.C.,Barcelona,Catalonia,La Liga,2018,1
4,Athletico Madrid,Capitol,Espana,La Liga,2018,2
5,Real Madrid,Madrid,Spain,La Liga,2018,3


In [10]:
# For every subset of the constraints, record whether repairing with only
# that subset changes the value of the cell of interest.
cell = ("Country", 4)  # (column, row label)
target_col, target_row = cell
original_value = df[target_col][target_row]

# All subsets, smallest first: (), (1,), (2,), ..., (1, 2, 3).
all_subsets = itertools.chain.from_iterable(
    itertools.combinations(constraints, size)
    for size in range(len(constraints) + 1)
)

results = {}
for comb in all_subsets:
    df_repair = algo(df, comb)
    results[comb] = original_value != df_repair[target_col][target_row]
print(results)

{(): False, (1,): False, (2,): True, (3,): False, (1, 2): True, (1, 3): True, (2, 3): True, (1, 2, 3): True}


In [11]:
# Exact Shapley value of each constraint for the "cell was repaired" outcome
# stored in `results` (keyed by the tuple of constraint IDs that were applied).
n_constraints = len(constraints)
for i in constraints:
    shapley_value = 0
    for comb in results:
        if i in comb:
            continue

        # Key of the coalition comb + {i}. itertools.combinations emits tuples
        # in the order of `constraints`, so filtering `constraints` rebuilds
        # the exact key. A missing coalition now raises KeyError instead of
        # silently reusing whichever `comb_i` an earlier iteration left behind.
        comb_i = tuple(c for c in constraints if c == i or c in comb)

        # Shapley weight |S|! * (n - |S| - 1)! / n! for coalition S = comb.
        factor = math.factorial(len(comb)) * math.factorial(n_constraints - len(comb) - 1)
        factor /= math.factorial(n_constraints)

        # int() keeps the subtraction valid if the stored flags are numpy bools.
        shapley_value += factor * (int(results[comb_i]) - int(results[comb]))
    print(i, shapley_value*100)

1 16.666666666666664
2 66.66666666666666
3 16.666666666666664


In [12]:
def random_combination(iterable, r, rng=None):
    """Random selection from itertools.combinations(iterable, r).

    Parameters
    ----------
    iterable : iterable
        Population to draw from; it is materialised into a tuple.
    r : int
        Number of elements to pick, with 0 <= r <= len(population).
    rng : random.Random, optional
        Generator to draw from. Defaults to the shared generator of the
        ``random`` module, so ``random.seed(...)`` makes the result
        reproducible.

    Returns
    -------
    tuple
        ``r`` elements of the population, in their original relative order.

    Raises
    ------
    ValueError
        If ``r`` is negative or larger than the population.
    """
    pool = tuple(iterable)
    n = len(pool)
    # Use one long-lived generator. Building a new Random seeded with
    # time.time() on every call made the sampling impossible to seed, and
    # calls landing on the same clock tick returned identical combinations.
    source = random if rng is None else rng
    indices = sorted(source.sample(range(n), r))
    return tuple(pool[i] for i in indices)

In [13]:
def calc_cell_shap(cell_shap, cell_repair, repeats=10, cols=None, target_value="Spain"):
    """Monte Carlo estimate of one cell's contribution to a repair.

    In each repetition a random subset of the other cells is masked with
    "NULL", the table is repaired with and without ``cell_shap`` present,
    and the difference in whether ``cell_repair`` ends up equal to
    ``target_value`` is accumulated.

    Reads the module-level ``df`` and ``constraints`` and calls ``algo`` and
    ``random_combination``.

    Parameters
    ----------
    cell_shap : tuple
        ``(row label, column)`` of the cell whose contribution is measured.
    cell_repair : tuple
        ``(row label, column)`` of the cell whose repair is being explained.
    repeats : int
        Number of random coalitions to sample.
    cols : sequence of str, optional
        Columns whose cells may be masked; all columns of ``df`` when None.
    target_value : str
        Value that ``cell_repair`` must take after the repair for it to count
        as successful.

    Returns
    -------
    int
        Sum over the repetitions of (repaired with the cell) minus (repaired
        without the cell). It is a raw count and is not divided by ``repeats``.

    Raises
    ------
    ValueError
        If either cell is not among the candidate cells, or both cells are
        the same.
    """
    i_shap, col_shap = cell_shap[0], cell_shap[1]
    i_repair, col_repair = cell_repair[0], cell_repair[1]

    # Candidate cells for masking: everything except the two cells of interest.
    cells = list(itertools.product(df.index, df.columns if cols is None else cols))
    cells.remove((i_repair, col_repair))
    cells.remove((i_shap, col_shap))

    shap = 0
    for _ in range(repeats):
        df_c = df.copy()

        # Coalition size ~ Binomial(len(cells), 0.5), then a random subset of
        # that size is masked out.
        m = np.random.binomial(len(cells), 0.5)
        for row_label, column in random_combination(cells, m):
            # .at writes to the frame itself; chained indexing
            # (df_c[column][row_label] = ...) can be a silent no-op under
            # pandas Copy-on-Write.
            df_c.at[row_label, column] = "NULL"

        df_repair = algo(df_c, constraints)
        is_repair_with = df_repair.at[i_repair, col_repair] == target_value

        # Same coalition, now with the cell under study masked as well.
        df_c.at[i_shap, col_shap] = "NULL"

        df_repair = algo(df_c, constraints)
        is_repair_without = df_repair.at[i_repair, col_repair] == target_value

        shap += int(is_repair_with) - int(is_repair_without)

    return shap

In [30]:
# Columns whose cells take part in the cell-level analysis: every column
# of df except Rank and Year.
cols = df.columns.tolist()
for unused in ("Rank", "Year"):
    cols.remove(unused)

In [31]:
# Cell whose repair is being explained: the Country value of row 4.
cell_repair = (4, "Country")

# Every other (row label, column) cell is scored as a potential contributor.
cells = list(itertools.product(df.index, cols))
cells.remove(cell_repair)

start = time.time()
results = {
    cell: calc_cell_shap(cell, cell_repair, repeats=20, cols=cols)
    for cell in cells
}
print(time.time() - start)

27.173850059509277


In [24]:
# Cells ordered from largest to smallest accumulated contribution; cells
# with equal scores keep their original relative order.
dict(sorted(results.items(), key=lambda item: item[1], reverse=True))

{(4, 'League'): 13,
 (1, 'Country'): 4,
 (2, 'Country'): 4,
 (0, 'League'): 3,
 (1, 'League'): 3,
 (4, 'Team'): 3,
 (4, 'City'): 2,
 (0, 'Country'): 1,
 (1, 'Team'): 1,
 (2, 'City'): 1,
 (2, 'League'): 1,
 (0, 'Team'): 0,
 (0, 'City'): 0,
 (0, 'Year'): 0,
 (0, 'Rank'): 0,
 (1, 'City'): 0,
 (1, 'Year'): 0,
 (1, 'Rank'): 0,
 (2, 'Team'): 0,
 (2, 'Year'): 0,
 (2, 'Rank'): 0,
 (3, 'Team'): 0,
 (3, 'City'): 0,
 (3, 'Country'): 0,
 (3, 'League'): 0,
 (3, 'Year'): 0,
 (3, 'Rank'): 0,
 (4, 'Year'): 0,
 (4, 'Rank'): 0,
 (5, 'Team'): 0,
 (5, 'Country'): 0,
 (5, 'League'): 0,
 (5, 'Year'): 0,
 (5, 'Rank'): 0,
 (5, 'City'): -1}