This notebook contains the functions needed to run CDkM.

## packages

In [1]:
import numpy as np
import pandas as pd
import csv
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
from sklearn.decomposition import FactorAnalysis
# from factor_analyzer import FactorAnalyzer
from sklearn.cluster import KMeans
## plotting packages
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.validators.scatter.marker import SymbolValidator
import plotly.figure_factory as ff

import matplotlib.pyplot as plt
import matplotlib.cm as cm
get_ipython().run_line_magic('matplotlib', 'inline')

plt.rcParams["figure.figsize"] = (20,10)

## other helpful packages 
from itertools import combinations, permutations, combinations_with_replacement
from collections import Counter
import networkx as nx
# !pip install python-louvain 
import community

## import these dictionaries (find on Github (Translation/Helper), store in directory)
from pid2pos_bref2nba_nba2bref_pid2name_name2pid import *

import decimal

def round_down(value, decimals):
    """
    Truncate a number toward zero at a given number of decimal places.

    INPUT
    value: number accepted by the decimal.Decimal constructor (int, float, str, ...)
    decimals: int; digits to keep after the decimal point

    OUTPUT
    decimal.Decimal holding the truncated value
    """
    exact = decimal.Decimal(value)
    # round() on a Decimal follows the active context, so a temporary
    # ROUND_DOWN context turns it into a truncation
    with decimal.localcontext() as truncating_ctx:
        truncating_ctx.rounding = decimal.ROUND_DOWN
        return round(exact, decimals)
    
import pickle
import math
from scipy import stats

### read in dataframes

Users need to create their own directory (folder) containing the dataframes, which are stored as CSV files on GitHub.

In [None]:
# names of the master category tables; each one lives in <folder>/<name>.csv
allDFs = [
    'MasterClutch',
    'MasterDefense',
    'MasterRebound',
    'MasterPassing',
    'MasterScoring',
    'MasterMisc',
]

# directory that holds the csv files (set by the user)
folder = 'folder_name'

# load every table into a module-level variable named after the table
for dfStr in allDFs:
    globals()[dfStr] = pd.read_csv(f'{folder}/{dfStr}.csv')


### begin CDkM

In [None]:
### helper ###
# bookkeeping columns that are dropped from a table before it is scaled
colDrop = ['Unnamed: 0', 'GP', 'Season', 'comb', 'PLAYER_ID']

def col2str(item):
    """Return the string form of ``item`` (applied element-wise to DataFrame columns)."""
    as_text = str(item)
    return as_text
    
### pre-processing ###
def scale_and_count_DFs(DFs, seasons):
    """
    Load each master DF, keep the requested seasons, standardize it and count
    how often every pair of player-seasons appears in the same DF.

    INPUT
    DFs: list of master DF names (e.g. allDFs); each is read from f'{folder}/{name}.csv'
         using the module-level `folder`
    seasons: list of season labels to keep (compared with the 'Season' column)

    OUTPUT
    Counter(totAppear): dictionary; counts number of times players appear in same DF; used as denominator when building network arc weights.
    idx_DFs: list of placeholder DFs (one per master DF); cols = ['GP', 'Season', 'comb', 'PLAYER_ID']; used for tracking and reporting
    scaled_DFs: list of Standard Scaler'ed arrays (one per master DF) for PCA and other analysis
    """
    # standardize data
    scaler = StandardScaler()

    idx_DFs = []
    scaled_DFs = []
    totAppear = []
    for dfStr in DFs:
        # plain local variables are used on purpose: writing through
        # vars()/locals() inside a function is not supported and the stored
        # entries cannot be read back on Python 3.13+
        raw = pd.read_csv(f'{folder}/{dfStr}.csv')

        # keep only the requested seasons, stacked in the order they were given
        temp = pd.concat(
            [raw[(raw.Season == s)].copy() for s in seasons],
            ignore_index=True,
        )

        # create idx tracker
        idx_DFs.append(temp.loc[:, ['GP', 'Season', 'comb', 'PLAYER_ID']])

        # count how many times players appear to scale counts later;
        # sorting by PLAYER_ID fixes the order of the two ids inside each pair
        ordered = temp.sort_values(by='PLAYER_ID')
        nodes = ordered['PLAYER_ID'].apply(col2str) + '_' + ordered['Season'].apply(col2str)
        totAppear.extend(combinations(nodes, 2))

        # subset into kept columns and create scaled df
        scaled_DFs.append(scaler.fit_transform(temp.drop(columns=colDrop)))
    return Counter(totAppear), idx_DFs, scaled_DFs

def pca_tf(DFs, scaled_DFs, var2keep):
    """
    Reduce every standardized DF with PCA, keeping the smallest number of
    leading components whose cumulative explained variance exceeds var2keep.

    INPUT
    DFs: list of master DF names (used only in the printed report)
    scaled_DFs: list of standardized arrays, aligned with DFs
    var2keep: scalar; fraction of variance to keep for PCA (e.g. 0.99)

    OUTPUT
    allPCA: PCA transformed DFs
    """
    allPCA = []
    for dfStr, scaled in zip(DFs, scaled_DFs):
        # cumulative share of variance explained by the first 1, 2, ... components
        cum_var = np.cumsum(PCA().fit(scaled).explained_variance_ratio_)
        above = np.flatnonzero(cum_var > var2keep)
        if above.size:
            # above[0] is a 0-based position, so the component COUNT is one more
            n_keep = int(above[0]) + 1
        else:
            # threshold never exceeded (e.g. var2keep >= 1): keep everything
            n_keep = len(cum_var)
        print('For {}, {} components were kept.'.format(dfStr + '_scaled', n_keep))

        # then transform the scaled df into its pca df
        allPCA.append(PCA(n_components=n_keep).fit_transform(scaled))
    return allPCA

### CDkM algorithm ###
## now begin clustering for each df, keep track of connections in a dictionary

# create results dataframe after clustering using kMeans
def clusterResults(idx_DFs, scaled_DFs, n_list, pid2pos, clusterMethod='kmeans'):
    """
    Run k-means on every feature table and attach the labels to its tracker DF.

    INPUT
    idx_DFs: list of tracker DFs; four columns, taken in order as GP, szn, comb, pid
    scaled_DFs: list of feature arrays to cluster, aligned with idx_DFs
    n_list: number of clusters per DF; ex: [k]*numdf
    pid2pos: dictionary; '<pid>_<szn>' -> position
    clusterMethod: not read by this function; k-means is always used

    OUTPUT
    allResults: list of resulting cluster DFs (one for each master DF); cols = ['GP', 'szn', 'comb', 'pid', 'cluster', 'pos', 'df']
    """
    out_cols = ['GP', 'szn', 'comb', 'pid', 'cluster', 'pos', 'df']
    allResults = []
    for i, tracker in enumerate(idx_DFs):
        model = KMeans(n_clusters=n_list[i], random_state=13).fit(scaled_DFs[i])

        results = pd.DataFrame(columns=out_cols)
        results[['GP', 'szn', 'comb', 'pid']] = tracker
        results['cluster'] = model.labels_
        # the table name is looked up in the module-level allDFs list
        results['df'] = allDFs[i]

        # position lookup; 'nan' marks player-seasons missing from pid2pos
        for idx, row in results.iterrows():
            pl = str(row.pid) + '_' + str(row.szn)
            results.loc[idx, 'pos'] = pid2pos[pl] if pl in pid2pos else 'nan'

        allResults.append(results)
    return allResults

# count how often each pair occurs in same cluster
def createCountDict(allResults, pid2name):
    """
    Count, over all cluster DFs, how often two player-seasons share a cluster.

    INPUT
    allResults: list of cluster DFs (needs cols 'pid', 'szn', 'cluster')
    pid2name: dictionary; integer player id -> player name

    OUTPUT
    countDict: key: player pair; value: raw # times pair appear in same cluster
    countDictMatch: key: player pair; value: which master DF categories they match in for post analysis
    """
    countDict = {}
    countDictMatch = {}
    for ix, result in enumerate(allResults):
        ordered = result.sort_values(by='pid')
        for cl in ordered.cluster.unique():
            members = ordered[(ordered.cluster == cl)]
            nodes = members['pid'].apply(col2str) + '_' + members['szn'].apply(col2str)
            for pair in combinations(nodes, 2):
                # pairs with a player id missing from pid2name are not counted
                first_pid = int(pair[0].split('_')[0])
                if first_pid not in pid2name:
                    continue
                second_pid = int(pair[1].split('_')[0])
                if second_pid not in pid2name:
                    continue
                # same name on both sides: the pair is skipped
                if pid2name[first_pid] == pid2name[second_pid]:
                    continue
                if pair not in countDict:
                    countDict[pair] = 1
                    countDictMatch[pair] = [allDFs[ix]]
                else:
                    countDict[pair] += 1
                    countDictMatch[pair].append(allDFs[ix])
    return countDict, countDictMatch

# different ways for determining arc weights (for testing, user can try out different schemes)
def calcVals(countDict, numAppear, destStr):
    """
    Convert raw same-cluster counts into arc weights, write them to csv and
    load the csv back.

    INPUT
    countDict: key: player pair; value: # times pair appear in same cluster
    numAppear: key: player pair; value: # times pair appear in same DF (pairs with 0 are skipped)
    destStr: path of the csv file to write

    OUTPUT
    weightDF: scaled arc-weight results read back from destStr;
              cols = ['player1', 'player2', 'val_3', 'val_6', 'raw', 'total']
    """
    # inclusive upper bounds of the buckets; a ratio above the last bound
    # falls into the top bucket
    val3_bounds = (0.2, 0.4, 0.6, 0.8)
    val6_bounds = (0.25, 0.5, 0.75)

    with open(destStr, 'w') as f:
        f.write('player1,player2,val_3,val_6,raw,total\n')
        for key, count in countDict.items():
            appearances = numAppear[key]
            if appearances == 0:
                continue

            # share of common DFs in which the pair was clustered together,
            # truncated to two decimals
            ratio = float(round_down(count / appearances, 2))

            # bucket index = number of bounds the ratio lies strictly above
            v3 = sum(ratio > bound for bound in val3_bounds)
            v6 = sum(ratio > bound for bound in val6_bounds)

            f.write('{},{},{},{},{},{}\n'.format(key[0], key[1], v3, v6, ratio, count))

    # the file is closed by the with-block before it is read back
    return pd.read_csv(destStr)

# using weights found, perform community detection
def getModularity(nodeDF, valDF):
    """
    Build a weighted graph from the arc weights and partition it with Louvain.

    INPUT
    nodeDF: from weightDF, players (2 columns)
    valDF: from weightDF, value chosen (1 column)

    OUTPUT
    partition: output from Louvain algorithm (node -> group)
    G: graph built with networkx
    mod: scalar; modularity of Louvain algorithm results
    numGrps: scalar; |partitions|
    """
    # one row per edge: node1, node2, value
    graphDF = pd.concat([nodeDF, valDF], axis=1)
    graphDF.columns = ['node1', 'node2', 'value']

    # edges are inserted in row order
    G = nx.Graph()
    for node1, node2, value in zip(graphDF['node1'], graphDF['node2'], graphDF['value']):
        G.add_edge(node1, node2, weight=value)

    # partition into clusters
    partition = community.best_partition(G, randomize=False, random_state=13)

    # modularity and number of distinct groups
    mod = community.modularity(partition, G)
    numGrps = len(set(partition.values()))
    return partition, G, mod, numGrps

### execution ###

## run experiments
def experiment(allDFs, szn, n_list, pid2pos_bref, pid2name, tempStr, destStr, valcol, returnPartition=False, var2keep=0.99):
    """
    Run one full CDkM pass: scale -> PCA -> k-means -> pair counts -> arc
    weights -> Louvain community detection.

    INPUT
    allDFs: list of master DF names
    szn: list of season labels to include
    n_list: number of k-means clusters per master DF
    pid2pos_bref: dictionary; '<pid>_<szn>' -> position
    pid2name: dictionary; integer player id -> player name
    tempStr: path handed to dict2DF for the per-pair category matches
    destStr: path of the arc-weight csv written by calcVals
    valcol: arc-weight column used as edge weight (e.g. 'val_3', 'val_6', 'raw')
    returnPartition: bool; also return the partition and the graph
    var2keep: scalar; fraction of variance kept by PCA

    OUTPUT
    (partition, G, mod, numGrp) if returnPartition else (mod, numGrp)
    """
    numAppear, idx_DFs, scaled_DFs = scale_and_count_DFs(allDFs, szn)
    # pca_tf requires the variance threshold
    pcaDFs = pca_tf(allDFs, scaled_DFs, var2keep)
    allResults = clusterResults(idx_DFs, pcaDFs, n_list, pid2pos_bref)
    countDict, countDictMatch = createCountDict(allResults, pid2name)
    # NOTE(review): dict2DF and pidszn2name are not defined in the visible part
    # of this file; presumably they come from the helper import - confirm
    dict2DF(countDictMatch, pidszn2name, tempStr)
    valDF = calcVals(countDict, numAppear, destStr)
    nodeDF = valDF[['player1', 'player2']]
    # use the `valcol` parameter (there is no name `valCol`)
    partition, G, mod, numGrp = getModularity(nodeDF, valDF[valcol])

    if returnPartition:
        return partition, G, mod, numGrp
    return mod, numGrp
    
## USER TODO!!! Run experiments and save experDF to csv (can change to function to create experDF)
# Grid run: for every season set, every k and every arc-weight column one
# experiment() call is made and one row is appended to '<name_of_exper>.csv'.
Yr = ['19_20'] ## short season label used in file names and in the 'szn' column; one per entry of seasonz; change as wanted
seasonz = [['2019-20']] ## list of season lists; each inner list is passed to experiment() as `szn`; change as wanted
valCols = ['val_3','val_6', 'raw'] ## arc-weight columns to try as edge weight; change as wanted
numdf = len(allDFs)
n_lists = [[i]*numdf for i in range(2,151)] ## one list per k (2..150); the same k is used for every master DF; change as wanted
## user define:
name_of_exper = 'tbd' # base name of the summary csv
where2save = 'tbd' # directory for the per-experiment files
with open(f'{name_of_exper}.csv', 'w') as f:
    f.write('szn,k,col,mod,numPartitions\n')
    for ix,szn in enumerate(seasonz):
        for i,n_list in enumerate(n_lists):
            k = n_list[0]
            for col in valCols:
                # tempStr: arc-weight csv; dictStr: per-pair category matches
                tempStr = '{}/{}_{}_{}.csv'.format(where2save,k,col,Yr[ix])
                dictStr = '{}/dict{}_{}_{}.txt'.format(where2save,k,col,Yr[ix])
                # positional order matters: dictStr fills experiment's `tempStr`
                # parameter and tempStr fills its `destStr` parameter
                # NOTE(review): var2keep is passed by keyword - make sure
                # experiment() accepts that keyword
                mod, numGrp = experiment(allDFs, szn, n_list, pid2pos_bref, pid2name, dictStr, tempStr, col, var2keep=0.99, returnPartition=False)
                f.write('{},{},{},{},{}\n'.format(Yr[ix],k,col,mod,numGrp))
# the with-block has already closed f; this call has no further effect
f.close()
    
## modularity frontier
# helper
def makeAndWriteGroupDF(partition, pid2pos, pid2name, year, show=False, saveStr=None):
    """
    Translate a community partition into a DataFrame and optionally print it
    and/or write it to a text file.

    INPUT
    partition: dictionary; node id ('<pid>_<season>') -> group number
    pid2pos: dictionary; node id -> position
    pid2name: dictionary; integer player id -> player name
    year: label shown in the printed group headers
    show: bool; print the members of every group
    saveStr: path of the text file to write; nothing is written when falsy

    OUTPUT
    partitionDF: translates partition (from community) into DF (pandas);
                 cols = ['id', 'group', 'pos', 'name', 'season'];
                 unknown pos / name / season are the string 'nan'
    """
    # built from the items so that an empty partition yields an empty DF
    partitionDF = pd.DataFrame(list(partition.items()), columns=['id', 'group'])

    positions, names, seasons = [], [], []
    for node in partitionDF['id']:
        # look the position up in the pid2pos ARGUMENT, not in a global
        positions.append(pid2pos.get(node, 'nan'))

        player = node.split('_')
        pid = int(player[0])
        if pid in pid2name:
            names.append(pid2name[pid])
            seasons.append(player[1])
        else:
            # name and season are only filled in for known players
            names.append('nan')
            seasons.append('nan')
    partitionDF['pos'] = positions
    partitionDF['name'] = names
    partitionDF['season'] = seasons

    # show
    if show:
        for cl in partitionDF['group'].unique():
            temp = partitionDF[(partitionDF.group == cl)]
            print('----------------------------------')
            print('Group {} ({}) ({} players)'.format(cl, year, temp.shape[0]))
            print('----------------------------------')
            for idx, row in temp.iterrows():
                print(row['name'], row['pos'])
    # write
    if saveStr:
        with open(saveStr, 'w') as f:
            for cl in partitionDF['group'].unique():
                temp = partitionDF[(partitionDF.group == cl)]
                f.write('------------------------\n')
                f.write('Group {} ({} players)\n'.format(cl, temp.shape[0]))
                f.write('------------------------\n')
                for idx, row in temp.iterrows():
                    f.write(row['name'] + ' ' + row['season'] + ',' + row['pos'] + '\n')

    return partitionDF

# calculate percentages for double mod frontier (with non single percent)
def getNonSingles_Mod(year, directory=None):
    """
    Re-run community detection on the saved 'val_6' arc weights for
    k = 3, 5, ..., 81 and collect the statistics plotted by the frontier.

    INPUT
    year: season label used in the saved file names (e.g. '19_20')
    directory: folder holding the '<k>_val_6_<year>.csv' arc-weight files;
               defaults to the module-level `where2save`

    OUTPUT
    storeSingle: list; number of single-member groups for every k
    storeNonSingle: list; percent of groups with more than one member for every k
    storeMod: list; modularity in percent for every k
    storeGrps: list; number of groups for every k
    """
    if directory is None:
        directory = where2save

    storeGrps = []
    storeSingle = []
    storeMod = []
    storeNonSingle = []
    for best_k in range(3, 82, 2):
        ## read best exper
        bestStr = '{}/{}_{}_{}.csv'.format(directory, best_k, 'val_6', year)
        best = pd.read_csv(bestStr)

        partition, G, mod, numGrps = getModularity(best[['player1', 'player2']], best['val_6'])
        storeGrps.append(numGrps)
        storeMod.append(mod*100)

        # group sizes straight from the partition (node -> group)
        groupSizes = Counter(partition.values())
        numSingle = sum(1 for size in groupSizes.values() if size == 1)
        nonSingle = ((numGrps-numSingle)/numGrps)*100
        storeSingle.append(numSingle)
        storeNonSingle.append(nonSingle)
    return storeSingle, storeNonSingle, storeMod, storeGrps

def modFrontier3(experDF, hix, year, storeSingle, storeNonSingle, storeMod, storeGrps):
    """
    Plot the modularity frontier: modularity (%) and share of non-singleton
    groups (%) against the number of groups, each with a fitted curve.

    INPUT
    experDF: output of running many experiments; cols = [szn,k,col,mod,numPartitions]; read in csv created or create function to store experDF
    hix: specified upper bound (number of groups) for graphing modularity frontier
    year: chosen season; filters experDF['szn'] and appears in the title
    storeSingle: not used by the plot; kept so the outputs of getNonSingles_Mod can be passed through
    storeNonSingle: list; percent of non-singleton groups per experiment
    storeMod: list; modularity in percent per experiment
    storeGrps: list; number of groups per experiment

    OUTPUT
    graph (plotly): last step of CDkM; user must analyze and choose best k
    """
    # imported here because neither name is imported at the top of the file
    from itertools import compress
    from scipy.optimize import curve_fit

    cond = hix
    exper_ = experDF[(experDF['szn']==year)]
    df = exper_[(exper_['col']=='val_6')]
    # NOTE(review): temp['k'] colours / labels the experiment points; this
    # assumes the filtered experDF rows line up one-to-one with the filtered
    # store* lists - confirm against how experDF was generated
    temp = df[(df.numPartitions<=hix)]

    # keep only the experiments with at most `hix` groups
    boolArr = np.array(storeGrps) <= cond
    x = list(compress(storeGrps, boolArr))
    y1 = list(compress(storeMod, boolArr))
    y2 = list(compress(storeNonSingle, boolArr))

    def log_func(x,a,b):
        return a+b*np.log(x)

    def exp_func(x, a, b,c,d):
        return a * np.exp(-b * x+c)+d

    # fitted trends: logarithmic for modularity, exponential for non-singletons
    popt1, pcov = curve_fit(log_func, x, y1)
    popt2, pcov = curve_fit(exp_func, x, y2)

    xx = np.linspace(3, cond, 1000)
    yy1 = log_func(xx, *popt1)
    yy2 = exp_func(xx, *popt2)

    fig = go.Figure()
    # non singletons (filtered y so that it matches the filtered x)
    fig.add_trace(go.Scatter(x=x, y=y2, mode='markers', name='Non-Singleton (NS) Groups - Experiment (%)',
        marker=dict(
        symbol='13',
        size=7,
        color=temp['k'], #set color equal to a variable
        colorscale='Portland', # one of plotly colorscales
        showscale=True, colorbar=dict(
            title="k-values")
    ), showlegend=True,hovertext=[str(i) for i in list(temp['k'])]))

    # modularity (filtered y so that it matches the filtered x)
    fig.add_trace(go.Scatter(x=x, y=y1, mode='markers', name='Modularity (Mod) - Experiment (%)',
        marker=dict(
        symbol='19',
        size=7,
        color=temp['k'], #set color equal to a variable
        colorscale='Portland', # one of plotly colorscales
        showscale=True, colorbar=dict(
            title="k-values")
    ), showlegend=True,hovertext=[str(i) for i in list(temp['k'])]))

    # approximations
    fig.add_trace(go.Scatter(x=xx, y=yy2, mode='markers', showlegend=True, name='NS Groups - Approximated (%)',marker=dict(size=2)))
    fig.add_trace(go.Scatter(x=xx, y=yy1, mode='markers', showlegend=True, name='Mod - Approximated (%)',marker=dict(size=2)))
    fig.update_layout(legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99
    ))

    # line
    fig.update_layout(
        shapes=[
            dict(type="line", xref="x", yref="y",
                x0=5, y0=0.3, x1=hix, y1=0.3, line_width=3)]
    )

    # arrows
    fig.add_trace(
        go.Scatter(
            mode='markers',
            x=[5,hix],
            y=[0.3,0.3],
            marker_symbol=[7,8],
            showlegend=False,marker_line_width=2, marker_size=12,marker_line_color="midnightblue", marker_color="midnightblue"
        )
    )

    # bias var
    fig.add_annotation(x=7, y=0.32,
                text='Bias',
                showarrow=False,
                yshift=10)

    fig.add_annotation(x=hix-2, y=0.32,
                text='Variance',
                showarrow=False,
                yshift=10)

    font_ = "Times New Roman"
    fig.update_layout(
        font_family=font_,
        title_font_family=font_,
    )

    # the title uses the `year` argument
    fig.update_layout(title=f'{year} Modularity Frontier', xaxis_title='Number of Macro Clusters', yaxis_title='Percent', title_x=0.5)

    fig.show()
