In [41]:
import numpy as np
import pandas as pd
from itertools import product
from helper_functions.clean_transitions import clean_transition_column

In [2]:
# Load the full table of validated play-by-play transitions (one row per
# transition, with the source game's filename) from the Excel workbook.
all_transitions_file = 'ALL_VALID_TRANSITIONS.xlsx'
data = pd.read_excel(all_transitions_file)

In [3]:
# Preview the first five rows to check the columns that were loaded.
data.head()

Unnamed: 0,Transition,Period,filename,is_valid
0,"('Bi0', 'Ai3')",1,6197002_Play-By-Play for Colgate vs Weber St.....,True
1,"('Ai3', 'Ar0')",1,6197002_Play-By-Play for Colgate vs Weber St.....,True
2,"('Ar0', 'Ar0')",1,6197002_Play-By-Play for Colgate vs Weber St.....,True
3,"('Ar0', 'Br0')",1,6197002_Play-By-Play for Colgate vs Weber St.....,True
4,"('Br0', 'Ar0')",1,6197002_Play-By-Play for Colgate vs Weber St.....,True


In [4]:
# Test case: a lopsided game - Marquette vs DePaul on 2/21/2024, which Marquette won 105-71.

In [32]:
# Keep only the rows whose source filename mentions Marquette, as an
# independent copy with a fresh 0..n-1 index.
is_marquette_game = data['filename'].str.contains('Marquette')
marq_data = data.loc[is_marquette_game].copy().reset_index(drop=True)

In [33]:
# Sanity check: is the number of distinct Marquette game files reasonable?
marq_data['filename'].nunique(dropna=False)

19

In [34]:
# Force Marquette to be team A and DePaul to be team B in every game.

In [35]:
# Display the Marquette subset (the notebook truncates the middle rows).
marq_data

Unnamed: 0,Transition,Period,filename,is_valid
0,"('Bi0', 'Ai2')",1,6197093_Play-By-Play for Marquette vs UCLA.xlsx,True
1,"('Ai2', 'Br0')",1,6197093_Play-By-Play for Marquette vs UCLA.xlsx,True
2,"('Br0', 'Ar0')",1,6197093_Play-By-Play for Marquette vs UCLA.xlsx,True
3,"('Ar0', 'Bi3')",1,6197093_Play-By-Play for Marquette vs UCLA.xlsx,True
4,"('Bi3', 'Ar0')",1,6197093_Play-By-Play for Marquette vs UCLA.xlsx,True
...,...,...,...,...
3742,"('Bi3', 'Bf0')",2,6200793_Play-By-Play for UConn vs Marquette.xlsx,True
3743,"('Bf0', 'Br0')",2,6200793_Play-By-Play for UConn vs Marquette.xlsx,True
3744,"('Br0', 'Ar0')",2,6200793_Play-By-Play for UConn vs Marquette.xlsx,True
3745,"('Ar0', 'Ai0')",2,6200793_Play-By-Play for UConn vs Marquette.xlsx,True


In [36]:
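# Primitive method: iterate through the unique games. Wherever "Marquette" comes after "vs"
# in the filename, operate on the Transition column of that game and swap the team labels
# (A -> B and B -> A, e.g. by replacing "A" with "TEMP", "B" with "A", then "TEMP" with "B").

# Note: this was hard to get working at first. The even more primitive fallback is to
# iterate through all rows one at a time (kept commented out in a later cell).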

In [37]:
# Relabel every game in which "Marquette" appears after "vs" in the filename
# (i.e. Marquette is listed second) so that Marquette is always team A.
#
# The row mask is computed once for all such games, and the labels are swapped
# in a single pass: str.translate maps A->B and B->A simultaneously, so no
# temporary placeholder is needed and a value that happens to contain the
# placeholder text can never be corrupted.
#
# NOTE: the swap is its own inverse, so running this cell twice on the same
# marq_data undoes the relabelling. Re-create marq_data before re-running.
marq_listed_second = (
    marq_data['filename'].str.find('Marquette') > marq_data['filename'].str.find('vs')
)
marq_data.loc[marq_listed_second, 'Transition'] = (
    marq_data.loc[marq_listed_second, 'Transition'].str.translate(str.maketrans('AB', 'BA'))
)

In [30]:
# for i in range(len(marq_data)):
#     curr_name = marq_data['filename'][i]
#     if curr_name.find('Marquette')>curr_name.find('vs'):
#         string_copy = marq_data['Transition'][i]
#         new_string = string_copy.replace('A','TEMP').replace('B','A').replace('TEMP','B')
#         marq_data.loc[i, 'Transition']=new_string

In [42]:
# Run the project's cleaning helper over the relabelled Transition column and
# store the result back on the frame.
# NOTE(review): presumably the helper turns each string such as "('Bi0', 'Ai2')"
# into a (start, end) pair, since later cells index the values with [0]/[1] -
# confirm in helper_functions.clean_transitions.
marq_transitions_cleaned = clean_transition_column(marq_data['Transition'])
marq_data['Transition'] = marq_transitions_cleaned

In [40]:
# Every state code is a team letter (A or B) followed by one of nine suffixes:
# a lowercase kind letter (i, r or f) and a digit. The order is all of team A's
# states first, then team B's, each as i0-i3, r0, f0-f3.
state_suffixes = ('i0', 'i1', 'i2', 'i3', 'r0', 'f0', 'f1', 'f2', 'f3')
possible_states = [
    team_letter + suffix
    for team_letter in ('A', 'B')
    for suffix in state_suffixes
]

# All ordered (start, end) pairs of states, including self-transitions.
possible_transitions = list(product(possible_states, repeat=2))

In [56]:

# Count how many rows (non-null Period values) were observed for each distinct
# transition, then turn the grouped result back into a two-column frame.
marq_period_counts = marq_data.groupby('Transition')['Period'].count()
transitions_agg_marq = marq_period_counts.reset_index()

In [57]:
# Add a zero-count row for every possible transition that was never observed,
# so that the pivot in a later cell covers the complete set of states.
#
# Membership is tested against a set built once, and all missing rows are
# appended with a single concat instead of growing the frame inside the loop.
# ignore_index=True keeps the row index unique (appending one-row frames one at
# a time left every added row labelled 0).
observed_marq = set(transitions_agg_marq['Transition'])
missing_marq = [[t, 0] for t in possible_transitions if t not in observed_marq]
if missing_marq:  # nothing to append when every transition was observed
    transitions_agg_marq = pd.concat(
        [transitions_agg_marq, pd.DataFrame(missing_marq, columns=['Transition', 'Period'])],
        ignore_index=True,
    )

In [58]:
# The 'Period' column now holds occurrence counts, so give it an honest name.
marq_column_renames = {'Period': 'Count'}
transitions_agg_marq = transitions_agg_marq.rename(columns=marq_column_renames)

In [59]:
# Display the per-transition counts, including the zero-count rows added above.
transitions_agg_marq

Unnamed: 0,Transition,Count
0,"(Af0, Af0)",33
1,"(Af0, Af1)",90
2,"(Af0, Ar0)",1
3,"(Af0, Bi0)",1
4,"(Af0, Bi1)",25
...,...,...
0,"(Bf3, Br0)",0
0,"(Bf3, Bf0)",0
0,"(Bf3, Bf1)",0
0,"(Bf3, Bf2)",0


In [60]:
# Split each (start, end) transition pair into two separate columns. The values
# are assigned as plain lists, i.e. by position rather than by index label.
marq_pairs = transitions_agg_marq['Transition']
transitions_agg_marq['Starting_State'] = marq_pairs.map(lambda pair: pair[0]).tolist()
transitions_agg_marq['Ending_State'] = marq_pairs.map(lambda pair: pair[1]).tolist()

In [61]:
# Keep only the three columns needed to build the matrix, in this order.
marq_matrix_columns = ['Starting_State', 'Ending_State', 'Count']
transitions_agg_marq = transitions_agg_marq.loc[:, marq_matrix_columns]

In [62]:
# Reshape the long table into a square grid of counts: one row per starting
# state, one column per ending state. Any empty cell becomes 0.
marq_count_grid = transitions_agg_marq.pivot(
    index='Starting_State',
    columns='Ending_State',
    values='Count',
)
transition_matrix_marq = marq_count_grid.fillna(0)

In [63]:
# Convert counts to row-wise proportions by dividing every row by its own total.
marq_row_totals = transition_matrix_marq.sum(axis=1)
transition_matrix_marq = transition_matrix_marq.div(marq_row_totals, axis='index')

In [64]:
# A starting state that was never observed has a row total of 0, so the division
# above left NaN (0/0) in that row; show those entries as 0 instead.
transition_matrix_marq = transition_matrix_marq.fillna(value=0)

In [65]:
# Display the row-normalized Marquette transition matrix.
transition_matrix_marq

Ending_State,Af0,Af1,Af2,Af3,Ai0,Ai1,Ai2,Ai3,Ar0,Bf0,Bf1,Bf2,Bf3,Bi0,Bi1,Bi2,Bi3,Br0
Starting_State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Af0,0.2,0.545455,0.0,0.0,0.0,0.0,0.0,0.0,0.006061,0.0,0.0,0.0,0.0,0.006061,0.151515,0.0,0.0,0.090909
Af1,0.0,0.042105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.768421,0.0,0.0,0.189474
Af2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.636364,0.0,0.0,0.318182
Af3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ai0,0.068027,0.0,0.006803,0.0,0.047619,0.0,0.0,0.0,0.340136,0.0,0.0,0.0,0.0,0.027211,0.0,0.14966,0.108844,0.251701
Ai1,0.084906,0.0,0.0,0.0,0.04717,0.0,0.0,0.0,0.09434,0.0,0.0,0.0,0.0,0.009434,0.0,0.188679,0.122642,0.45283
Ai2,0.058182,0.0,0.021818,0.0,0.076364,0.0,0.0,0.0,0.12,0.0,0.0,0.0,0.0,0.010909,0.0,0.214545,0.109091,0.389091
Ai3,0.082278,0.0,0.031646,0.0,0.056962,0.0,0.0,0.0,0.088608,0.0,0.0,0.0,0.0,0.012658,0.0,0.202532,0.082278,0.443038
Ar0,0.0898,0.0,0.011086,0.0,0.059867,0.0,0.0,0.0,0.09867,0.0,0.0,0.0,0.0,0.012195,0.0,0.231707,0.107539,0.389135
Bf0,0.0,0.0,0.0,0.0,0.0,0.137255,0.0,0.0,0.071895,0.176471,0.614379,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
# Keep only the rows whose source filename mentions DePaul, as an independent
# copy with a fresh 0..n-1 index.
is_depaul_game = data['filename'].str.contains('DePaul')
dep_data = data.loc[is_depaul_game].copy().reset_index(drop=True)

In [67]:
# Sanity check: number of distinct DePaul game files.
dep_data['filename'].nunique(dropna=False)

18

In [68]:
# Relabel every game in which "DePaul" appears before "vs" in the filename
# (i.e. DePaul is listed first) so that DePaul is always team B.
#
# The row mask is computed once for all such games, and the labels are swapped
# in a single pass: str.translate maps A->B and B->A simultaneously, so no
# temporary placeholder is needed and a value that happens to contain the
# placeholder text can never be corrupted.
#
# NOTE: the swap is its own inverse, so running this cell twice on the same
# dep_data undoes the relabelling. Re-create dep_data before re-running.
dep_listed_first = (
    dep_data['filename'].str.find('DePaul') < dep_data['filename'].str.find('vs')
)
dep_data.loc[dep_listed_first, 'Transition'] = (
    dep_data.loc[dep_listed_first, 'Transition'].str.translate(str.maketrans('AB', 'BA'))
)

In [69]:
# Run the project's cleaning helper over the relabelled Transition column and
# store the result back on the frame.
# NOTE(review): presumably the helper turns each string such as "('Bi0', 'Ai2')"
# into a (start, end) pair, since later cells index the values with [0]/[1] -
# confirm in helper_functions.clean_transitions.
dep_transitions_cleaned = clean_transition_column(dep_data['Transition'])
dep_data['Transition'] = dep_transitions_cleaned

In [76]:

# Count how many rows (non-null Period values) were observed for each distinct
# transition, then turn the grouped result back into a two-column frame.
dep_period_counts = dep_data.groupby('Transition')['Period'].count()
transitions_agg_dep = dep_period_counts.reset_index()

In [77]:
# Add a zero-count row for every possible transition that was never observed,
# so that the pivot in a later cell covers the complete set of states.
#
# Membership is tested against a set built once, and all missing rows are
# appended with a single concat instead of growing the frame inside the loop.
# ignore_index=True keeps the row index unique (appending one-row frames one at
# a time left every added row labelled 0).
observed_dep = set(transitions_agg_dep['Transition'])
missing_dep = [[t, 0] for t in possible_transitions if t not in observed_dep]
if missing_dep:  # nothing to append when every transition was observed
    transitions_agg_dep = pd.concat(
        [transitions_agg_dep, pd.DataFrame(missing_dep, columns=['Transition', 'Period'])],
        ignore_index=True,
    )

In [78]:
# The 'Period' column now holds occurrence counts, so give it an honest name.
dep_column_renames = {'Period': 'Count'}
transitions_agg_dep = transitions_agg_dep.rename(columns=dep_column_renames)

In [79]:
# Split each (start, end) transition pair into two separate columns. The values
# are assigned as plain lists, i.e. by position rather than by index label.
dep_pairs = transitions_agg_dep['Transition']
transitions_agg_dep['Starting_State'] = dep_pairs.map(lambda pair: pair[0]).tolist()
transitions_agg_dep['Ending_State'] = dep_pairs.map(lambda pair: pair[1]).tolist()

In [80]:
# Keep only the three columns needed to build the matrix, in this order.
dep_matrix_columns = ['Starting_State', 'Ending_State', 'Count']
transitions_agg_dep = transitions_agg_dep.loc[:, dep_matrix_columns]

In [81]:
# Reshape the long table into a square grid of counts: one row per starting
# state, one column per ending state. Any empty cell becomes 0.
dep_count_grid = transitions_agg_dep.pivot(
    index='Starting_State',
    columns='Ending_State',
    values='Count',
)
transition_matrix_dep = dep_count_grid.fillna(0)

In [84]:
# Convert counts to row-wise proportions by dividing every row by its own total.
dep_row_totals = transition_matrix_dep.sum(axis=1)
transition_matrix_dep = transition_matrix_dep.div(dep_row_totals, axis='index')

In [85]:
# A starting state that was never observed has a row total of 0, so the division
# above left NaN (0/0) in that row; show those entries as 0 instead.
transition_matrix_dep = transition_matrix_dep.fillna(value=0)

In [86]:
# Display the row-normalized DePaul transition matrix.
transition_matrix_dep

Ending_State,Af0,Af1,Af2,Af3,Ai0,Ai1,Ai2,Ai3,Ar0,Bf0,Bf1,Bf2,Bf3,Bi0,Bi1,Bi2,Bi3,Br0
Starting_State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Af0,0.180851,0.601064,0.0,0.0,0.0,0.0,0.0,0.0,0.031915,0.0,0.0,0.0,0.0,0.0,0.132979,0.0,0.0,0.053191
Af1,0.008547,0.034188,0.0,0.0,0.0,0.0,0.0,0.0,0.042735,0.0,0.0,0.0,0.0,0.0,0.760684,0.0,0.0,0.153846
Af2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.833333,0.0,0.0,0.166667
Af3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Ai0,0.065041,0.0,0.02439,0.0,0.065041,0.0,0.0,0.0,0.325203,0.0,0.0,0.0,0.0,0.04065,0.0,0.154472,0.089431,0.235772
Ai1,0.084746,0.0,0.0,0.0,0.016949,0.0,0.0,0.0,0.161017,0.0,0.0,0.0,0.0,0.016949,0.0,0.194915,0.135593,0.389831
Ai2,0.066421,0.0,0.02583,0.00369,0.055351,0.0,0.0,0.0,0.088561,0.0,0.0,0.0,0.0,0.01476,0.0,0.206642,0.136531,0.402214
Ai3,0.126214,0.0,0.029126,0.0,0.029126,0.0,0.0,0.0,0.126214,0.0,0.0,0.0,0.0,0.009709,0.0,0.145631,0.135922,0.398058
Ar0,0.118451,0.0,0.013667,0.0,0.056948,0.0,0.0,0.0,0.123007,0.0,0.0,0.0,0.0,0.01139,0.0,0.248292,0.107062,0.321185
Bf0,0.0,0.0,0.0,0.0,0.0,0.128866,0.0,0.0,0.123711,0.201031,0.546392,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
# First approach: combine the two matrices by discrete "beneficial" columns,
# then re-normalize (done in the next cell).
# Nine ending-state columns are taken from the Marquette matrix and the other
# nine from the DePaul matrix; together they cover all eighteen states once.
marq_beneficial_columns = ['Af0', 'Af1', 'Af2', 'Af3', 'Ai0', 'Ar0', 'Bi1', 'Bi2', 'Bi3']
dep_beneficial_columns = ['Bf0', 'Bf1', 'Bf2', 'Bf3', 'Bi0', 'Br0', 'Ai1', 'Ai2', 'Ai3']

marq_sub_matrix = transition_matrix_marq.loc[:, marq_beneficial_columns].copy()
dep_sub_matrix = transition_matrix_dep.loc[:, dep_beneficial_columns].copy()

# Place the two column blocks side by side, aligned on the starting-state index.
# NOTE(review): the resulting column order follows the two lists above, which is
# not the same order as the row index - confirm consumers look up by label.
combined_transitions_1 = pd.concat([marq_sub_matrix, dep_sub_matrix], axis=1)

In [102]:
# Re-normalize the stitched-together matrix so each row sums to 1 again (the
# columns came from two differently normalized matrices).
# A row whose selected columns are all 0 has a total of 0, and 0/0 yields NaN;
# replace those with 0, matching how the per-team matrices treat starting
# states that were never observed, so no NaN is written to the export.
v1_row_totals = combined_transitions_1.sum(axis=1)
combined_transitions_1 = combined_transitions_1.div(v1_row_totals, axis=0).fillna(0)

In [103]:
# Save the first combined matrix (Marquette as team A, DePaul as team B).
v1_output_file = 'Marquette_A_DePaul_B_Transitions_v1.xlsx'
combined_transitions_1.to_excel(v1_output_file)

In [104]:
# Second approach: take the element-wise mean of the two teams' matrices
# (aligned on row and column labels).
combined_transitions_2 = transition_matrix_marq.add(transition_matrix_dep).div(2)

In [105]:
# Re-normalize the averaged matrix so each row sums to 1. Where only one of the
# two teams' matrices had data for a starting state, the averaged row sums to
# 0.5 and is rescaled here.
# Where neither had data the row total is 0, and 0/0 yields NaN; replace those
# with 0, matching how the per-team matrices treat starting states that were
# never observed, so no NaN is written to the export.
v2_row_totals = combined_transitions_2.sum(axis=1)
combined_transitions_2 = combined_transitions_2.div(v2_row_totals, axis=0).fillna(0)

In [106]:
# Save the second (averaged) combined matrix.
v2_output_file = 'Marquette_A_DePaul_B_Transitions_v2.xlsx'
combined_transitions_2.to_excel(v2_output_file)