In [5]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

from sklearn.preprocessing import OneHotEncoder


In [6]:
# The ids do not match between the two datasets, so we will need to do something suspicious...
def extract_numbers(s):
    # Pull out just the set number
    match = re.findall(r'\d+', s)
    return ''.join(match).strip('0') if match else None

In [7]:

def encode_decklists(decklists, vocabulary):
    encoder = OneHotEncoder(categories=[vocabulary], sparse_output=False)
    encoder.fit(np.array(vocabulary).reshape(-1, 1))
    one_hot_vectors = np.array([encoder.transform(np.array(decklist).reshape(-1,1)).sum(axis=0) for decklist in tqdm(decklists)])
    return one_hot_vectors


In [28]:
rotation_name = 'standard_2021'
pokemon_df = pd.read_csv(f'output/{rotation_name}_pokemon_power_level.csv')
trainer_df = pd.read_csv(f'output/{rotation_name}_trainer_power_level.csv')
trainer_df['temp_id'] = trainer_df['name']
combined_df = pd.concat([pokemon_df, trainer_df])
combined_df.head()



Unnamed: 0,id,set,series,publisher,generation,release_date,artist,name,set_num,types,...,regulationMark,ancientTrait,cleaned_attacks,cleaned_abilities,cleaned_rules,2021_exception,temp_id,power_level,tournamentYear,name_card
0,bw7-46,Boundaries Crossed,Black & White,TPCI,Fifth,11/7/2012,Kouki Saitou,Cryogonal,46,['Water'],...,none,none,name: Ice Edge cost: Water Colorless converted...,none,none,False,46-cryogonal,0.003788,2021,
1,xy5-24,Primal Clash,XY,TPCI,Sixth,2/4/2015,Kagemaru Himeno,Magcargo,24,['Fire'],...,none,"{'name': 'Ω Barrier', 'text': 'Whenever your o...",name: Ram cost: Fire Colorless convertedEnergy...,none,none,False,24-magcargo,0.060606,2021,
2,smp-SM107,SM Black Star Promos,Sun & Moon,TPCI,Seventh,2/3/2017,Shin Nagasawa,Dusk Mane Necrozma,SM107,['Metal'],...,none,none,name: Dusk Shot cost: Metal convertedEnergyCos...,none,none,True,107-duskmanenecrozma,0.003788,2021,
3,smp-SM117,SM Black Star Promos,Sun & Moon,TPCI,Seventh,2/3/2017,Shin Nagasawa,Malamar,SM117,['Psychic'],...,none,none,name: Psychic Sphere cost: Psychic Psychic Col...,name: Psychic Recharge text: Once during your ...,none,True,117-malamar,0.011364,2021,
4,smp-SM151,SM Black Star Promos,Sun & Moon,TPCI,Seventh,2/3/2017,Shin Nagasawa,Giratina,SM151,['Psychic'],...,none,none,name: Shadow Impact cost: Psychic Psychic Colo...,"name: Distortion Door text: ""Once during your ...",none,True,151-giratina,0.003788,2021,


In [29]:
decks_df = pd.read_csv('tournaments.csv')
decks_df = decks_df[decks_df['rotation_name']==rotation_name]
# decks_df = decks_df[~decks_df['rotation_name'].isin(['standard_2021', 'standard_2022', 'standard_2023', 'standard_2024'])]
decks_df['temp_id'] = decks_df['id_card'].apply(extract_numbers) + '-' + decks_df['name_card'].str.replace('Pokemon Card Gym Medal', 'heliolisk').str.replace('Palace Book', 'zubat').str.replace(' ', '').str.lower()
decks_df['temp_id'] = decks_df.apply(lambda row: row.name_card if row.type_card=='Trainer' else row.temp_id, axis=1)
decks_df = decks_df[decks_df['name_card'] != 'Mewtwo V-UNION'] # mewtwo not in cards data

In [30]:
decklists = decks_df.groupby(['id_player', 'id_tournament'])['temp_id'].apply(list).reset_index(name='decklist')
decklists

Unnamed: 0,id_player,id_tournament,decklist
0,1440,292,"[191-mewtwo&mew-gx, 8-galarianzapdosv, 57-dede..."
1,1604,289,"[161-jirachi, 32-blacephalon, 52-blacephalon-g..."
2,1804,289,"[18-zacianv, 19-zamazentav, 192-lucario&melmet..."
3,1807,289,"[98-coalossalv, 99-coalossalvmax, 23-slugma, 2..."
4,1833,289,"[168-pikachu&zekrom-gx, 6-vikavoltv, 161-jirac..."
...,...,...,...
61,3410,292,"[33-centiskorchv, 34-centiskorchvmax, 179-volc..."
62,3413,292,"[156-arceus&dialga&palkia-gx, 18-zacianv, 141-..."
63,3414,292,"[87-rapidstrikeurshifuv, 88-rapidstrikeurshifu..."
64,3415,292,"[104-victiniv, 22-victinivmax, 191-mewtwo&mew-..."


In [31]:
card_ids = combined_df['id'] # for later use
vocabulary = combined_df['temp_id'] # temp_id is already unique
decklists_encoded = encode_decklists(decklists['decklist'], vocabulary)

100%|██████████| 66/66 [00:00<00:00, 634.21it/s]


In [32]:
idx_to_card = {i:card for i, card in enumerate(vocabulary)}
card_matrix = np.zeros((len(vocabulary), len(vocabulary)))

card_counts = np.zeros(len(vocabulary)) # store a count of each card occurrence

for decklist in tqdm(decklists_encoded):
    for idx, indicator in enumerate(decklist):
        if indicator!=0: # if the card is in the deck
            card_counts[idx] += 1
            card_matrix[idx] += decklist


100%|██████████| 66/66 [00:00<00:00, 4021.03it/s]


In [33]:
np.all(card_matrix.T == card_matrix)

True

In [34]:
card_synergies = np.nan_to_num(card_matrix/card_counts.reshape(-1,1)/card_counts.reshape(1,-1)) # need to convert nan to 0 because some cards never occur
np.fill_diagonal(card_synergies, 0) # make synergy with self zero

  card_synergies = np.nan_to_num(card_matrix/card_counts.reshape(-1,1)/card_counts.reshape(1,-1)) # need to convert nan to 0 because some cards never occur


In [35]:
usecols = ['id', 'regulationMark', 'hp', 'name', 'types', 'subtypes', 'evolvesFrom', 'evolvesTo', 'weaknesses', 'convertedRetreatCost', 'resistances', 'cleaned_attacks', 'cleaned_abilities', 'cleaned_rules', 'tournamentYear']
export_df = combined_df[usecols]
export_df.loc[:, card_ids] = card_synergies

In [36]:
export_df.to_csv(f'output/{rotation_name}_synergies.csv', index=False)

In [37]:
n = 5
top_n = np.argsort(card_matrix.sum(axis=1))[-5:]
for i in top_n:
    print(idx_to_card[i])

98-crobatv
Reset Stamp
Switch
Boss's Orders
Quick Ball
