In [1]:
import pandas as pd
import numpy as np
import requests
import json

In [2]:
# Create the local ./data folder used by the cells below (no-op if it already exists)
from pathlib import Path
data_dir = Path("./data")
data_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Download the full card catalogue from NetrunnerDB and cache the raw JSON in ./data/cards.
# The request is made and checked *before* the file is opened, so a failed or
# timed-out download never truncates an existing cache file.
r = requests.get('https://netrunnerdb.com/api/2.0/public/cards', timeout=60)
# Fail loudly on an HTTP error instead of caching an error page as if it were card data.
r.raise_for_status()

# Write explicitly as UTF-8 (pd.read_json decodes as UTF-8 by default, whereas a bare
# open() uses the platform's locale encoding); `with` closes the file even on error.
with open('./data/cards', 'w', encoding='utf-8') as f:
    f.write(r.text)

In [4]:
# Load the cached catalogue and flatten its 'data' entries into one row per card
raw_cards = pd.read_json(r'./data/cards')
cards = pd.json_normalize(raw_cards['data'].values)

# Prefix the codes so they match the deck column names ("cards.#####")
cards['code'] = 'cards.' + cards['code']

# 'cards.00012' does not appear in any deck, so leave it out of the card table
cards = cards.loc[cards['code'].ne('cards.00012')]

In [5]:
# Split the card table by side: one frame for corp cards, one for runner cards
card_side = cards['side_code']
corp_cards = cards.loc[card_side.eq('corp')]
runner_cards = cards.loc[card_side.eq('runner')]

In [6]:
# Download the published decklists from NetrunnerDB and cache them in ./data/decks.
# The API only serves decklists by publication date, so this makes one request per
# day in the range below. This can take a while (like 30 min).

# Inclusive range of publication dates to download
date1 = '2014-08-27'
lastday = '2020-06-08'
dates = pd.date_range(date1, lastday).strftime('%Y-%m-%d')

# Map each date to a one-element list holding that day's parsed response, i.e. the
# same {"YYYY-MM-DD": [response], ...} layout the file has always had.
decks_by_date = {}
# A Session reuses the HTTP connection across the per-day requests.
with requests.Session() as session:
    for day in dates:
        r = session.get('https://netrunnerdb.com/api/2.0/public/decklists/by_date/'+day, timeout=60)
        # Stop on an HTTP error rather than embedding an error page in the cache;
        # r.json() likewise raises if the body is not valid JSON.
        r.raise_for_status()
        decks_by_date[day] = [r.json()]

# Serialise once with json.dump so the output is always well-formed JSON (no
# hand-written braces or trailing-comma special case). The file is only opened after
# every download succeeded, and `with` guarantees it is closed.
with open('./data/decks', 'w', encoding='utf-8') as f:
    json.dump(decks_by_date, f)

In [7]:
# Build the deck table from the cached per-date responses
df = pd.read_json(r'./data/decks')

# Normalize each row of the raw frame and keep only its 'data' column
data_columns = [pd.json_normalize(row)['data'] for row in df.values]
deck_list = pd.concat(data_columns)

# Discard the empty 'data' entries
deck_list = deck_list[deck_list.map(len) > 0]

# Flatten every remaining entry and stack the results into a single frame
# (columns sorted, fresh 0..n-1 index)
deck_frames = [pd.json_normalize(entry) for entry in deck_list.values]
decks = pd.concat(deck_frames, sort=True, ignore_index=True)

In [8]:
# Discard the columns that are not needed further on
unused_columns = ['date_update','mwl_code','description','name','tournament_badge','user_id','user_name']
decks = decks.drop(columns=unused_columns)

# A card missing from a deck shows up as NaN; record it as 0 instead
decks = decks.fillna(0)

# Store every float column as np.int8 to reduce file size
float_columns = decks.select_dtypes(include=['float']).columns
decks[float_columns] = decks[float_columns].astype(np.int8)

In [9]:
# Make the list of identity cards
identity_cards = cards[cards['type_code'] == 'identity']

# The card catalogue is downloaded live while the decks cover a fixed date range, so
# the catalogue can contain identities that no downloaded deck uses. Those have no
# column in `decks`; indexing or dropping them would raise KeyError, so only work
# with the identity codes that are actually present.
identity_codes = [code for code in identity_cards['code'] if code in decks.columns]

# code -> faction lookup built once (first occurrence of each code wins)
faction_by_code = cards.drop_duplicates('code').set_index('code')['faction_code']

# Make sure both feature columns exist even if no identity matches any deck.
# Existing columns are left alone so re-running the cell does not wipe them.
for feature in ('identity_card', 'faction'):
    if feature not in decks.columns:
        decks[feature] = None

# Add two features that keep track of each deck's identity card and its faction.
# A deck uses an identity when that identity's column equals 1.
for ident in identity_codes:
    uses_ident = decks[ident] == 1
    decks.loc[uses_ident, 'identity_card'] = ident
    decks.loc[uses_ident, 'faction'] = faction_by_code[ident]

# Delete the old per-identity columns now that they are folded into 'identity_card'
decks = decks.drop(columns=identity_codes)

# Remove decks for which no faction could be determined
decks = decks[decks['faction'].notna()]

In [10]:
# Give the remaining non-card columns their final dtypes:
# smallest integer type for the id, pandas string dtype for identity and faction,
# and datetime for the creation date
decks = decks.assign(
    id=pd.to_numeric(decks['id'], downcast='integer'),
    identity_card=decks['identity_card'].astype('string'),
    faction=decks['faction'].astype('string'),
    date_creation=pd.to_datetime(decks['date_creation']),
)

In [11]:
# Remove entries that are not really decks, starting with those whose identity
# belongs to one of the neutral factions
decks = decks[~decks['faction'].isin(['neutral-runner', 'neutral-corp'])]

# Now remove the draft format decks. Card codes are "cards." followed by five digits;
# the last entry was previously written with six digits ('cards.000013') and so
# could never match any identity.
draft_cards = ['cards.00005','cards.00006','cards.00007','cards.00008','cards.00009','cards.00010','cards.00011','cards.00013']
decks = decks[~decks['identity_card'].isin(draft_cards)]

In [12]:
# Corp and runner decks are really disjoint data sets, so split them by faction
corps = ['weyland-consortium','nbn','jinteki','haas-bioroid']
runners = ['shaper','criminal','anarch','apex','sunny-lebeau','adam']
deck_faction = decks['faction']
corp_decks = decks.loc[deck_faction.isin(corps)]
runner_decks = decks.loc[deck_faction.isin(runners)]

#TODO remove all the extra columns and rows from the corp and runner deck
#since they only use half the card pool.

In [13]:
# Keep only the runner and corp cards that have a column in the deck table
deck_columns = decks.columns
mod_runner_cards = runner_cards[runner_cards['code'].isin(deck_columns)]
mod_corp_cards = corp_cards[corp_cards['code'].isin(deck_columns)]

# Corp decks lose the runner card columns, and runner decks lose the corp card columns
corp_decks = corp_decks.drop(columns=mod_runner_cards['code'].tolist())
runner_decks = runner_decks.drop(columns=mod_corp_cards['code'].tolist())

In [14]:
# Pickle the processed frames.
# The combined frames are not saved; their calls are disabled:
#   cards.to_pickle('./data/cards_processed.pkl')
#   decks.to_pickle('./data/decks_processed.pkl')
pickle_targets = {
    './data/corp_cards_processed.pkl': corp_cards,
    './data/runner_cards_processed.pkl': runner_cards,
    './data/corp_decks_processed.pkl': corp_decks,
    './data/runner_decks_processed.pkl': runner_decks,
}
for pickle_path, frame in pickle_targets.items():
    frame.to_pickle(pickle_path)