In [1]:
# configure auto-reload and add libs to path
%load_ext autoreload
%autoreload 2

import os, sys
# resolve the directory three levels up and make it importable (added at most once)
fastai_lib_path = os.path.abspath('../../..')
already_on_path = fastai_lib_path in sys.path
if not already_on_path:
    sys.path.append(fastai_lib_path)

In [2]:
# core imports
import json, requests

from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

from sklearn import metrics

In [3]:
# pandas and plotting config
# raise both display limits so wide/long frames are shown without truncation
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 1000)

%matplotlib inline

Configure the data path and the Read API key for The Blue Alliance (TBA); the key is required to use v3 of their API.

In [4]:
# directory holding the raw scouting export and receiving the generated CSV files
PATH = 'data'

# competition season to pull data for
comp_yr = 2017

# The TBA Read API key is a credential, so it is read from the environment rather than
# being committed with the notebook. Set it before starting Jupyter, e.g.
#   export TBA_AUTH_KEY=<your key>
# NOTE(review): the key that used to be hard-coded here remains in version history and
# should be revoked and replaced.
tba_key = os.environ.get('TBA_AUTH_KEY', '').strip()
if not tba_key:
    # fail here with a clear message instead of letting every later request be rejected
    raise RuntimeError('TBA_AUTH_KEY is not set; export your TBA Read API key before running this notebook')

# header sent with every TBA request, and the API root the request URLs are built on
tba_header = { 'X-TBA-Auth-Key': tba_key }
tba_url = 'https://thebluealliance.com/api/v3'

# output locations for the three raw datasets this notebook writes
team_match_scores_path = f'{PATH}/{comp_yr}-team-match-scores-raw.csv'
team_event_stats_path = f'{PATH}/{comp_yr}-team-event-stats-raw.csv'
scouting_reports_path = f'{PATH}/{comp_yr}-scouting-reports-raw.csv'

The goal is to prepare a dataset for predicting various individual team scores, so we will pull all scouting reports for the given `comp_yr`, and then for each scouted team, we will pull all their match data for the same year.

## 1. Get all SCOUTING REPORTS for the competition year

In [5]:
# location of the raw scouting export, directly under PATH
scouting_file_name = '2017-rr-scouting-reports.json'
scouting_file = '/'.join((PATH, scouting_file_name))

In [6]:
# parse the scouting export into nested dicts
with open(scouting_file) as scouting_fh:
    scouting_data = json.load(scouting_fh)

In [7]:
# print(json.dumps(scouting_data, indent=2, sort_keys=True))

In [8]:
# flatten every scouting report filed at a comp_yr event into a single list
scouting_reports = []
year_prefix = str(comp_yr)

for event_key, event_data in scouting_data.items():
    # the first four characters of an event key are compared with the season year;
    # events from any other year are left out
    if event_key[:4] == year_prefix:
        for rated_by_team_key, team_scouting_data in event_data.items():
            # the inner keys are dropped; only the report values are kept
            scouting_reports.extend(team_scouting_data.values())

In [9]:
# tabulate the flat report list (one row per report) and preview the first rows
scouting_reports_df = pd.DataFrame(data=scouting_reports)
scouting_reports_df.head(n=5)

Unnamed: 0,asdf,dddd,event_id,rating_overall_gear_efficiency,rating_overall_gear_efficiency_auto,rating_overall_gear_placement_auto,rating_overall_pilot_competency,rating_overall_robot_stability,rating_scoring_airship_climb,rating_scoring_base_line_made_auto,rating_scoring_gears_made,rating_scoring_gears_made_auto,rating_scoring_high_goals_made,rating_scoring_high_goals_made_auto,rating_scoring_low_goals_made,rating_scoring_low_goals_made_auto,scored_at,scored_by,team_id
0,,,2017cada,,,,,,,,0,0,0,0,0,0,1490371000000.0,"{'email': 'tylerselinka1@gmail.com', 'name': '...",frc3013
1,,,2017cada,2.0,,0.0,2.0,2.0,0.0,0.0,0,0,0,0,0,0,1490371000000.0,"{'email': 'connorrocker@gmail.com', 'name': 'C...",frc2551
2,,,2017cada,2.0,,0.0,2.0,2.0,0.0,0.0,1,0,0,0,0,0,1490371000000.0,"{'email': 'brian.titcomb27@gmail.com', 'name':...",frc692
3,,,2017cada,4.0,,0.0,,4.0,1.0,,3,0,0,0,0,0,1490371000000.0,"{'email': 'seanlinden7934@gmail.com', 'name': ...",frc1323
4,,,2017cada,2.0,,0.0,4.0,3.0,0.0,0.0,1,0,0,0,0,0,1490371000000.0,"{'email': 'sca2019@gmail.com', 'name': 'Seth A...",frc4698


In [10]:
# save raw scouting reports to csv, leaving out the positional row index
scouting_reports_df.to_csv(path_or_buf=scouting_reports_path, index=False)

## 2. For each team, get their match scores/stats for every event played in comp. year

In [11]:
# distinct team ids appearing in the scouting reports, converted to strings
team_keys = scouting_reports_df['team_id'].unique().astype('str')
# team_keys

In [12]:
# build a lookup (event key -> event record) of the events that are valid for ml
# event_type codes that are ignored: offseason, preseason, unlabeled
excluded_event_types = (99, 100, -1)

# the timeout keeps a stalled connection from hanging the notebook indefinitely
r = requests.get(f'{tba_url}/events/{comp_yr}/simple', headers=tba_header, timeout=30)

# fail loudly: without the event list every later cell would quietly filter out all
# matches and produce empty datasets
if r.status_code != 200:
    raise RuntimeError(f'could not load the {comp_yr} event list from TBA (HTTP {r.status_code})')

events = r.json()
valid_events = { e['key']: e for e in events if e['event_type'] not in excluded_event_types }

In [13]:
# valid_events

### 2.1. Match Scores

In [14]:
# for each team, get all their match scores for the year
def _match_sort_time(team_match):
    """Return the timestamp used to order a team-match record.

    'actual_time' is preferred and the scheduled 'time' is the fallback. A record
    with neither sorts last: returning None here would make sorted() raise
    TypeError when it compares None with a number.
    """
    for field in ('actual_time', 'time'):
        if team_match[field] is not None:
            return team_match[field]
    return float('inf')


team_matches = []

# keep track of playoff stats for each team as well; include in overall event/team stats below
team_playoff_stats = {}

for team_key in team_keys:
    # only ids that look like TBA team keys can be queried
    if not team_key.startswith('frc'): continue

    r = requests.get(f'{tba_url}/team/{team_key}/matches/{comp_yr}', headers=tba_header)
    if r.status_code != 200:
        # best-effort: keep going, but make the gap in the dataset visible
        print(f'skipping {team_key}: TBA returned HTTP {r.status_code}')
        continue

    matches = r.json()
    for match in matches:
        # check for valid event
        event = valid_events.get(match['event_key'])
        if event is None: continue

        # if there are no scores for a match, it isn't helpful for ml
        if match['score_breakdown'] is None: continue

        # a team that is not listed on blue is taken to be on red
        alliance = 'blue' if team_key in match['alliances']['blue']['team_keys'] else 'red'
        opp_alliance = 'red' if alliance == 'blue' else 'blue'
        alliance_team_keys = [ k for k in match['alliances'][alliance]['team_keys'] if k != team_key ]
        alliance_score = match['alliances'][alliance]['score']

        # add team match data
        team_match = {
            'year': comp_yr,
            'event_key': match['event_key'],
            'event_code': event['event_code'],
            'event_name': event['name'],
            'event_type': event['event_type'],
            'event_start_date': event['start_date'],
            'event_end_date': event['end_date'],
            'event_district': event['district'],
            'event_city': event['city'],
            'event_state_prov': event['state_prov'],
            'event_country': event['country'],
            'match_key': match['key'],
            'team_key': team_key,
            'match_number': match['match_number'],
            'set_number': match['set_number'],
            'comp_level': match['comp_level'],
            'time': match['time'],
            'actual_time': match['actual_time'],
            'predicted_time': match['predicted_time'],
            'post_result_time': match['post_result_time'],
            'score': alliance_score,
            'is_winner': 1 if match['winning_alliance'] == alliance else 0,
            'alliance': alliance,
            'alliance_team_keys': alliance_team_keys,
            'winning_margin': alliance_score - match['alliances'][opp_alliance]['score']
        }

        # include score breakdown (the alliance's own side only)
        team_match.update(match['score_breakdown'][alliance])

        team_matches.append(team_match)

        # add playoff stats data: one accumulator per (team, event) pair
        team_event_key = team_match['team_key'] + '_' + team_match['event_key']
        playoff_stats = team_playoff_stats.setdefault(team_event_key, {
            'team_key': team_match['team_key'],
            'event_key': team_match['event_key'],
            'is_playoff_team': 0,
            'is_finals_team': 0,
            'post_qual_wins': 0
        })

        # comp_level = [ qm, ef, qf, sf, f ]; anything other than 'qm' counts as a playoff match
        if team_match['comp_level'] != 'qm':
            playoff_stats['is_playoff_team'] = 1

            if team_match['is_winner'] == 1:
                playoff_stats['post_qual_wins'] += 1

            if team_match['comp_level'] == 'f':
                playoff_stats['is_finals_team'] = 1

# sort team matches by start time (earliest to most recent)
team_matches = sorted(team_matches, key=_match_sort_time)

In [15]:
# take a look at the first 10 team matches 
# print(json.dumps(team_matches[:10], indent=2, sort_keys=True))

In [16]:
# load the team-match records into a dataframe and preview the first rows
matches_df = pd.DataFrame(data=team_matches)
matches_df.head(n=5)

Unnamed: 0,actual_time,adjustPoints,alliance,alliance_team_keys,autoFuelHigh,autoFuelLow,autoFuelPoints,autoMobilityPoints,autoPoints,autoRotorPoints,comp_level,event_city,event_code,event_country,event_district,event_end_date,event_key,event_name,event_start_date,event_state_prov,event_type,foulCount,foulPoints,is_winner,kPaBonusPoints,kPaRankingPointAchieved,match_key,match_number,post_result_time,predicted_time,robot1Auto,robot2Auto,robot3Auto,rotor1Auto,rotor1Engaged,rotor2Auto,rotor2Engaged,rotor3Engaged,rotor4Engaged,rotorBonusPoints,rotorRankingPointAchieved,score,set_number,tba_rpEarned,team_key,techFoulCount,teleopFuelHigh,teleopFuelLow,teleopFuelPoints,teleopPoints,teleopRotorPoints,teleopTakeoffPoints,time,totalPoints,touchpadFar,touchpadMiddle,touchpadNear,winning_margin,year
0,1488551305,0,blue,"[frc3651, frc6693]",0,0,0,5,5,0,qm,Myrtle Beach,scmb,USA,,2017-03-04,2017scmb,Palmetto Regional,2017-03-01,SC,0,0,30,1,0,False,2017scmb_qm4,4,1488552000.0,1488552000.0,Mobility,,,False,True,False,False,False,False,0,False,75,1,2.0,frc1287,0,0,0,0,40,40,0,1488551040,75,,,,30,2017
1,1488551818,0,red,"[frc3653, frc2152]",0,0,0,15,15,0,qm,West Palm Beach,flwp,USA,,2017-03-04,2017flwp,South Florida Regional,2017-03-01,FL,0,0,0,1,0,False,2017flwp_qm1,1,1488552000.0,1488552000.0,Mobility,Mobility,Mobility,False,True,False,False,False,False,0,False,155,1,2.0,frc6038,0,2,0,0,140,40,100,1488551400,155,,ReadyForTakeoff,ReadyForTakeoff,44,2017
2,1488552377,0,blue,"[frc5196, frc6388]",0,0,0,0,0,0,qm,West Palm Beach,flwp,USA,,2017-03-04,2017flwp,South Florida Regional,2017-03-01,FL,0,0,0,0,0,False,2017flwp_qm2,2,1488553000.0,1488553000.0,,,,False,True,False,True,False,False,0,False,80,1,0.0,frc263,0,0,0,0,80,80,0,1488551820,80,,,,-45,2017
3,1488552377,0,red,"[frc694, frc2641]",0,0,0,5,5,0,qm,West Palm Beach,flwp,USA,,2017-03-04,2017flwp,South Florida Regional,2017-03-01,FL,0,0,0,1,0,False,2017flwp_qm2,2,1488553000.0,1488553000.0,Mobility,,,False,True,False,True,True,False,0,False,125,1,2.0,frc1523,0,0,0,0,120,120,0,1488551820,125,,,,45,2017
4,1488553596,0,blue,"[frc4823, frc6366]",0,0,0,10,10,0,qm,Myrtle Beach,scmb,USA,,2017-03-04,2017scmb,Palmetto Regional,2017-03-01,SC,0,0,25,1,0,False,2017scmb_qm8,8,1488554000.0,1488554000.0,,Mobility,Mobility,False,True,False,True,False,False,0,False,165,1,2.0,frc4451,0,0,0,0,130,80,50,1488552960,165,,,ReadyForTakeoff,10,2017


In [17]:
# write the raw team-match table to csv, leaving out the positional row index
matches_df.to_csv(path_or_buf=team_match_scores_path, index=False)

### 2.2. Event Stats

In [18]:
# distinct events that contributed at least one team-match row
event_keys = matches_df['event_key'].unique()
# event_keys

In [19]:
# for each team, get their oprs and ranking stats for each event they participated in
team_event_stats = []

# plain-str set of the scouted teams: constant-time membership test for the ranking filter
scouted_team_keys = set(map(str, team_keys))

# oprs, ccwms, dprs
team_event_oprs = {}
for event_key in event_keys:

    r = requests.get(f'{tba_url}/event/{event_key}/oprs', headers=tba_header)

    if r.status_code != 200: continue
    stats_data = r.json()

    # a null or empty payload means there are no stats to record for this event
    if not stats_data: continue

    for tk in team_keys:
        if tk in stats_data['oprs']:
            team_event_oprs[event_key + '_' + tk] = {
                'oprs': stats_data['oprs'][tk],
                'dprs': stats_data['dprs'][tk],
                'ccwms': stats_data['ccwms'][tk],
            }

# rankings data
for event_key in event_keys:

    r = requests.get(f'{tba_url}/event/{event_key}/rankings', headers=tba_header)

    if r.status_code != 200: continue
    rankings_data = r.json()

    # same guard as for the oprs payload: a null/empty response or an empty ranking
    # list has nothing to contribute and would otherwise raise on the lookups below
    if not rankings_data or not rankings_data.get('rankings'): continue

    # an array of dictionaries (keys = name, precision) describing each sort_orders slot
    sort_order_info = rankings_data.get('sort_order_info') or []

    for ranking in rankings_data['rankings']:
        if ranking['team_key'] not in scouted_team_keys: continue

        ranking_data = {
            'event_key': event_key,
            'team_key': ranking['team_key'],
            'matches_played': ranking['matches_played'],
            'qual_average': ranking['qual_average'],
            'rank': ranking['rank'],
            'losses': ranking['record']['losses'],
            'ties': ranking['record']['ties'],
            'wins': ranking['record']['wins']
        }

        # store each positional sort-order value under the name given by its descriptor
        for info, value in zip(sort_order_info, ranking['sort_orders'] or []):
            ranking_data[info['name']] = value

        # opr stats are absent when the oprs request failed or did not list this team;
        # leave those columns empty in that case instead of raising KeyError
        ranking_data.update(team_event_oprs.get(event_key + '_' + ranking['team_key'], {}))

        team_event_stats.append(ranking_data)

# additional stats
for team_stats in team_event_stats:
    team_stats['start_date'] = valid_events[team_stats['event_key']]['start_date']

    # playoff stats are keyed team-first ('<team_key>_<event_key>')
    k = team_stats['team_key'] + '_' + team_stats['event_key']
    if k in team_playoff_stats:
        team_stats['is_playoff_team'] = team_playoff_stats[k]['is_playoff_team']
        team_stats['is_finals_team'] = team_playoff_stats[k]['is_finals_team']
        team_stats['post_qual_wins'] = team_playoff_stats[k]['post_qual_wins']

In [23]:
# take a look at the first 10 stats
# print(json.dumps(team_event_stats[:10], indent=2, sort_keys=True))

In [21]:
# load the per-team, per-event stats into a dataframe and preview the first rows
team_event_stats_df = pd.DataFrame(data=team_event_stats)
team_event_stats_df.head(n=5)

Unnamed: 0,Auto,Match Points,Pressure,Ranking Score,Rotor,Touchpad,ccwms,dprs,event_key,is_finals_team,is_playoff_team,losses,matches_played,oprs,post_qual_wins,qual_average,rank,start_date,team_key,ties,wins
0,339.0,1536.0,6.0,1.5,800.0,550.0,34.35654,44.906465,2017scmb,1,1,2,8,79.263005,5,,6,2017-03-01,frc1287,0,6
1,165.0,1513.0,3.0,1.37,780.0,600.0,31.595175,64.709658,2017scmb,0,1,2,8,96.304833,1,,11,2017-03-01,frc4451,1,5
2,228.0,1453.0,8.0,1.25,760.0,550.0,23.209025,55.120927,2017scmb,1,1,3,8,78.329953,5,,17,2017-03-01,frc283,0,5
3,335.0,1974.0,24.0,1.6,1040.0,700.0,48.955095,48.207602,2017flwp,1,1,2,10,97.162697,5,,3,2017-03-01,frc1523,0,8
4,326.0,1793.0,23.0,1.2,1040.0,550.0,24.413285,52.241866,2017flwp,0,1,4,10,76.655151,2,,15,2017-03-01,frc263,0,6


In [22]:
# write the raw team/event stats table to csv, leaving out the positional row index
team_event_stats_df.to_csv(path_or_buf=team_event_stats_path, index=False)