# data.py — PUBG API data-collection utilities
# (web-scrape line-number gutter removed; actual source begins below)
import pandas as pd
from pandas.io.json import json_normalize
import requests
import gzip
import shutil
import os
import json
# NOTE(review): hard-coded bearer token committed to source — this is a leaked
# API credential; rotate the key and load it from an environment variable or
# untracked config file instead of keeping it in the repository.
apikey = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJqdGkiOiIyY2IxM2ZhMC0zYzQ1LTAxMzctODYwNC0wMWI0YTFkMWRiOWEiLCJpc3MiOiJnYW1lbG9ja2VyIiwiaWF0IjoxNTU0NzM5MTUxLCJwdWIiOiJibHVlaG9sZSIsInRpdGxlIjoicHViZyIsImFwcCI6Ii02MTA3ZjhlYS1kZjNlLTQzYTctYTU5Ni1hMzljYTY5NDQ2NzUifQ.Aqxif7hL74A6LxrnpO7LNZYMCACAf4HHYTatEs3UWYs"
# Get a list of random match IDs from the PUBG API
def get_matches():
    """Fetch a sample of recent match IDs from the PUBG 'samples' endpoint.

    Returns:
        pandas.Series: match-ID strings extracted from the samples payload.

    Raises:
        requests.HTTPError: if the API responds with an error status.
    """
    # Header for auth — the samples endpoint requires the API key.
    header = {"Authorization": "Bearer %s" % apikey,
              "Accept": "application/vnd.api+json"}
    # Get the API call data; timeout prevents a hang, and raise_for_status
    # fails fast instead of producing a confusing KeyError further down.
    apireq = requests.get("https://api.pubg.com/shards/steam/samples",
                          headers=header, timeout=30)
    apireq.raise_for_status()
    # Dump json data into pandas for better formatting
    samples_df = pd.DataFrame(apireq.json())
    # Extract just the match data. pd.json_normalize replaces the
    # deprecated pandas.io.json.json_normalize (removed in pandas >= 1.0).
    samples_norm = pd.json_normalize(samples_df.loc['relationships'])
    matches = samples_norm['matches.data'].apply(pd.Series).T[0].apply(pd.Series)
    return matches['id']
# Given a matchID, get the 'data' dict and 'included' array from the matches endpoint API request
# API response is structured: { data : {...},
#                               included: [...],
#                               links [...],
#                               meta {}}
# where 'data' has information about the IDs of the match objects (players, rosters, etc) and
# 'included' contains links to the actual objects about the match (look up by ID from data)
# 'links' and 'meta' can be ignored for now
def get_match_info(matchID):
    """Query the matches endpoint for one match.

    Params:
        matchID: match identifier string (as returned by get_matches).

    Returns:
        (match_data, match_included): two DataFrames built from the
        response's 'data' dict and 'included' array respectively.

    Raises:
        requests.HTTPError: if the API responds with an error status.
    """
    header = {"accept": "application/vnd.api+json"}
    apireq = requests.get("https://api.pubg.com/shards/steam/matches/%s" % matchID,
                          headers=header, timeout=30)
    apireq.raise_for_status()
    # Parse the response body once — the original called .json() twice,
    # decoding the (large) payload a second time for no benefit.
    payload = apireq.json()
    match_data = pd.DataFrame(payload['data'])
    match_included = pd.DataFrame(payload['included'])
    return match_data, match_included
# Extract the url of the telemetry json file from 'data' and 'included' returned from the matches endpoint query
def get_telemetry(match_data, match_included):
    """Return the telemetry-file URL(s) for a match.

    Params:
        match_data, match_included: DataFrames as returned by get_match_info.

    Returns:
        pandas.Series: the 'URL' attribute of the match's telemetry asset.
    """
    # Expand the asset references listed under data.relationships.assets
    asset_refs = pd.DataFrame(match_data.loc['assets', 'relationships'])['data'].apply(pd.Series)
    # The telemetry file is the entry whose type is "asset"
    tel_id = asset_refs.loc[asset_refs.type == "asset", 'id'].values[0]
    # Resolve that ID against the 'included' array to get the full object
    tel_obj = match_included[match_included.id == tel_id]
    # Pull the URL out of the object's attributes dict
    return tel_obj['attributes'].apply(pd.Series)['URL']
# Extract the statistics for the winning players from 'data' and 'included' returned from the matches endpoint query
def get_winner_stats(match_data, match_included):
    """Return per-player stats for the members of the winning roster.

    Params:
        match_data, match_included: DataFrames as returned by get_match_info.

    Returns:
        pandas.DataFrame: one row per winning participant, with the nested
        'stats' attributes expanded into columns.
    """
    # Get the roster references from the 'data' dict (IDs only)
    # and combine with the actual roster objects from the 'included' array
    rosters = pd.DataFrame(match_data.loc['rosters', 'relationships'])['data'].apply(pd.Series)
    rosters_included = match_included[match_included.type == 'roster']
    combined = rosters_included.merge(rosters, on='id')
    # Flatten the nested 'attributes'/'relationships' dicts into columns
    combined = pd.concat([combined.drop(['attributes', 'relationships'], axis=1),
                          combined['attributes'].apply(pd.Series),
                          combined['relationships'].apply(pd.Series)], axis=1)
    # Get the winners. NOTE(review): the API appears to encode 'won' as the
    # string "true" (not a bool) — confirm against a live response.
    winning_team = combined[combined.won == 'true']
    # pd.json_normalize replaces the deprecated pandas.io.json import
    winning_participants = pd.json_normalize(winning_team['participants'].values[0],
                                             record_path='data')
    # Now get the data (stats, etc) related to the winning players from the included array
    winning_participants_data = match_included[match_included.id.isin(winning_participants.id)]
    winner_data_all = pd.concat([winning_participants_data.drop('attributes', axis=1),
                                 winning_participants_data['attributes'].apply(pd.Series)['stats'].apply(pd.Series)],
                                axis=1)
    return winner_data_all
# Given location of gzips and location of where to extract to -> unzips gzip files
def extract_gzip(gzip_indir, gzip_outdir):
    """Decompress every .gz file in gzip_indir into gzip_outdir.

    Params:
        gzip_indir: directory containing .gz files (trailing separator
            no longer required — paths are built with os.path.join).
        gzip_outdir: existing directory to write decompressed files to.

    Side effects: writes files, prints progress; exits the process if
    either directory is missing (script-style behavior, kept as-is).
    """
    # Check if directories exist
    if not os.path.isdir(gzip_indir):
        print("Cannot find directory '" + gzip_indir + "'")
        exit()
    elif not os.path.isdir(gzip_outdir):
        print("Cannot find directory '" + gzip_outdir + "'")
        exit()
    # Get name of all files to unzip
    files = os.listdir(gzip_indir)
    for f in files:
        outfile = f.replace(".gz", "")
        inpath = os.path.join(gzip_indir, f)
        try:
            with gzip.open(inpath, 'rb') as g:
                print("Copying '" + f + "'")
                # Copy files to output directory
                with open(os.path.join(gzip_outdir, outfile), 'wb') as g_copy:
                    shutil.copyfileobj(g, g_copy)
        # Narrowed from a bare except, which also swallowed
        # KeyboardInterrupt/SystemExit and hid real bugs.
        except OSError:
            print("Unable to copy file '" + inpath + "'\n")
    print("Copying data - done")
#tel_file (json file path) -> pandas.DataFrame
#Returns a dataframe of logs on parachute landing
def telemetry_pland(tel_file):
    """Load a telemetry JSON file and return the parachute-landing events.

    Params:
        tel_file: path to a decompressed telemetry JSON file — a list of
            event objects, each carrying its event type in the '_T' key.

    Returns:
        pandas.DataFrame: one row per LogParachuteLanding event, with
        nested fields flattened into dotted column names.
    """
    with open(tel_file) as f:
        events = json.load(f)
    # Comprehension replaces the manual append loop; the unused 'tel_df'
    # local from the original is dropped.
    drop_events = [e for e in events if e['_T'] == 'LogParachuteLanding']
    # pd.json_normalize replaces the deprecated pandas.io.json import
    return pd.json_normalize(drop_events)
if __name__ == "__main__":
    # Smoke test: pull one sampled match and print its winners' stats.
    sampled = get_matches()
    match_data, match_included = get_match_info(sampled[0])
    winner_stats = get_winner_stats(match_data, match_included)
    print(winner_stats)