# Making Alternate Datasets
- After training/test sets have been extracted using train.py, it might be necessary to build alternate datasets if data other than hours played are to be used in ML experiments
- By reading in the datasets and using the IDs, we can get corresponding data in the Mongo database

In [47]:
from util.mongodb import connect_to_db
from src.features import normalize_id
from collections import Counter
from json import loads

In [26]:
# Connect to Mongo DB
port = 27017
reviewdb = connect_to_db(port)

In [7]:
! ls -lh ../working/*jsonlines

-rw-rw-r-- 1 mulhollm mulhollm 187M Sep 14 03:41 ../working/Arma_3.jsonlines
-rw-r--r-- 1 mulhollm mulhollm 116M Sep 14 03:38 ../working/Arma_3.test.jsonlines
-rw-rw-r-- 1 mulhollm mulhollm 161M Sep 14 03:41 ../working/Counter_Strike_Global_Offensive.jsonlines
-rw-r--r-- 1 mulhollm mulhollm 104M Sep 14 03:38 ../working/Counter_Strike_Global_Offensive.test.jsonlines
-rw-r--r-- 1 mulhollm mulhollm 2.0G Sep 14 03:44 ../working/Counter_Strike.jsonlines
-rw-r--r-- 1 mulhollm mulhollm 148M Sep 14 03:39 ../working/Counter_Strike.test.jsonlines
-rw-rw-r-- 1 mulhollm mulhollm 161M Sep 14 03:44 ../working/Dota_2.jsonlines
-rw-r--r-- 1 mulhollm mulhollm 170M Sep 14 03:39 ../working/Dota_2.test.jsonlines
-rw-rw-r-- 1 mulhollm mulhollm 169M Sep 14 03:45 ../working/Garrys_Mod.jsonlines
-rw-r--r-- 1 mulhollm mulhollm 181M Sep 14 03:39 ../working/Garrys_Mod.test.jsonlines
-rw-r--r-- 1 mulhollm mulhollm 172M Sep 14 03:45 ../working/Grand_Theft_Auto_V.jsonlines
-rw-r--r-- 1 mulhollm mulhollm 

In [29]:
# Let's work with the data in a relatively small dataset, the test set for
# Team Fortress 2
data = [loads(line) for line in open('../working/Team_Fortress_2.test.jsonlines')]

In [30]:
data[0].keys()

dict_keys(['id', 'y', 'x'])

In [43]:
len(data)

1550

In [48]:
normalized_ids_counter = Counter([_data['id'] for _data in data])

In [51]:
duplicates = [_id for _id in normalized_ids_counter if normalized_ids_counter[_id] > 1]

In [55]:
duplicate_1_id = duplicates[0]
duplicate_1_id

'5889414799363945755'

In [57]:
duplicate_1 = [_data for _data in data if _data['id'] == duplicate_1_id]

In [59]:
duplicate_1[0] == duplicate_1[1]

True

In [60]:
normalized_ids_data_dict = {}
for _data in data:
    _id = _data['id']
    # Don't add an entry if this sample's ID is already accounted for
    if _id in normalized_ids_data_dict.keys():
        continue
    normalized_ids_data_dict[_id] = {'data': _data,
                                     'db_data': None}

In [62]:
list(normalized_ids_data_dict.keys())[:5]

['4791911353521639065',
 '5927141028353036122',
 '5348493173370030504',
 '7500891916612766495',
 '4438889256055680651']

In [63]:
normalized_ids_data_dict['4791911353521639065']

{'data': {'id': '4791911353521639065',
  'x': {'chips ,': 1,
   'invisible': 1,
   'up': 1,
   'ts sa': 1,
   'uses': 1,
   'arn': 1,
   'snip': 1,
   'GO ': 1,
   'destroy these': 1,
   'at st': 1,
   'mos': 1,
   'do:ROOT:do': 1,
   'the bubbles': 1,
   'shot in': 1,
   'ORE t': 1,
   'plain': 1,
   'like trading': 1,
   '! If': 1,
   'well ': 1,
   ' is A': 1,
   ' get ': 1,
   'in b': 1,
   's int': 1,
   's g': 1,
   'uced': 1,
   'brothers cartoons': 1,
   'and valuable': 1,
   'ed, c': 1,
   'n Ba': 1,
   'd FP': 1,
   'ust ': 1,
   "people 's": 1,
   'T imp': 1,
   'an ge': 1,
   "'s cartoonesque": 1,
   ' e': 1,
   'to backstab': 1,
   'intutwine': 1,
   'o by ': 1,
   'hat s': 1,
   '~10/10': 1,
   'to stop': 1,
   '10/': 1,
   'is enough': 1,
   'when this': 1,
   'ay to': 1,
   'her': 1,
   'hin': 1,
   '.... but': 1,
   'shooters ,': 1,
   'miniguns': 1,
   'break the': 1,
   'tr': 1,
   'if:VMOD:have': 1,
   'eamwo': 1,
   'upgrade his': 1,
   'played': 1,
   'rarely': 1,

In [64]:
len(normalized_ids_data_dict)

1479

In [72]:
# Let's create a cursor over all Team Fortress 2 documents
game = 'Team_Fortress_2'
partition = 'test'
cursor = reviewdb.find({'game': game})

In [73]:
# How many samples are there for Team Fortress 2 (all partitions combined)?
cursor.count()

3757

In [74]:
# Example document
sample = next(cursor)
sample

{'_id': ObjectId('55c58c39c134cf069a422b0e'),
 'achievement_progress': {'num_achievements_attained': 0,
  'num_achievements_percentage': 0.0,
  'num_achievements_possible': 28},
 'appid': '440',
 'bin_factor': 1.5,
 'bin_ranges': [[0.0, 845.5],
  [845.6, 2113.8],
  [2113.9, 4016.3],
  [4016.4, 6870.0]],
 'binarized': True,
 'date_posted': 'Jun 29, 2014, 10:32PM',
 'date_updated': None,
 'features': '{" sp": 1, "another": 1, "at-b": 1, "d spe": 1, "d a": 1, "hour:OBJ:100+": 1, "ting": 1, " anot": 1, "nd": 1, "d da": 1, "mu": 1, "in": 1, "urs ": 1, " simu": 1, "ed ": 1, "er 1": 1, " hou": 1, "ato": 1, "ulato": 1, "simul": 1, "g s": 1, "hat-b": 1, "ther ": 1, "other": 1, " pl": 1, "100+": 1, "s pla": 1, "nd a": 1, "ng ": 1, "cluster11242": 1, "or ": 1, "ati": 1, " wo": 1, "ur": 1, "ould": 1, "ay": 1, "0+": 1, "anoth": 1, "zeroes_repvecs": 3, "d ano": 1, "ano": 1, "er ": 1, "rs pl": 1, "+ ": 1, "nd an": 1, "or": 1, "da": 1, "ot": 1, "r wo": 1, "spe": 1, "cluster28650": 1, "spend:VC:hour": 

In [75]:
sample['_id']

ObjectId('55c58c39c134cf069a422b0e')

In [76]:
str(abs(hash(str(sample['_id']))))

'1909195635508261055'

In [77]:
str(abs(hash(str(sample['_id']))))

'1909195635508261055'

In [69]:
# Let's iterate through the documents
not_found = []
for sample in cursor:
    _id = sample.get('_id')
    normalized_id = normalize_id(_id)
    if normalized_ids_data_dict.get(normalize_id):
        _data = dict(normalized_ids_data_dict[normalized_id])
        _data['db_data'] = dict(sample)
        normalized_ids_data_dict[normalized_id] = _data
    else:
        not_found.append((_id,
                          normalized_id))

In [71]:
not_found[:100]

[(ObjectId('55c58c39c134cf069a422b0e'), '1909195635508261055'),
 (ObjectId('55c58c39c134cf069a4224e4'), '4633408371632359236'),
 (ObjectId('55c58c38c134cf069a42223c'), '7630855974652181728'),
 (ObjectId('55c58c38c134cf069a421ef6'), '1664353644504832285'),
 (ObjectId('55c58c38c134cf069a421ef8'), '5720264909250722215'),
 (ObjectId('55c58c38c134cf069a421f96'), '2045903905335424682'),
 (ObjectId('55c58c39c134cf069a422b10'), '210350313556682255'),
 (ObjectId('55c58c39c134cf069a422b12'), '1489121607281958827'),
 (ObjectId('55c58c38c134cf069a422261'), '5819740434479488346'),
 (ObjectId('55c58c38c134cf069a422264'), '8708389093215260765'),
 (ObjectId('55c58c39c134cf069a422adf'), '7303174278720180454'),
 (ObjectId('55c58c39c134cf069a4224e3'), '5789688444339064255'),
 (ObjectId('55c58c39c134cf069a422648'), '6710538697226985439'),
 (ObjectId('55c58c39c134cf069a4226b1'), '2503376310676942627'),
 (ObjectId('55c58c39c134cf069a4226b2'), '6876445705561660817'),
 (ObjectId('55c58c39c134cf069a4226b3'), '