/
__init__.py
executable file
·439 lines (379 loc) · 18.2 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
import logging
from os import listdir
from os.path import (join,
exists,
dirname,
realpath)
from re import compile as recompile
from typing import (Any,
Dict,
List,
Union,
Tuple,
TypeVar,
Callable,
Sequence,
Optional)
from sklearn.cluster import MiniBatchKMeans
from sklearn.naive_bayes import (BernoulliNB,
MultinomialNB)
from sklearn.feature_extraction import (FeatureHasher,
DictVectorizer)
from sklearn.linear_model import (Perceptron,
SGDClassifier,
PassiveAggressiveRegressor,
PassiveAggressiveClassifier)
from data import APPID_DICT
# Absolute path of the project root (the parent of the directory that
# contains this file) and of the sub-directories derived from it
project_dir = dirname(dirname(realpath(__file__)))
data_dir, log_dir, reports_dir = (join(project_dir, sub_dir)
                                  for sub_dir in ('data', 'logs', 'reports'))

# Log-record layout: timestamp, logger name, level, and message
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Type aliases

# Plain numeric value and a generic type variable
Numeric = Union[int, float]
Generic = TypeVar('Generic')

# List of (float, float) pairs
BinRanges = List[Tuple[float, float]]

# List of grids, each mapping a parameter name to a list of values
ParamGrid = List[Dict[str, List[Any]]]

# Supported feature vectorizer classes
Vectorizer = Union[DictVectorizer, FeatureHasher]

# Supported scikit-learn learner classes
Learner = Union[
    Perceptron,
    MiniBatchKMeans,
    BernoulliNB,
    MultinomialNB,
    SGDClassifier,
    PassiveAggressiveRegressor,
    PassiveAggressiveClassifier,
]

# Callable taking two numeric sequences and returning a single number
ScoringFunction = Callable[[Sequence[Numeric], Sequence[Numeric]], Numeric]

# A scorer is a name, a scoring callable, or nothing at all
Scorer = Optional[Union[str, ScoringFunction]]
# Seed for random state
SEED = 123456789

# Define default parameter grids: each learner class is mapped to a list
# of grids, and each grid maps a parameter name to the list of values to
# try for that parameter
# NOTE(review): `n_iter` and the SGD loss name 'log' look like they
# target an older scikit-learn release (newer releases use `max_iter`
# and 'log_loss') -- confirm against the pinned scikit-learn version
# before changing anything here.
DEFAULT_PARAM_GRIDS = {
    MiniBatchKMeans: [
        {'n_clusters': [3, 5, 10],
         'init': ['k-means++', 'random'],
         'random_state': [SEED]},
    ],
    BernoulliNB: [
        {'alpha': [0.1, 0.25, 0.5],
         'fit_prior': [True, False]},
    ],
    MultinomialNB: [
        {'alpha': [0.1, 0.25, 0.5],
         'fit_prior': [True, False]},
    ],
    Perceptron: [
        {'penalty': [None, 'l2', 'l1', 'elasticnet'],
         'alpha': [0.0001, 0.001, 0.01, 0.1],
         'n_iter': [5, 10],
         'eta0': [0.5, 1, 2],
         'random_state': [SEED]},
    ],
    PassiveAggressiveRegressor: [
        {'C': [0.01, 0.1, 1.0, 10.0, 100.0],
         'n_iter': [5, 10],
         'random_state': [SEED],
         'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive']},
    ],
    PassiveAggressiveClassifier: [
        {'C': [0.01, 0.1, 1.0, 10.0, 100.0],
         'n_iter': [5, 10],
         'random_state': [SEED],
         'loss': ['hinge', 'squared_hinge']},
    ],
    SGDClassifier: [
        {'loss': ['hinge', 'huber', 'log', 'epsilon_insensitive'],
         'penalty': ['l1', 'l2', 'elasticnet'],
         'n_iter': [5, 10],
         'random_state': [SEED],
         'eta0': [0.0, 0.001, 0.01, 0.1],
         'alpha': [0.000001, 0.00001, 0.0001, 0.001],
         'average': [True]},
    ],
}
# Learners

# Learner abbreviations mapped to the names of the learner classes
LEARNER_ABBRS_DICT = {'mbkm': 'MiniBatchKMeans',
                      'bnb': 'BernoulliNB',
                      'mnb': 'MultinomialNB',
                      'perc': 'Perceptron',
                      'pagr': 'PassiveAggressiveRegressor',
                      'pagc': 'PassiveAggressiveClassifier',
                      'sgdc': 'SGDClassifier'}

# The set of recognized learner abbreviations
LEARNER_DICT_KEYS = frozenset(LEARNER_ABBRS_DICT.keys())

# Learner abbreviations mapped to the learner classes themselves. Each
# class name is looked up in this module's namespace rather than being
# passed through `eval`, and the dictionary is built in the (stable)
# order of `LEARNER_ABBRS_DICT` instead of frozenset iteration order.
LEARNER_DICT = {abbr: globals()[cls_name]
                for abbr, cls_name in LEARNER_ABBRS_DICT.items()}

# Human-readable listing of the form '"abbr" (ClassName), ...' for use
# in help and error messages
LEARNER_ABBRS_STRING = ', '.join(['"{0}" ({1})'.format(abbr, learner)
                                  for abbr, learner
                                  in LEARNER_ABBRS_DICT.items()])
# Objective functions

# Objective function abbreviations mapped to human-readable descriptions
OBJ_FUNC_ABBRS_DICT = {
    'pearson_r': 'Pearson product-moment correlation',
    'spearman': "Spearman's rank correlation coefficient",
    'kendall_tau': 'Kendall-Tau rank correlation coefficient',
    'precision_macro': 'precision (macro)',
    'precision_weighted': 'precision (weighted)',
    'recall': 'recall',
    'f1_macro': 'f1 (macro)',
    'f1_weighted': 'f1 (weighted)',
    'f1_score_least_frequent': 'f1 (least frequent class)',
    'accuracy': 'accuracy',
    'uwk': 'unweighted kappa',
    'uwk_off_by_one': 'unweighted kappa (off by one)',
    'qwk': 'quadratic weighted kappa',
    'qwk_off_by_one': 'quadratic weighted kappa (off by one)',
    'lwk': 'linear weighted kappa',
    'lwk_off_by_one': 'linear weighted kappa (off by one)',
}

# Listing of the form '"abbr" (description), ...'; the parenthesized
# description is left out when it is identical to the abbreviation
OBJ_FUNC_ABBRS_STRING = ', '.join(
    '"{0}"{1}'.format(abbr, '' if abbr == func else ' ({0})'.format(func))
    for abbr, func in OBJ_FUNC_ABBRS_DICT.items()
)
# Feature names

# All recognized labels/non-NLP features
LABELS = frozenset({'num_guides', 'num_games_owned', 'num_friends',
                    'num_voted_helpfulness', 'num_groups',
                    'num_workshop_items', 'num_reviews', 'num_found_funny',
                    'friend_player_level', 'num_badges', 'num_found_helpful',
                    'num_screenshots', 'num_found_unhelpful',
                    'found_helpful_percentage', 'num_comments',
                    'total_game_hours', 'total_game_hours_bin',
                    'total_game_hours_last_two_weeks',
                    'num_achievements_percentage',
                    'num_achievements_attained',
                    'num_achievements_possible'})

# Comma-separated listing of the labels for help/error messages. The
# labels are sorted because frozenset iteration order varies between
# interpreter runs (string hashing is randomized), which would otherwise
# make the text of those messages unstable.
LABELS_STRING = ', '.join(sorted(LABELS))

# Families of related labels (each is a subset of `LABELS`)
TIME_LABELS = frozenset({'total_game_hours', 'total_game_hours_bin',
                         'total_game_hours_last_two_weeks'})
FRIENDS_LABELS = frozenset({'num_friends', 'friend_player_level'})
HELPFUL_LABELS = frozenset({'num_voted_helpfulness', 'num_found_helpful',
                            'found_helpful_percentage',
                            'num_found_unhelpful'})
ACHIEVEMENTS_LABELS = frozenset({'num_achievements_percentage',
                                 'num_achievements_attained',
                                 'num_achievements_possible'})

# Labels whose names mark them as percentages
LABELS_WITH_PCT_VALUES = frozenset({'num_achievements_percentage',
                                    'found_helpful_percentage'})
# Valid games: every entry of `APPID_DICT` other than the "sample" one
VALID_GAMES = frozenset(APPID_DICT) - {'sample'}
# Regular expressions

# String that is exactly a (possibly negative) number with a single
# digit after the decimal point
FLOAT_ONE_DEC = recompile(r'^\-?\d+\.\d$')
test_float_decimal_places = FLOAT_ONE_DEC.search


# Data-scraping regular expressions

# Runs of whitespace
SPACE = recompile(r'[\s]+')
space_sub = SPACE.sub

# Literal "<br>" tags
BREAKS_REGEX = recompile(r'\<br\>')
breaks_sub = BREAKS_REGEX.sub

# Commas
COMMA = recompile(r',')
comma_sub = COMMA.sub

# The word "helpful" or "funny" (captured in group 1)
HELPFUL_OR_FUNNY = recompile('(helpful|funny)')
helpful_or_funny_search = HELPFUL_OR_FUNNY.search

# String ending in a comma, a space, and four digits
DATE_END_WITH_YEAR_STRING = recompile(r', \d{4}$')
date_end_with_year_string_search = DATE_END_WITH_YEAR_STRING.search

# Start of a "commentthread" span tag up to the next "<"
COMMENT_RE_1 = recompile(r'<span id="commentthread[^<]+')
comment_re_1_search = COMMENT_RE_1.search

# Trailing run of digits (possibly empty) after a ">", captured in
# group 1
COMMENT_RE_2 = recompile(r'>(\d*)$')
comment_re_2_search = COMMENT_RE_2.search

# Underscores
UNDERSCORE = recompile(r'_')
underscore_sub = UNDERSCORE.sub

# Single or double quote characters
QUOTES = recompile(r'\'|"')
quotes_sub = QUOTES.sub

# Backslashes
BACKSLASH = recompile(r'\\')
backslash_sub = BACKSLASH.sub
def get_game_files(games_str: str, data_dir_path: str = data_dir) -> List[str]:
    """
    Get list of game files (file-names only).

    When specific games are named, each file is returned once, in the
    order in which it first appears in `games_str` (naming the same game
    more than once, with or without the suffix, does not produce
    duplicate entries). When "all" is used, the files are returned in
    sorted order so that the result does not depend on the arbitrary
    order of the directory listing.

    :param games_str: comma-separated list of game files (that exist in
                      the data directory) with or without a .jsonlines
                      suffix (or "all" for all game files) (Note: if
                      "sample"/"sample.jsonlines" is included it will be
                      filtered out)
    :type games_str: str
    :param data_dir_path: path to data directory (defaults to
                          `src.data_dir`)
    :type data_dir_path: str

    :returns: list of games
    :rtype: list

    :raises ValueError: no games were included in the list of games (or
                        `games_str` only includes
                        "sample"/"sample.jsonlines") or there are no
                        .jsonlines files in the data directory passed in
                        via `data_dir_path`
    :raises FileNotFoundError: if file(s) corresponding to games in the
                               input cannot be found
    """

    sample_file_inputs = ('sample', 'sample.jsonlines')

    # An empty (or missing) games string is treated as an empty list of
    # games so that it is rejected by the same check as a sample-only
    # list (`all` of an empty list is true)
    requested_games = games_str.split(',') if games_str else []
    if all(game in sample_file_inputs for game in requested_games):
        raise ValueError('Games string is empty or it includes only the sample'
                         ' file (not intended for actual use): {}.'
                         .format(', '.join(requested_games)))

    if games_str == "all":
        # Collect every non-sample .jsonlines file in the data directory
        game_files = sorted(f for f in listdir(data_dir_path)
                            if f.endswith('.jsonlines')
                            and f not in sample_file_inputs)
        if not game_files:
            raise ValueError('No non-sample file .jsonlines files found in '
                             '"data_dir_path".')
        return game_files

    game_files = []
    for game in requested_games:

        # The sample file is silently left out
        if game in sample_file_inputs:
            continue

        # Normalize to a file-name and skip games that were already seen
        file_name = (game if game.endswith('.jsonlines')
                     else '{0}.jsonlines'.format(game))
        if file_name in game_files:
            continue

        f_path = join(data_dir_path, file_name)
        if not exists(f_path):
            raise FileNotFoundError('{0} does not exist (input string: '
                                    '{1}).'.format(f_path, games_str))
        game_files.append(file_name)

    # At least one non-sample game was requested (guaranteed by the
    # first check) and every requested game was either appended or
    # caused an exception, so the list cannot be empty here
    return game_files
def find_default_param_grid(learner: str,
                            param_grids_dict: Dict[type, ParamGrid]
                            = DEFAULT_PARAM_GRIDS) -> ParamGrid:
    """
    Finds the default parameter grid for the specified learner.

    The abbreviation is resolved to a learner class via `LEARNER_DICT`
    and the grids of the first key in `param_grids_dict` that the class
    is a subclass of (or is identical to) are returned.

    :param learner: abbreviated string representation of a learner
    :type learner: str
    :param param_grids_dict: dictionary of learner classes mapped to
                             parameter grids
    :type param_grids_dict: dict

    :returns: parameter grid
    :rtype: list

    :raises ValueError: if the learner string or the parameter grids
                        dictionary is empty, or if an unrecognized
                        learner abbreviation is used (including one
                        that has no entry in `param_grids_dict`)
    """

    if not learner:
        raise ValueError('Learner string is empty.')
    if not param_grids_dict:
        raise ValueError('Default parameter grids dictionary is empty. It '
                         'should consist of a dictionary mapping learner '
                         'classes to lists of parameter grids, each of which '
                         'is a list of dictionaries mapping parameters to '
                         'lists of parameter values.')

    # Use `.get` so that an unknown abbreviation falls through to the
    # `ValueError` below instead of escaping as a `KeyError`
    learner_cls = LEARNER_DICT.get(learner)
    if learner_cls is not None:
        for key_cls, grids in param_grids_dict.items():
            if issubclass(learner_cls, key_cls):
                return grids

    raise ValueError('Unrecognized learner abbreviation: {0}'.format(learner))
def parse_learners_string(learners_string: str) -> set:
    """
    Turn the learners command-line argument into a set of learner
    abbreviations.

    :param learners_string: comma-separated list of learner
                            abbreviations (or "all" for all possible
                            learners)
    :type learners_string: str

    :returns: set of learner abbreviations
    :rtype: set

    :raises ValueError: if the input is empty or unrecognized
                        learner(s) are included in the input
    """

    if not learners_string:
        raise ValueError('Learners string is empty.')

    # "all" expands to every known abbreviation
    if learners_string == 'all':
        return set(LEARNER_DICT_KEYS)

    # Anything left over after taking away the known abbreviations is
    # an unrecognized learner
    requested = set(learners_string.split(','))
    if requested - LEARNER_DICT_KEYS:
        raise ValueError('Found unrecognized learner(s) in list of '
                         'passed-in learners: {0}. Available learners: {1}.'
                         .format(', '.join(requested), LEARNER_ABBRS_STRING))
    return requested
def parse_non_nlp_features_string(features_string: str,
                                  prediction_label: str) -> set:
    """
    Parse command-line argument consisting of a set of non-NLP features
    (or one of the values "all" or "none" for all or none of the
    possible non-NLP features).

    If the prediction label belongs to one of the families of related
    labels (time, friends, helpful, achievements), no member of that
    family may be used as a feature since the information could be
    duplicated: with "all" the whole family is left out and with an
    explicit list any member of the family is rejected. If the
    prediction label belongs to no family, only the label itself is
    left out/rejected.

    :param features_string: comma-separated list of non-NLP features
                            (or "all"/"none" for all/none of the
                            possible non-NLP features)
    :type features_string: str
    :param prediction_label: the feature that is being predicted
    :type prediction_label: str

    :returns: set of non-NLP features to use
    :rtype: set

    :raises ValueError: if either argument is empty, the prediction
                        label is unrecognized, unrecognized features
                        are found in the input, or there is a conflict
                        between features included in the input and the
                        feature used as the prediction label
    """

    if not features_string:
        raise ValueError('Features string is empty.')
    if not prediction_label or prediction_label not in LABELS:
        raise ValueError('Prediction label is empty or is an unrecognized '
                         'value.')

    if features_string == 'none':
        return set()

    # Families of related labels as (labels, description, indefinite
    # article for the description) triples. The families are disjoint,
    # so the prediction label belongs to at most one of them.
    label_families = ((TIME_LABELS, 'time-related', 'a'),
                      (FRIENDS_LABELS, 'friends-related', 'a'),
                      (HELPFUL_LABELS, 'helpful-related', 'a'),
                      (ACHIEVEMENTS_LABELS, 'achievements-related', 'an'))
    family = next((entry for entry in label_families
                   if prediction_label in entry[0]),
                  None)

    if features_string == 'all':
        # Leave out the prediction label's whole family or, if it has
        # no family, just the prediction label itself
        excluded = family[0] if family is not None else {prediction_label}
        return set(LABELS) - excluded

    non_nlp_features = set(features_string.split(','))

    # Raise an exception if unrecognized labels are found
    if not non_nlp_features.issubset(LABELS):
        raise ValueError('Found unrecognized feature(s) in the list of '
                         'passed-in non-NLP features: {0}. Available '
                         'features: {1}.'
                         .format(', '.join(non_nlp_features), LABELS_STRING))

    # Raise an exception if there are conflicts between the prediction
    # label and the set of non-NLP labels to use
    if family is not None:
        family_labels, description, article = family
        if non_nlp_features.intersection(family_labels):
            raise ValueError('The list of non-NLP features should not '
                             'contain any of the {0} features if the '
                             'prediction label is itself {1} {0} feature.'
                             .format(description, article))
    elif prediction_label in non_nlp_features:
        # A label outside of every family must still not be used to
        # predict itself (the "all" branch above leaves it out too)
        raise ValueError('The list of non-NLP features should not contain '
                         'the prediction label itself: {0}.'
                         .format(prediction_label))

    return non_nlp_features
def parse_games_string(games_string: str) -> set:
    """
    Convert the games string from the command-line into a set of valid
    games.

    :param games_string: comma-separated list of games (or "all")
    :type games_string: str

    :returns: set of games (empty if `games_string` is empty, every
              valid game if it is "all")
    :rtype: set

    :raises ValueError: if unrecognized game IDs are found in the input
    """

    # Nothing specified: no games
    if not games_string:
        return set()

    # "all" stands for every valid game
    if games_string == 'all':
        return set(VALID_GAMES)

    # Otherwise, every comma-separated entry must be a valid game
    specified_games = games_string.split(',')
    if not all(game in VALID_GAMES for game in specified_games):
        raise ValueError('Found unrecognized games in the list of specified '
                         'games: {0}. These are the valid games (in addition '
                         'to using "all" for all games): {1}.'
                         .format(', '.join(specified_games),
                                 ', '.join(VALID_GAMES)))

    return set(specified_games)