# Incremental Learning with scikit-learn

In [2]:
from src.features import *
from util.datasets import *
from util.mongodb import *

import numpy as np
from bson import BSON
import matplotlib
# Force matplotlib to not use any Xwindows backend.
#matplotlib.use('Agg')
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans # Non-incremental learning
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_selection import (chi2,
                                       SelectKBest)
from sklearn.naive_bayes import (BernoulliNB,
                                 MultinomialNB)
from sklearn.linear_model import (Perceptron,
                                  SGDRegressor,
                                  SGDClassifier,
                                  PassiveAggressiveRegressor,
                                  PassiveAggressiveClassifier)
from sklearn.decomposition import (IncrementalPCA,
                                   MiniBatchDictionaryLearning)
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import (r2_score,
                             precision_score,
                             f1_score,
                             average_precision_score,
                             accuracy_score,
                             confusion_matrix)
from scipy.stats import pearsonr

In [3]:
# Running Mongo server on port 2700 and running on my own personal server
# ('localhost' in this case)
host = 'localhost'
port = 2700
db = connect_to_db(host=host,
                   port=port)

In [4]:
# Number of training/test reviews across all games
db.count()

54051

In [5]:
# List games that the database contains data for
! ls ../data/*jsonlines | awk -F/ '{print $NF}'

Arma_3.jsonlines
Counter_Strike_Global_Offensive.jsonlines
Counter_Strike.jsonlines
Dota_2.jsonlines
Football_Manager_2015.jsonlines
Garrys_Mod.jsonlines
Grand_Theft_Auto_V.jsonlines
sample.jsonlines
Sid_Meiers_Civilization_5.jsonlines
Team_Fortress_2.jsonlines
The_Elder_Scrolls_V.jsonlines
Warframe.jsonlines


In [6]:
# Let's get a sense for the kind of data that is contained in each document
# (not including the NLP features, which have to be decoded, anyway)
db.find_one({},
            {'nlp_features': 0})

{'_id': ObjectId('560394d3cbb14611d0957f1c'),
 'achievement_progress': {'num_achievements_attained': 7,
  'num_achievements_percentage': 0.16279069767441862,
  'num_achievements_possible': 43},
 'appid': '107410',
 'bin_factor': 2.0,
 'bin_ranges': [[0.0, 338.1], [338.2, 1014.4], [1014.5, 2367.0]],
 'binarized': True,
 'date_posted': 'Dec 15, 2013, 7:32PM',
 'date_updated': None,
 'found_helpful_percentage': 0.5,
 'friend_player_level': 7,
 'game': 'Arma_3',
 'id_string': '560394d3cbb14611d0957f1c',
 'nbins': 3,
 'num_badges': 5,
 'num_comments': 1,
 'num_found_funny': 0,
 'num_found_helpful': 2,
 'num_found_unhelpful': 2,
 'num_friends': 35,
 'num_games_owned': 75,
 'num_groups': 7,
 'num_guides': 0,
 'num_reviews': 1,
 'num_screenshots': 789,
 'num_voted_helpfulness': 4,
 'num_workshop_items': 1,
 'orig_url': 'http://steamcommunity.com/app/107410/homecontent/?userreviewsoffset=5150&p=1&itemspage=516&screenshotspage=516&videospage=516&artpage=516&allguidepage=516&webguidepage=516&inte

In [7]:
# Feature types
db.find_one({},
            {'nlp_features': 0}).keys()

dict_keys(['nbins', 'total_game_hours_bin', 'num_voted_helpfulness', '_id', 'friend_player_level', 'rating', 'bin_factor', 'num_workshop_items', 'orig_url', 'num_found_unhelpful', 'found_helpful_percentage', 'game', 'num_comments', 'appid', 'review_url', 'steam_id_number', 'achievement_progress', 'num_friends', 'profile_url', 'num_groups', 'binarized', 'date_updated', 'num_found_helpful', 'num_badges', 'num_reviews', 'num_games_owned', 'total_game_hours', 'num_found_funny', 'num_guides', 'username', 'total_game_hours_last_two_weeks', 'num_screenshots', 'partition', 'bin_ranges', 'id_string', 'review', 'date_posted'])

In [8]:
# Let's use Arma 3
game_id = 'Arma_3'
# Create cursors for clustering/exploration and training/test sets
# (limiting the test set to 500 for now)
#dev_cursor = db.find({'game': game_id,
#                      'partition': 'training'},
#                     timeout=False)
#dev_cursor.batch_size = 20
train_cursor = db.find({'game': game_id,
                        'partition': 'training'},
                       timeout=False)
train_cursor.batch_size = 20
test_cursor = db.find({'game': game_id,
                       'partition': 'test'},
                      timeout=False).limit(500)
test_cursor.batch_size = 20

In [9]:
# Let's make a training set of 100 reviews that includes all of the NLP
# features + most of the other features like "num_found_funny", etc. and
# where the thing we're trying to predict is 'total_game_hours_bin'.
# We will also make a test set that consists of all test set review
# documents.
non_nlp_feature_types = ['num_guides', 'num_games_owned', 'num_friends',
                         'num_voted_helpfulness', 'num_groups',
                         'num_workshop_items', 'num_reviews',
                         'num_found_funny', 'friend_player_level',
                         'num_badges', 'num_found_helpful',
                         'num_screenshots', 'num_found_unhelpful',
                         'found_helpful_percentage', 'num_comments']
hours_feature = 'total_game_hours_bin'
_id = 'id_string'

In [10]:
def get_dev_data(training_data_cursor,
                 inc_size,
                 non_nlp_features_to_use):
    '''
    Get a list of development data dictionaries to use in dataset
    exploration.

    :param training_data_cursor: cursor for training partition
                                 documents
    :type training_data_cursor: pymongo.cursor.Cursor object
    :param inc_size: number of documents to extract
    :type inc_size: int
    :param non_nlp_features_to_use: list of non-NLP features to add into
                                    the feature dictionaries 
    :type non_nlp_features_to_use: list of str
    :returns: list of dict
    '''

    if inc_size < 1:
        raise ValueError('inc_size parameter should be positive integer')
    data = []
    i = 0
    while i < inc_size:
        try:
            review_doc = next(training_data_cursor)
        except StopIteration:
            i = inc_size
            continue
        review_doc_get = review_doc.get
        # Make dictionary of features
        features = {feat: val for feat, val
                    in BSON.decode(review_doc_get('nlp_features')).items()
                    if val and val != float("NaN")}
        features.update({feat: review_doc_get(feat)
                         for feat in non_nlp_features_to_use
                         if feat != 'achievement_progress'
                            and review_doc_get(feat)
                            and review_doc_get(feat) != float("NaN")})
        # Add in the rest of the features
        if 'achievement_progress' in non_nlp_features_to_use:
            features.update({feat: val for feat, val
                             in review_doc_get('achievement_progress').items()
                             if val and val != float("NaN")})
        data.append(features)
        i += 1
    return data

In [11]:
# Get development data
#dev_data = get_dev_data(dev_cursor,
#                        500,
#                        non_nlp_feature_types + [hours_feature])

In [12]:
# Vectorize the data
#v_dev = DictVectorizer(sparse=True)
#X_dev = v_dev.fit_transform(dev_data)

In [13]:
# Make a K-means clusterer and fit it with the development data
#num_clusters = 10
#km = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10)
#km.fit(X_dev)

In [14]:
#clusters = km.labels_.tolist()

In [15]:
#clusters[:10]

In [16]:
#Counter(clusters)

In [17]:
#km.cluster_centers_.argsort()

In [18]:
#labels = km.labels_
#cluster_centers = km.cluster_centers_
#km_labels_unique = np.unique(labels)

In [19]:
#pca = PCA(n_components=num_clusters).fit(X_dev.toarray())

In [20]:
#km_pca = KMeans(init=pca.components_, n_clusters=num_clusters, n_init=1)
#km_pca.fit(X_dev)

In [21]:
#km_pca.labels_

In [22]:
def get_train_data_iteration(training_data_cursor,
                             inc_size,
                             non_nlp_features_to_use,
                             y_feature):
    '''
    Get a list of training data dictionaries to use in model training.

    :param training_data_cursor: cursor for training documents
    :type training_data_cursor: pymongo.cursor.Cursor object
    :param inc_size: number of training documents to extract
    :type inc_size: int
    :param non_nlp_features_to_use: list of non-NLP features to add into
                                    the feature dictionaries 
    :type non_nlp_features_to_use: list of str
    :param y_feature: feature to use as the "score"
    :type y_feature: str
    :returns: list of dict
    '''

    if inc_size < 1:
        raise ValueError('inc_size parameter should be positive integer')
    if y_feature in non_nlp_features_to_use:
        raise Exception('y_feature must be a feature that is not in the '
                        'non_nlp_features_to_use list')
    data = []
    i = 0
    while i < inc_size:
        try:
            review_doc = next(training_data_cursor)
        except StopIteration:
            i = inc_size
            continue
        review_doc_get = review_doc.get
        # Make dictionary of features
        features = {feat: val for feat, val
                    in BSON.decode(review_doc_get('nlp_features')).items()
                    if val and val != float("NaN")}
        features.update({feat: review_doc_get(feat)
                         for feat in non_nlp_features_to_use
                         if feat != 'achievement_progress'
                            and review_doc_get(feat)
                            and review_doc_get(feat) != float("NaN")})
        # Add in the rest of the features
        if 'achievement_progress' in non_nlp_features_to_use:
            features.update({feat: val for feat, val
                             in review_doc_get('achievement_progress').items()
                             if val and val != float("NaN")})
        data.append(dict(y=review_doc_get(y_feature),
                         id=review_doc_get(_id),
                         x=features))
        i += 1
    return data

In [23]:
def get_test_data(test_data_cursor,
                  non_nlp_features_to_use,
                  y_feature):
    '''
    Get a list of test data dictionaries to use in model testing.

    :param test_data_cursor: cursor for test documents
    :type test_data_cursor: pymongo.cursor.Cursor object
    :param non_nlp_features_to_use: list of non-NLP features to add into
                                    the feature dictionaries 
    :type non_nlp_features_to_use: list of str
    :param y_feature: feature to use as the "score"
    :type y_feature: str
    :returns: list of dict
    '''

    if y_feature in non_nlp_features_to_use:
        raise Exception('y_feature must be a feature that is not in the '
                        'non_nlp_features_to_use list')
    data = []
    for review_doc in test_data_cursor:
        review_doc_get = review_doc.get
        # Make dictionary of features
        features = {feat: val for feat, val
                    in BSON.decode(review_doc_get('nlp_features')).items()
                    if val and val != float("NaN")}
        features.update({feat: review_doc_get(feat)
                         for feat in non_nlp_features_to_use
                         if feat != 'achievement_progress'
                            and review_doc_get(feat)
                            and review_doc_get(feat) != float("NaN")})
        # Add in the rest of the features
        if 'achievement_progress' in non_nlp_features_to_use:
            features.update({feat: val for feat, val
                             in review_doc_get('achievement_progress').items()
                             if val and val != float("NaN")})
        data.append(dict(y=review_doc_get(y_feature),
                         id=review_doc_get(_id),
                         x=features))
    return data

In [24]:
def get_stats(_y_test, _y_preds):
    """
    Get some statistics about the model's performance.

    :param _y_test: test set values
    :type _y_test: np.array
    :param _y_train: predictions
    :type _y_train: np.array
    :returns: tuple
    """

    _pearsonr = pearsonr(y_test,
                           y_train_preds_1)
    _r2 = r2_score(_y_test,
                   _y_preds)
    _prec_micro = precision_score(_y_test,
                                  _y_preds,
                                  labels=classes,
                                  average='micro')
    _prec_macro = precision_score(_y_test,
                                  _y_preds,
                                  labels=classes,
                                  average='macro')
    _prec_weighted = precision_score(_y_test,
                                     _y_preds,
                                     labels=classes,
                                     average='weighted')
    _f1_micro = f1_score(_y_test,
                         _y_preds,
                         labels=classes,
                         average='micro')
    _f1_macro = f1_score(_y_test,
                         _y_preds,
                         labels=classes,
                         average='macro')
    _f1_weighted = f1_score(_y_test,
                            _y_preds,
                            labels=classes,
                            average='weighted')
    _acc = accuracy_score(_y_test,
                          _y_preds,
                          normalize=True)
    _conf_mat = confusion_matrix(_y_test,
                                 _y_preds,
                                 labels=classes)
    
    return (_pearsonr,
            _r2,
            _prec_micro,
            _prec_macro,
            _prec_weighted,
            _f1_micro,
            _f1_macro,
            _f1_weighted,
            _acc,
            _conf_mat)

In [25]:
test_data = get_test_data(test_cursor,
                          non_nlp_feature_types,
                          hours_feature)

In [26]:
# Sample test data point
print('id: ' + test_data[0]['id'])
print('total hours played value (binned): ' + str(test_data[0]['y']))
print('sample of features for review/reviewer: '
      + str(dict(list(test_data[0]['x'].items())[:100])))

id: 560394d4cbb14611d095887d
total hours played value (binned): 1
sample of features for review/reviewer: {"'s storyline": 1, 'like 270': 1, 'in rping': 1, '40 ft': 1, 'cause theres': 1, 'may mean': 1, 'expect strong': 1, 'with extreme': 1, 'decade': 1, 'triple-a': 1, 'brilliant': 1, '>killed': 1, 'asks why': 1, 'best bet': 1, 'ended whole': 1, '4000mhz': 1, 'scenario "': 1, 'out this': 1, 'no-scope again': 1, 'm': 1, 'play wasteland': 1, 'continue to': 1, 'hill through': 1, 'mysteries': 1, 'inducing and': 1, "n't really": 1, 'and add': 1, 'rediculous': 1, 'the supercomputer': 1, '.. with': 1, '! (': 1, 'hard': 1, 'not on': 1, 'and e': 1, 'out rather': 1, "cut '": 1, 'competent with': 1, 'have mod': 1, 'evolve will': 1, 'rewards are': 1, 'from other': 1, 'bugs da': 1, 'about who': 1, 'item system': 1, 'real names': 1, 'downside ,': 1, 'like rpg': 1, 'unique weapons/vehicles': 1, 'lagy i': 1, "thought i'd": 1, 'simulator experience': 1, 'there if': 1, 'grenades ,': 1, 'milsimulator': 1,

In [27]:
test_ids = np.array([_data['id'] for _data in test_data])
test_ids[:10]

array(['560394d4cbb14611d095887d', '560394d4cbb14611d095887e',
       '560394d4cbb14611d095887f', '560394d4cbb14611d0958880',
       '560394d4cbb14611d0958881', '560394d4cbb14611d0958882',
       '560394d4cbb14611d0958883', '560394d4cbb14611d0958884',
       '560394d4cbb14611d0958885', '560394d4cbb14611d0958886'], 
      dtype='<U24')

In [28]:
y_test = np.array([_data['y'] for _data in test_data])
y_test[:10]

array([1, 1, 1, 2, 2, 1, 1, 1, 1, 2])

In [29]:
test_feature_dicts = [_data['x'] for _data in test_data]
dict(list(test_feature_dicts[0].items())[:50])

{'! (': 1,
 "'s storyline": 1,
 '40 ft': 1,
 '4000mhz': 1,
 '>killed': 1,
 'about who': 1,
 'and add': 1,
 'and e': 1,
 'are stunning': 1,
 'asks why': 1,
 'best bet': 1,
 'brilliant': 1,
 'bugs da': 1,
 'but delivered': 1,
 'cause theres': 1,
 'competent with': 1,
 'continue to': 1,
 "cut '": 1,
 'decade': 1,
 'ended whole': 1,
 'evolve will': 1,
 'expect strong': 1,
 'exploding i': 1,
 'from other': 1,
 'grenades ,': 1,
 'hard': 1,
 'have mod': 1,
 'hill through': 1,
 'in rping': 1,
 'inducing and': 1,
 'm': 1,
 'may mean': 1,
 'mysteries': 1,
 "n't really": 1,
 'no-scope again': 1,
 'not on': 1,
 'only horribly-done': 1,
 'out rather': 1,
 'out this': 1,
 'packed first': 1,
 'play wasteland': 1,
 'real names': 1,
 'rediculous': 1,
 'rewards are': 1,
 'scenario "': 1,
 'stratis island': 1,
 'the supercomputer': 1,
 "thought i'd": 1,
 'triple-a': 1,
 'with extreme': 1}

## First Round of Learning

### Vectorize Features
- Vectorize the test set features
- Vectorize a small portion of the training features (the first 100) and all of the test features, partially train the model, and then repeat

In [30]:
train_data_1 = get_train_data_iteration(train_cursor,
                                        100,
                                        non_nlp_feature_types,
                                        hours_feature)

In [31]:
# Sample training data point
print('id: ' + train_data_1[0]['id'])
print('total hours played value (binned): ' + str(train_data_1[0]['y']))
print('sample of features for review/reviewer: '
      + str(dict(list(train_data_1[0]['x'].items())[:100])))

id: 560394d3cbb14611d0957f1c
total hours played value (binned): 2
sample of features for review/reviewer: {'odd': 1, 'cluster19': 1, 'play': 1, ' grea': 1, 'reat': 1, 'Acti': 1, '7 ': 1, 'om': 1, 'ial (': 1, 'ver': 1, ' Gr': 1, '10:AMOD:sound': 1, 'ga': 1, 'al th': 1, 'num_groups': 7, 'n ': 1, 'Actin': 1, 'eplay': 1, ' Ov': 1, 'g ': 1, '- voice': 1, 'miss': 1, 'at': 1, 'ra': 1, 'ics': 1, ' a': 1, 'ice A': 1, 'nd 1': 1, 'oi': 1, 'cluster63': 1, 'd ': 1, 'ddin': 1, 'os)': 1, '- Sou': 1, 'on s': 1, 'oddin': 1, 'ios': 1, 'g 1': 1, 'num_voted_helpfulness': 4, '10 -': 1, '5 - ': 1, 'a ': 1, 'potential': 1, 'ti': 1, ' (Mod': 1, 'sound 10': 1, 'me': 1, ' mis': 1, 'und': 1, 'num_comments': 1, ' a gr': 1, ' t': 1, 'ssi': 1, 'nario': 1, 'entia': 1, 'om m': 1, 'Moddi': 1, 'Game': 1, 'acting 10': 1, 'reat ': 1, 'Mod': 1, 'ting': 1, ' thi': 1, 'scenario:VMOD:custom': 1, 'ame': 1, 'y ': 1, '5 ': 1, 'th': 1, 'nd': 1, 'custo': 1, 'cen': 1, 'Voi': 1, 'e Ac': 1, 'al (M': 1, 'custom mission': 1, 'rap': 1,

In [32]:
train_ids_1 = np.array([_data['id'] for _data in train_data_1])
y_train_1 = np.array([_data['y'] for _data in train_data_1])
train_feature_dicts_1 = [_data['x'] for _data in train_data_1]
classes = np.unique(y_train_1)

In [33]:
v = DictVectorizer(sparse=True)

In [34]:
X_train_1 = v.fit_transform(train_feature_dicts_1)
X_test = v.transform(test_feature_dicts)

In [35]:
# Make Perceptron learner
learner = Perceptron()

In [36]:
learner.get_params()

{'alpha': 0.0001,
 'class_weight': None,
 'eta0': 1.0,
 'fit_intercept': True,
 'n_iter': 5,
 'n_jobs': 1,
 'penalty': None,
 'random_state': 0,
 'shuffle': True,
 'verbose': 0,
 'warm_start': False}

In [37]:
learner.partial_fit(X_train_1,
                    y_train_1,
                    classes=classes)

Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=True,
      verbose=0, warm_start=False)

In [38]:
y_train_preds_1 = learner.predict(X_test)
[(y, y_pred) for y, y_pred in zip(y_test,
                                  y_train_preds_1)]

[(1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (2, 1),
 (3, 1),
 (2, 1),
 (1, 1),
 (2, 1),
 (2, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (1, 2),
 (2, 1),
 (1, 3),
 (1, 1),
 (3, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 2),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (3, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 3),
 (2, 1),
 (1, 1),
 (3, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (3, 1),
 (2, 1),
 (1, 1),
 (3, 1),
 (1, 1),
 (1, 1),
 (2, 3),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (3, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 

In [39]:
(pearsonr_1,
 r2_1,
 prec_micro_1,
 prec_macro_1,
 prec_weighted_1,
 f1_micro_1,
 f1_macro_1,
 f1_weighted_1,
 acc_1,
 conf_mat_1) = get_stats(y_test,
                         y_train_preds_1)

In [40]:
print('Some stats\n\n')
print('Pearson: {} (p = {})'.format(*pearsonr_1))
print('r2 score: {}'.format(r2_1))
print('Micro precision score: {}'.format(prec_micro_1))
print('Macro precision score: {}'.format(prec_macro_1))
print('Weighted precision score: {}'.format(prec_weighted_1))
print('Micro f1 score: {}'.format(f1_micro_1))
print('Macro f1 score: {}'.format(f1_macro_1))
print('Weighted f1 score: {}'.format(f1_weighted_1))
print('Accuracy score: {}'.format(acc_1))
print('\nConfusion matrix:\n\n{}'.format(conf_mat_1))

Some stats


Pearson: 0.010476108371144748 (p = 0.8152388425640384)
r2 score: -0.6156452332437088
Micro precision score: 0.686
Macro precision score: 0.30602240896358546
Weighted precision score: 0.5489663865546218
Micro f1 score: 0.686
Macro f1 score: 0.29319793047827014
Weighted f1 score: 0.5917310324616611
Accuracy score: 0.686

Confusion matrix:

[[341   6  11]
 [109   1   5]
 [ 26   0   1]]


## Second Round of Learning

In [41]:
train_data_2 = get_train_data_iteration(train_cursor,
                                        100,
                                        non_nlp_feature_types,
                                        hours_feature)
# Sample training data point
print('id: ' + train_data_2[0]['id'])
print('total hours played value (binned): ' + str(train_data_2[0]['y']))
print('sample of features for review/reviewer: '
      + str(dict(list(train_data_2[0]['x'].items())[:100])))

id: 560394d3cbb14611d0957f80
total hours played value (binned): 1
sample of features for review/reviewer: {'wrong game,this': 1, 'in rping': 1, 'your kdr': 1, 'therefore': 1, 'game of': 1, 'campaign are': 1, 'bugs': 1, 'so i': 1, 'only as': 1, 'real stress': 1, 'options of': 1, 'better': 1, 'for work': 1, 'job': 1, 'scenario "': 1, 'if he': 1, 'there are': 1, 'mods are': 1, 'on the': 1, 'rt': 1, 'hill through': 1, 'by 4': 1, ' act': 1, 'telling': 1, 'and glitches': 1, 'them ,': 1, ', especailly': 1, 'hard': 1, 'open-ended goals': 1, 'and e': 1, 'for them': 1, 'cluster63': 1, 'will continue': 1, 'an objective': 1, 's, ': 1, 'playing this': 1, 'requires': 1, 'mods providing': 1, 'ich ': 1, '10-25fps': 1, 'actual': 1, 'great and': 1, 'endless amounts': 1, 'support': 1, 'accessible': 1, 'combat .': 1, 'guns': 1, 'chatting': 1, 'me this': 1, 'definetly': 1, 'moves': 1, 'inves': 1, 'mean huge.extreme': 1, 'sounds like': 1, ', go': 1, 'physics': 1, 'experience .': 1, 'finally knew': 1, 'f m':

In [42]:
train_ids_2 = np.array([_data['id'] for _data in train_data_2])
y_train_2 = np.array([_data['y'] for _data in train_data_2])
train_feature_dicts_2 = [_data['x'] for _data in train_data_2]

In [43]:
X_train_2 = v.transform(train_feature_dicts_2)

In [44]:
X_test = v.transform(test_feature_dicts)

In [45]:
learner.partial_fit(X_train_2,
                    y_train_2)

Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=True,
      verbose=0, warm_start=False)

In [46]:
y_train_preds_2 = learner.predict(X_test)
[(y, y_pred) for y, y_pred in zip(y_test,
                                  y_train_preds_2)]

[(1, 1),
 (1, 1),
 (1, 2),
 (2, 1),
 (2, 1),
 (1, 3),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (2, 1),
 (3, 2),
 (2, 1),
 (1, 1),
 (2, 1),
 (2, 3),
 (2, 3),
 (1, 2),
 (1, 1),
 (1, 3),
 (2, 1),
 (1, 1),
 (1, 1),
 (3, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (3, 1),
 (1, 2),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (2, 1),
 (1, 1),
 (1, 3),
 (1, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (1, 2),
 (1, 3),
 (2, 1),
 (1, 1),
 (3, 1),
 (1, 3),
 (1, 1),
 (1, 1),
 (2, 2),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 2),
 (1, 1),
 (1, 2),
 (1, 2),
 (1, 1),
 (1, 2),
 (2, 2),
 (1, 2),
 (1, 1),
 (3, 1),
 (2, 2),
 (1, 1),
 (3, 2),
 (1, 1),
 (1, 1),
 (2, 3),
 (1, 1),
 (1, 2),
 (1, 1),
 (2, 3),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 2),
 (1, 2),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 (2, 2),
 (1, 1),
 (1, 2),
 (1, 1),
 (3, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 2),
 (1, 1),
 (1, 3),
 (2, 1),
 (1, 1),
 

In [47]:
(pearsonr_2,
 r2_2,
 prec_micro_2,
 prec_macro_2,
 prec_weighted_2,
 f1_micro_2,
 f1_macro_2,
 f1_weighted_2,
 acc_2,
 conf_mat_2) = get_stats(y_test,
                         y_train_preds_2)
print('Some stats\n\n')
print('Pearson: {} (p = {})'.format(*pearsonr_2))
print('r2 score: {}'.format(r2_2))
print('Micro precision score: {}'.format(prec_micro_2))
print('Macro precision score: {}'.format(prec_macro_2))
print('Weighted precision score: {}'.format(prec_weighted_2))
print('Micro f1 score: {}'.format(f1_micro_2))
print('Macro f1 score: {}'.format(f1_macro_2))
print('Weighted f1 score: {}'.format(f1_weighted_2))
print('Accuracy score: {}'.format(acc_2))
print('\nConfusion matrix:\n\n{}'.format(conf_mat_2))

Some stats


Pearson: 0.010476108371144748 (p = 0.8152388425640384)
r2 score: -0.9532427446677674
Micro precision score: 0.604
Macro precision score: 0.3745277917622271
Weighted precision score: 0.6159959277283661
Micro f1 score: 0.604
Macro f1 score: 0.3722430295464003
Weighted f1 score: 0.6094113607990013
Accuracy score: 0.604

Confusion matrix:

[[269  62  27]
 [ 70  30  15]
 [ 15   9   3]]


## Third Round of Learning

In [48]:
train_data_3 = get_train_data_iteration(train_cursor,
                                        100,
                                        non_nlp_feature_types,
                                        hours_feature)
print('id: ' + train_data_3[0]['id'])
print('total hours played value (binned): ' + str(train_data_3[0]['y']))
print('sample of features for review/reviewer: '
      + str(dict(list(train_data_3[0]['x'].items())[:100])))

id: 560394d3cbb14611d0957fe4
total hours played value (binned): 1
sample of features for review/reviewer: {'in rping': 1, 'game of': 1, 'bugs': 1, 'lakeside': 1, 'only as': 1, 'options of': 1, '. allowing': 1, 'easy to': 1, 'everyday !': 1, 'scenario "': 1, 'out this': 1, 'realistic but': 1, 'houses/outhouses/farms/industrial': 1, 'mods are': 1, 'd dog': 1, 'rt': 1, 'hill through': 1, "n't really": 1, ', especailly': 1, 'die ,': 1, 'hard': 1, 'open-ended goals': 1, 'very:ROOT:and': 1, 'for them': 1, 'cluster63': 1, 'spots .': 1, 'an objective': 1, 'for people': 1, 'requires': 1, 'mods providing': 1, 'best pcs': 1, 'item system': 1, 'great and': 1, 'endless amounts': 1, 'military simulators': 1, 'partner then': 1, 'combat .': 1, 'call air': 1, 'chatting': 1, 'moves': 1, 'bleeding to': 1, 'me this': 1, 'at medium': 1, 'game truly': 1, 'physics': 1, 'location you': 1, 'grandmother and': 1, 'and group': 1, 'gun given': 1, 'saw about': 1, 'owned': 1, ' goo': 1, "game 's": 1, 'r9 280x': 1, '

In [49]:
train_ids_3 = np.array([_data['id'] for _data in train_data_3])
y_train_3 = np.array([_data['y'] for _data in train_data_3])
train_feature_dicts_3 = [_data['x'] for _data in train_data_3]

In [50]:
X_train_3 = v.transform(train_feature_dicts_3)

In [51]:
X_test = v.transform(test_feature_dicts)

In [52]:
learner.partial_fit(X_train_3,
                    y_train_3)

Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=True,
      verbose=0, warm_start=False)

In [53]:
y_train_preds_3 = learner.predict(X_test)
[(y, y_pred) for y, y_pred in zip(y_test,
                                  y_train_preds_3)]

[(1, 1),
 (1, 2),
 (1, 1),
 (2, 1),
 (2, 2),
 (1, 3),
 (1, 2),
 (1, 1),
 (1, 2),
 (2, 1),
 (2, 1),
 (3, 1),
 (2, 1),
 (1, 2),
 (2, 3),
 (2, 3),
 (2, 3),
 (1, 1),
 (1, 1),
 (1, 3),
 (2, 1),
 (1, 2),
 (1, 1),
 (3, 2),
 (1, 2),
 (1, 2),
 (1, 2),
 (1, 2),
 (1, 2),
 (1, 2),
 (1, 2),
 (1, 1),
 (3, 1),
 (1, 1),
 (1, 2),
 (1, 2),
 (1, 1),
 (1, 2),
 (1, 2),
 (1, 2),
 (1, 2),
 (1, 2),
 (2, 1),
 (2, 2),
 (1, 2),
 (1, 3),
 (1, 1),
 (2, 2),
 (1, 1),
 (1, 2),
 (1, 1),
 (1, 3),
 (2, 2),
 (1, 2),
 (3, 2),
 (1, 3),
 (1, 2),
 (1, 2),
 (2, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 2),
 (1, 1),
 (1, 2),
 (1, 1),
 (1, 2),
 (1, 1),
 (1, 1),
 (1, 2),
 (1, 3),
 (2, 1),
 (1, 1),
 (1, 2),
 (3, 1),
 (2, 1),
 (1, 2),
 (3, 1),
 (1, 2),
 (1, 2),
 (2, 3),
 (1, 2),
 (1, 1),
 (1, 1),
 (2, 3),
 (1, 1),
 (1, 2),
 (1, 2),
 (1, 2),
 (2, 1),
 (1, 2),
 (1, 2),
 (1, 1),
 (1, 2),
 (1, 2),
 (2, 2),
 (1, 1),
 (2, 1),
 (1, 2),
 (1, 1),
 (1, 1),
 (3, 1),
 (1, 3),
 (1, 2),
 (1, 2),
 (2, 1),
 (1, 1),
 (1, 2),
 (2, 2),
 (1, 3),
 

In [54]:
(pearsonr_3,
 r2_3,
 prec_micro_3,
 prec_macro_3,
 prec_weighted_3,
 f1_micro_3,
 f1_macro_3,
 f1_weighted_3,
 acc_3,
 conf_mat_3) = get_stats(y_test,
                         y_train_preds_3)
print('Some stats\n\n')
print('Pearson: {} (p = {})'.format(*pearsonr_3))
print('r2 score: {}'.format(r2_3))
print('Micro precision score: {}'.format(prec_micro_3))
print('Macro precision score: {}'.format(prec_macro_3))
print('Weighted precision score: {}'.format(prec_weighted_3))
print('Micro f1 score: {}'.format(f1_micro_3))
print('Macro f1 score: {}'.format(f1_macro_3))
print('Weighted f1 score: {}'.format(f1_weighted_3))
print('Accuracy score: {}'.format(acc_3))
print('\nConfusion matrix:\n\n{}'.format(conf_mat_3))

Some stats


Pearson: 0.010476108371144748 (p = 0.8152388425640384)
r2 score: -2.1348340346519725
Micro precision score: 0.35
Macro precision score: 0.30391928946113494
Weighted precision score: 0.5169084763948498
Micro f1 score: 0.35
Macro f1 score: 0.2696981990288278
Weighted f1 score: 0.39151321900547054
Accuracy score: 0.35

Confusion matrix:

[[126 183  49]
 [ 50  44  21]
 [ 16   6   5]]


## Fourth Round of Learning

In [55]:
train_data_4 = get_train_data_iteration(train_cursor,
                                        100,
                                        non_nlp_feature_types,
                                        hours_feature)
print('id: ' + train_data_4[0]['id'])
print('total hours played value (binned): ' + str(train_data_4[0]['y']))
print('sample of features for review/reviewer: '
      + str(dict(list(train_data_4[0]['x'].items())[:100])))

id: 560394d3cbb14611d0958048
total hours played value (binned): 1
sample of features for review/reviewer: {'in rping': 1, 'my take': 1, 'game of': 1, 'doubt ,': 1, 'bugs': 1, 'lakeside': 1, 'only as': 1, 'options of': 1, ', restart': 1, '. allowing': 1, '>killed': 1, 'asks why': 1, 'koth online': 1, 'glorious but': 1, 'scenario "': 1, 'out this': 1, 'realistic but': 1, 'houses/outhouses/farms/industrial': 1, 'mods are': 1, 'sandbox open': 1, 'fps since': 1, 'continue to': 1, 'hill through': 1, "n't really": 1, 'played dayz': 1, ', especailly': 1, 'die ,': 1, 'hard': 1, 'not on': 1, 'and e': 1, 'for them': 1, 'that refers': 1, 'competent with': 1, "n't see": 1, 'dem': 1, 'an objective': 1, 'for people': 1, 'great game': 1, 'requires': 1, 'mods providing': 1, 'ai in': 1, 'day and': 1, 'best pcs': 1, 'item system': 1, 'great and': 1, 'endless amounts': 1, 'military simulators': 1, 'partner then': 1, 'combat .': 1, 'call air': 1, 'chatting': 1, 'releasing': 1, 'moves': 1, 'bleeding to': 1,

In [56]:
train_ids_4 = np.array([_data['id'] for _data in train_data_4])
y_train_4 = np.array([_data['y'] for _data in train_data_4])
train_feature_dicts_4 = [_data['x'] for _data in train_data_4]

In [57]:
X_train_4 = v.transform(train_feature_dicts_4)

In [58]:
X_test = v.transform(test_feature_dicts)

In [59]:
learner.partial_fit(X_train_4,
                    y_train_4)

Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=True,
      verbose=0, warm_start=False)

In [60]:
y_train_preds_4 = learner.predict(X_test)
[(y, y_pred) for y, y_pred in zip(y_test,
                                  y_train_preds_4)]

[(1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (2, 1),
 (3, 1),
 (2, 1),
 (1, 1),
 (2, 1),
 (2, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (1, 3),
 (2, 1),
 (1, 1),
 (1, 1),
 (3, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (3, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (2, 1),
 (1, 2),
 (1, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 (3, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (3, 1),
 (2, 1),
 (1, 1),
 (3, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (3, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 (1, 1),
 (2, 1),
 (1, 1),
 

In [61]:
(pearsonr_4,
 r2_4,
 prec_micro_4,
 prec_macro_4,
 prec_weighted_4,
 f1_micro_4,
 f1_macro_4,
 f1_weighted_4,
 acc_4,
 conf_mat_4) = get_stats(y_test,
                         y_train_preds_4)
print('Some stats\n\n')
print('Pearson: {} (p = {})'.format(*pearsonr_4))
print('r2 score: {}'.format(r2_4))
print('Micro precision score: {}'.format(prec_micro_4))
print('Macro precision score: {}'.format(prec_macro_4))
print('Weighted precision score: {}'.format(prec_weighted_4))
print('Micro f1 score: {}'.format(f1_micro_4))
print('Macro f1 score: {}'.format(f1_macro_4))
print('Weighted f1 score: {}'.format(f1_weighted_4))
print('Accuracy score: {}'.format(acc_4))
print('\nConfusion matrix:\n\n{}'.format(conf_mat_4))

Some stats


Pearson: 0.010476108371144748 (p = 0.8152388425640384)
r2 score: -0.48301763946997145
Micro precision score: 0.7
Macro precision score: 0.3346100759144237
Weighted precision score: 0.5663757763975156
Micro f1 score: 0.7
Macro f1 score: 0.2985116416150899
Weighted f1 score: 0.5991542882404951
Accuracy score: 0.7

Confusion matrix:

[[348   3   7]
 [110   1   4]
 [ 25   1   1]]


## Fifth Round of Learning