# Incremental Learning with scikit-learn

In [1]:
from operator import or_

from src.features import *
from util.datasets import *
from util.mongodb import *

import numpy as np
import pandas as pd
from bson import BSON
from pymongo import cursor
import matplotlib
# Force matplotlib to not use any Xwindows backend.
#matplotlib.use('Agg')
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans # Non-incremental learning
from sklearn.feature_selection import (chi2,
                                       SelectKBest)
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.naive_bayes import (BernoulliNB,
                                 MultinomialNB)
from sklearn.linear_model import (Perceptron,
                                  SGDRegressor,
                                  PassiveAggressiveRegressor)
from sklearn.decomposition import (IncrementalPCA,
                                   MiniBatchDictionaryLearning)
from sklearn.feature_extraction import DictVectorizer

from sklearn.grid_search import ParameterGrid
from skll.metrics import kappa
from sklearn.metrics import (r2_score,
                             precision_score,
                             f1_score,
                             average_precision_score,
                             accuracy_score,
                             confusion_matrix)
from scipy.stats import pearsonr

In [2]:
# Filter out warnings since there will be a lot of "UndefinedMetricWarning"
# warnings when running IncrementalLearning
import warnings
warnings.filterwarnings("ignore")

In [3]:
seed = 123456789

In [4]:
_DEFAULT_PARAM_GRIDS = \
    {PCA: {'n_components': [None, 'mle'],
           'whiten': [True, False]},
     MiniBatchKMeans: {'n_clusters': [4, 6, 8, 12],
                       'init' : ['k-means++', 'random'],
                       'random_state': [seed]},
     BernoulliNB: {'alpha': [0.1, 0.25, 0.5, 0.75, 1.0]},
     MultinomialNB: {'alpha': [0.1, 0.25, 0.5, 0.75, 1.0]},
     Perceptron: {'penalty': [None, 'l2', 'l1', 'elasticnet'],
                  'alpha': [0.0001, 0.001, 0.01, 0.1],
                  'n_iter': [5, 10],
                  'random_state': [seed]},
     SGDRegressor: {'alpha': [0.000001, 0.00001, 0.0001, 0.001,
                              0.01],
                    'penalty': ['l1', 'l2', 'elasticnet']},
     PassiveAggressiveRegressor:
         {'C': [0.01, 0.1, 1.0, 10.0, 100.0],
          'n_iter': [5, 10],
          'random_state': [seed],
          'loss': ['epsilon_insensitive',
                   'squared_epsilon_insensitive']},
     IncrementalPCA: {'whiten': [True, False]},
     MiniBatchDictionaryLearning:
         {'n_components': [100, 500, 1000, 10000],
          'n_iter': [5, 10, 15],
          'fit_algorithm': ['lars'],
          'transform_algorithm': ['lasso_lars'],
          'random_state': [seed]}}

In [5]:
learners = {'mbkm': MiniBatchKMeans,
            'bnb': BernoulliNB,
            'mnb': MultinomialNB,
            'perc': Perceptron,
            'sgd': SGDRegressor,
            'pagr': PassiveAggressiveRegressor}

In [4]:
# Running Mongo server on port 37017 and running on my own personal server
# ('localhost' in this case)
# I ran "ssh -N -f -L localhost:37017:localhost:2700 mulhod@pool-108-24-47-200.cmdnnj.fios.verizon.net"
# previous to running the command below.
host = 'localhost'
port = 37017
db = connect_to_db(host=host,
                   port=port)

In [7]:
# Number of training/test reviews across all games
db.count()

54051

In [8]:
# List games that the database contains data for
! ls ../data/*jsonlines | awk -F/ '{print $NF}'

Arma_3.jsonlines
Counter_Strike_Global_Offensive.jsonlines
Counter_Strike.jsonlines
Dota_2.jsonlines
Football_Manager_2015.jsonlines
Garrys_Mod.jsonlines
Grand_Theft_Auto_V.jsonlines
sample.jsonlines
Sid_Meiers_Civilization_5.jsonlines
Team_Fortress_2.jsonlines
The_Elder_Scrolls_V.jsonlines
Warframe.jsonlines


In [43]:
# Let's get a sense for the kind of data that is contained in each document
# (not including the NLP features, which have to be decoded, anyway)
db.find_one({},
            {'nlp_features': 0})

{'_id': ObjectId('560394d3cbb14611d0957f1c'),
 'achievement_progress': {'num_achievements_attained': 7,
  'num_achievements_percentage': 0.16279069767441862,
  'num_achievements_possible': 43},
 'appid': '107410',
 'bin_factor': 2.0,
 'bin_ranges': [[0.0, 338.1], [338.2, 1014.4], [1014.5, 2367.0]],
 'binarized': True,
 'date_posted': 'Dec 15, 2013, 7:32PM',
 'date_updated': None,
 'found_helpful_percentage': 0.5,
 'friend_player_level': 7,
 'game': 'Arma_3',
 'id_string': '560394d3cbb14611d0957f1c',
 'nbins': 3,
 'num_badges': 5,
 'num_comments': 1,
 'num_found_funny': 0,
 'num_found_helpful': 2,
 'num_found_unhelpful': 2,
 'num_friends': 35,
 'num_games_owned': 75,
 'num_groups': 7,
 'num_guides': 0,
 'num_reviews': 1,
 'num_screenshots': 789,
 'num_voted_helpfulness': 4,
 'num_workshop_items': 1,
 'orig_url': 'http://steamcommunity.com/app/107410/homecontent/?userreviewsoffset=5150&p=1&itemspage=516&screenshotspage=516&videospage=516&artpage=516&allguidepage=516&webguidepage=516&inte

In [44]:
# Feature types
db.find_one({},
            {'nlp_features': 0}).keys()

dict_keys(['num_found_helpful', 'num_comments', 'num_found_funny', 'found_helpful_percentage', 'id_string', 'profile_url', 'nbins', 'total_game_hours_bin', 'friend_player_level', 'review', 'date_posted', 'num_groups', 'num_badges', 'achievement_progress', 'num_reviews', 'partition', 'num_workshop_items', 'date_updated', 'num_voted_helpfulness', 'orig_url', 'num_games_owned', 'bin_ranges', 'num_friends', 'bin_factor', 'total_game_hours', 'num_guides', 'review_url', 'total_game_hours_last_two_weeks', 'rating', 'num_screenshots', 'steam_id_number', 'username', 'num_found_unhelpful', 'game', '_id', 'binarized', 'appid'])

In [6]:
# Let's use Arma 3
game_id = 'Arma_3'
# Create cursors for clustering/exploration and training/test sets
# (limiting the test set to 500 for now)
#dev_cursor = db.find({'game': game_id,
#                      'partition': 'training'},
#                     timeout=False)
#dev_cursor.batch_size = 20
train_cursor = db.find({'game': game_id,
                        'partition': 'training'},
                       timeout=False)
train_cursor.batch_size = 50
test_cursor = db.find({'game': game_id,
                       'partition': 'test'},
                      timeout=False).limit(500)
test_cursor.batch_size = 50

In [8]:
train_cursor.count()

2401

In [51]:
# Let's make a training set of 100 reviews that includes all of the NLP
# features + most of the other features like "num_found_funny", etc. and
# where the thing we're trying to predict is 'total_game_hours_bin'.
# We will also make a test set that consists of all test set review
# documents.
non_nlp_feature_types = ['num_guides', 'num_games_owned', 'num_friends',
                         'num_voted_helpfulness', 'num_groups',
                         'num_workshop_items', 'num_reviews',
                         'num_found_funny', 'friend_player_level',
                         'num_badges', 'num_found_helpful',
                         'num_screenshots', 'num_found_unhelpful',
                         'found_helpful_percentage', 'num_comments']
hours_feature = 'total_game_hours_bin'
all_non_nlp_features = non_nlp_feature_types + [hours_feature]
_id = 'id_string'

In [53]:
class IncrementalLearning:
    '''
    Class for conducting incremental learning experiments with a
    parameter grid and a learner.
    '''

    # Constants
    __nlp_feats__ = 'nlp_features'
    __achieve_prog__ = 'achievement_progress'
    __nan__ = float("NaN")
    __x__ = 'x'
    __y__ = 'y'
    __id_string__ = 'id_string'
    __id__ = 'id'
    __learning_round__ = 'learning_round'
    __prediction_label__ = 'prediction_label'
    __test_labels_and_preds__ = 'test_set_labels/test_set_predictions'
    __learner__ = 'learner'
    __learners_requiring_classes__ = ['BernoulliNB', 'MultinomialNB',
                                      'Perceptron']
    __params__ = 'params'
    __training_samples__ = 'training_samples'
    __possible_non_nlp_features__ = \
        ['num_guides', 'num_games_owned', 'num_friends',
         'num_voted_helpfulness', 'num_groups', 'num_workshop_items',
         'num_reviews', 'num_found_funny', 'friend_player_level',
         'num_badges', 'num_found_helpful', 'num_screenshots',
         'num_found_unhelpful', 'found_helpful_percentage', 'num_comments',
         'total_game_hours', 'total_game_hours_bin',
         'total_game_hours_last_two_weeks', 'num_achievements_percentage',
         'num_achievements_possible']
    __tab_join__ = '\t'.join
    __cnfmat_row__ = '{}{}\n'.format
    __cnfmat_header__ = ('confusion_matrix (rounded predictions) '
                         '(row=human, col=machine, labels={}):\n')

    def __init__(self, learner, param_grid: dict,
                 training_data_cursor: cursor, test_data_cursor: cursor,
                 round_size: int, non_nlp_features: list, prediction_label: str,
                 rounds=0):
        '''
        Initialize class.

        :param learner: regression algorithm to use for learning
        :type learner: learner type
        :param param_grid: dictionary of parameters mapped to lists
                           of values
        :type param_grid: dict
        :param training_data_cursor: MongoDB cursor for training documents
        :type training_data_cursor: pymongo cursor object
        :param test_data_cursor: MongoDB cursor for test documents
        :type test_data_cursor: pymongo cursor object
        :param round_size: number of training documents to extract in
                           each round
        :type round_size: int
        :param non_nlp_features: list of non-NLP features to add into
                                 the feature dictionaries 
        :type non_nlp_features: list of str
        :param prediction_label: feature to predict
        :type prediction_label: str
        :param rounds: number of rounds of learning (0 for as many as
                       possible)
        :type rounds: int
        :returns: list of dict
        '''

        # Make sure parameters make some sense
        if round_size < 1:
            raise ValueError('The round_size parameter should have a positive'
                             ' value.')
        if prediction_label in non_nlp_features:
            raise Exception('The prediction_label parameter ({}) cannot also '
                            'be in the list of non-NLP features to use in the'
                            ' model:\n\n{}\n.'.format(prediction_label,
                                                      non_nlp_features))
        if not prediction_label in self.__possible_non_nlp_features__:
            raise Exception('The prediction label must be in the set of '
                            'features that can be extracted/used, i.e.: {}.'
                            .format(self.__possible_non_nlp_features__))

        # Learner-related variables
        self.vec = None
        self.params_list = list(ParameterGrid(param_grid))
        self.learner_name = (str(learner.__class__)
                             .rsplit('.', 1)[1]
                             .strip("'>"))
        self.learners = [learner(**kwparams) for kwparams in self.params_list]
        self.learner_stats = [[] for _ in self.learners]

        # Information about what features to use for what purposes
        if all([feat in self.__possible_non_nlp_features__ for feat
                in non_nlp_features]):
            self.non_nlp_features = non_nlp_features
        self.prediction_label = prediction_label

        # Information about incremental learning
        self.round_size = round_size
        self.rounds = rounds
        self.round = 1
        self.NO_MORE_TRAINING_DATA = False

        # Training/test data variables
        self.training_cursor = training_data_cursor
        self.test_cursor = test_data_cursor
        self.test_data = self.get_test_data()
        self.test_ids = [_data[self.__id__] for _data in self.test_data]
        self.test_feature_dicts = [_data[self.__x__] for _data
                                   in self.test_data]
        self.y_test = np.array([_data[self.__y__] for _data
                                in self.test_data])
        self.classes = np.unique(self.y_test)

        # Useful constants for use in make_printable_confusion_matrix
        self.cnfmat_desc = \
            self.__cnfmat_row__(self.__cnfmat_header__.format(self.classes),
                                self.__tab_join__([''] + [str(x) for x
                                                          in self.classes]))

        # Do incremental learning experiments
        self.do_learning_rounds()
        self.learner_stats = [pd.DataFrame(learner_stats) for learner_stats
                              in self.learner_stats]

    def get_stats(self, y_pred):
        """
        Get some statistics about the model's performance on the test
        set.

        :param y_pred: predictions
        :type y_pred: np.array
        :returns: dict
        """

        return {'pearson_r': pearsonr(self.y_test,
                                      y_pred),
                'r2': r2_score(self.y_test,
                               y_pred),
                'precision_micro': precision_score(self.y_test,
                                                   y_pred,
                                                   labels=self.classes,
                                                   average='micro'),
                'precision_macro': precision_score(self.y_test,
                                                   y_pred,
                                                   labels=self.classes,
                                                   average='macro'),
                'precision_weighted': precision_score(self.y_test,
                                                      y_pred,
                                                      labels=self.classes,
                                                      average='weighted'),
                'f1_micro': f1_score(self.y_test,
                                     y_pred,
                                     labels=self.classes,
                                     average='micro'),
                'f1_macro': f1_score(self.y_test,
                                     y_pred,
                                     labels=self.classes,
                                     average='macro'),
                'f1_weighted': f1_score(self.y_test,
                                        y_pred,
                                        labels=self.classes,
                                        average='weighted'),
                'accuracy': accuracy_score(self.y_test,
                                           y_pred,
                                           normalize=True),
                'confusion_matrix': confusion_matrix(self.y_test,
                                                     y_pred,
                                                     labels=self.classes),
                'printable_confusion_matrix':
                    self.make_printable_confusion_matrix(y_pred),
                'uwk': kappa(self.y_test,
                             y_pred),
                'uwk_off_by_one': kappa(self.y_test,
                                        y_pred,
                                        allow_off_by_one=True),
                'qwk': kappa(self.y_test,
                             y_pred,
                             weights='quadratic'),
                'qwk_off_by_one': kappa(self.y_test,
                                        y_pred,
                                        weights='quadratic',
                                        allow_off_by_one=True),
                'lwk': kappa(self.y_test,
                             y_pred,
                             weights='linear'),
                'lwk_off_by_one': kappa(self.y_test,
                                        y_pred,
                                        weights='linear',
                                        allow_off_by_one=True)}

    def make_printable_confusion_matrix(self, y_pred):
        '''
        Produce a printable confusion matrix to use in the evaluation
        report.

        :param y_pred: array-like of predicted labels
        :type y_pred: array-like
        :returns: str
        '''

        cnfmat = confusion_matrix(self.y_test,
                                  np.round(y_pred),
                                  labels=self.classes).tolist()
        res = str(self.cnfmat_desc)
        for row, label in zip(cnfmat,
                              self.classes):
            row = self.__tab_join__([str(x) for x in [label] + row])
            res = self.__cnfmat_row__(res, row)
        return res

    def get_all_features(self, review_doc):
        '''
        Get all the features in a review document and put them together
        in a dictionary.

        :param review_doc: review document from Mongo database
        :type review_doc: dict
        :returns: dict
        '''

        _get = review_doc.get
        # Add in the NLP features
        features = {feat: val for feat, val
                    in BSON.decode(_get(self.__nlp_feats__)).items()
                    if val and val != self.__nan__}
        # Add in the non-NLP features (except for those that may be in the
        # 'achievement_progress' sub-dictionary of the review dictionary
        features.update({feat: val
                         for feat, val in review_doc.items()
                         if feat in self.__possible_non_nlp_features__
                            and val
                            and val != self.__nan__})
        # Add in the features that may be in the 'achievement_progress'
        # sub-dictionary of the review dictionary
        features.update({feat: val for feat, val
                         in _get(self.__achieve_prog__,
                                 dict()).items()
                         if feat in self.__possible_non_nlp_features__
                            and val
                            and val != self.__nan__})
        # Add in the 'id_string' value just to make it easier to process the
        # results of this function
        features.update({self.__id_string__: _get(self.__id_string__)})
        return features

    def get_train_data_iteration(self):
        '''
        Get a list of training data dictionaries to use in model
        training.

        :returns: list of dict
        '''

        data = []
        i = 0
        while i < self.round_size:
            # Get a review document from the Mongo database
            try:
                review_doc = next(self.training_cursor)
            except StopIteration:
                self.NO_MORE_TRAINING_DATA = True
                break

            # Get dictionary containing all features needed + the ID
            # and the prediction label
            feature_dict = self.get_all_features(review_doc)
            _get = feature_dict.get

            # Get prediction label feature and remove it from feature
            # dictionary, skipping the document if it's not found or if
            # its value is None
            if _get(self.prediction_label):
                y_value = _get(self.prediction_label,
                               None)
                if y_value == None:
                    i += 1
                    continue
                del feature_dict[self.prediction_label]
            else:
                i += 1
                continue

            # Get ID and remove from feature dictionary
            id_string = _get(self.__id_string__)
            del feature_dict[self.__id_string__]

            # Put features, prediction label, and ID in a new
            # dictionary and append to list of data samples and then
            # increment the review counter
            data.append(dict(y=y_value,
                             id=id_string,
                             x=feature_dict))
            i += 1
        return data

    def get_test_data(self):
        '''
        Get a list of test data dictionaries to use in model
        evaluation.

        :returns: list of dict
        '''

        data = []
        for review_doc in self.test_cursor:
            # Get dictionary containing all features needed + the ID
            # and the prediction label
            feature_dict = self.get_all_features(review_doc)
            _get = feature_dict.get

            # Get prediction label feature and remove it from feature
            # dictionary, skipping the document if it's not found or if
            # its value is None
            if _get(self.prediction_label):
                y_value = _get(self.prediction_label,
                               None)
                if y_value == None:
                    i += 1
                    continue
                del feature_dict[self.prediction_label]
            else:
                i += 1
                continue

            # Get ID and remove from feature dictionary
            id_string = _get(self.__id_string__)
            del feature_dict[self.__id_string__]

            # Put features, prediction label, and ID in a new
            # dictionary and append to list of data samples and then
            # increment the review counter
            data.append(dict(y=y_value,
                             id=id_string,
                             x=feature_dict))
        return data

    def learning_round(self):
        '''
        Do learning rounds.
        '''

        # Get some training data
        train_data = self.get_train_data_iteration()
        samples = len(train_data)
        if not samples:
            return
        train_ids = np.array([_data[self.__id__] for _data in train_data])
        y_train = np.array([_data[self.__y__] for _data in train_data])
        train_feature_dicts = [_data[self.__x__] for _data in train_data]

        # Set _vec if not already set and fit it it the training
        # features, which will only need to be done the first time
        if self.vec == None:
            self.vec = DictVectorizer(sparse=True)
            X_train = self.vec.fit_transform(train_feature_dicts)
        else:
            X_train = self.vec.transform(train_feature_dicts)

        # Transform the test features
        X_test = self.vec.transform(self.test_feature_dicts)

        # Update the various models with differing parameters
        for i, learner in enumerate(self.learners):
            if (self.learner_name in self.__learners_requiring_classes__
                and self.round == 1):
                    learner.partial_fit(X_train,
                                        y_train,
                                        classes=self.classes)
            else:
                learner.partial_fit(X_train,
                                    y_train)
            y_test_preds = learner.predict(X_test)

            # Evaluate the new model, collecting metrics, etc., and then
            # store the round statistics
            stats_dict = self.get_stats(y_test_preds)
            stats_dict.update({self.__learning_round__: int(self.round),
                               self.__prediction_label__:
                                   self.prediction_label,
                               self.__test_labels_and_preds__:
                                   list(zip(self.y_test,
                                            y_test_preds)),
                               self.__learner__: self.learner_name,
                               self.__params__: learner.get_params(),
                               self.__training_samples__: samples})
            self.learner_stats[i].append(pd.Series(stats_dict))

        # Increment the round number
        self.round += 1

    def do_learning_rounds(self):
        '''
        Do rounds of learning.
        '''

        # If a certain number of rounds has been specified, try to do
        # that many rounds; otherwise, do as many as possible
        if self.rounds > 0:
            while self.round <= self.rounds:
                if self.NO_MORE_TRAINING_DATA:
                    break
                else:
                    self.learning_round()
        else:
            while True:
                if self.NO_MORE_TRAINING_DATA:
                    break
                self.learning_round()

In [60]:
perceptron_inc_learning = \
    IncrementalLearning(learners['perc'],
                        _DEFAULT_PARAM_GRIDS[learners['perc']],
                        train_cursor,
                        test_cursor,
                        100,
                        ['num_guides', 'num_games_owned', 'num_friends',
                         'num_voted_helpfulness', 'num_groups',
                         'num_workshop_items', 'num_reviews',
                         'num_found_funny', 'friend_player_level',
                         'num_badges', 'num_found_helpful',
                         'num_screenshots', 'num_found_unhelpful',
                         'found_helpful_percentage', 'num_comments',
                         'num_achievements_percentage',
                         'num_achievements_possible'],
                        'total_game_hours_bin',
                        rounds=50)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

In [65]:
for i, learner_stats in enumerate(perceptron_inc_learning.learner_stats):
    learner_stats.to_csv('./perceptron_inc_learning_learner_stats_{}.csv'.format(i),
                         index=False)

In [11]:
sgd_inc_learning = \
    IncrementalLearning(learners['sgd'],
                        _DEFAULT_PARAM_GRIDS[learners['sgd']],
                        train_cursor,
                        test_cursor,
                        100,
                        ['num_guides', 'num_games_owned', 'num_friends',
                         'num_voted_helpfulness', 'num_groups',
                         'num_workshop_items', 'num_reviews',
                         'num_found_funny', 'friend_player_level',
                         'num_badges', 'num_found_helpful',
                         'num_screenshots', 'num_found_unhelpful',
                         'found_helpful_percentage', 'num_comments',
                         'num_achievements_percentage',
                         'num_achievements_possible'],
                        'total_game_hours_bin',
                        rounds=50)

TypeError: partial_fit() got an unexpected keyword argument 'classes'

In [12]:
def get_dev_data(training_data_cursor,
                 inc_size,
                 non_nlp_features_to_use):
    '''
    Get a list of development data dictionaries to use in dataset
    exploration.

    :param training_data_cursor: cursor for training partition
                                 documents
    :type training_data_cursor: pymongo.cursor.Cursor object
    :param inc_size: number of documents to extract
    :type inc_size: int
    :param non_nlp_features_to_use: list of non-NLP features to add into
                                    the feature dictionaries 
    :type non_nlp_features_to_use: list of str
    :returns: list of dict
    '''

    if inc_size < 1:
        raise ValueError('inc_size parameter should be positive integer')
    data = []
    i = 0
    while i < inc_size:
        try:
            review_doc = next(training_data_cursor)
        except StopIteration:
            i = inc_size
            continue
        review_doc_get = review_doc.get
        # Make dictionary of features
        features = {feat: val for feat, val
                    in BSON.decode(review_doc_get('nlp_features')).items()
                    if val and val != float("NaN")}
        features.update({feat: review_doc_get(feat)
                         for feat in non_nlp_features_to_use
                         if feat != 'achievement_progress'
                            and review_doc_get(feat)
                            and review_doc_get(feat) != float("NaN")})
        # Add in the rest of the features
        if 'achievement_progress' in non_nlp_features_to_use:
            features.update({feat: val for feat, val
                             in review_doc_get('achievement_progress').items()
                             if val and val != float("NaN")})
        data.append(features)
        i += 1
    return data

In [13]:
# Get development data
#dev_data = get_dev_data(dev_cursor,
#                        500,
#                        non_nlp_feature_types + [hours_feature])
# Vectorize the data
#v_dev = DictVectorizer(sparse=True)
#X_dev = v_dev.fit_transform(dev_data)
# Make a K-means clusterer and fit it with the development data
#num_clusters = 10
#km = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10)
#km.fit(X_dev)
#clusters = km.labels_.tolist()
#clusters[:10]
#Counter(clusters)
#km.cluster_centers_.argsort()
#labels = km.labels_
#cluster_centers = km.cluster_centers_
#km_labels_unique = np.unique(labels)
#pca = PCA(n_components=num_clusters).fit(X_dev.toarray())
#km_pca = KMeans(init=pca.components_, n_clusters=num_clusters, n_init=1)
#km_pca.fit(X_dev)
#km_pca.labels_

In [14]:
def get_train_data_iteration(training_data_cursor,
                             inc_size,
                             non_nlp_features_to_use,
                             y_feature):
    '''
    Get a list of training data dictionaries to use in model training.

    :param training_data_cursor: cursor for training documents
    :type training_data_cursor: pymongo.cursor.Cursor object
    :param inc_size: number of training documents to extract
    :type inc_size: int
    :param non_nlp_features_to_use: list of non-NLP features to add into
                                    the feature dictionaries 
    :type non_nlp_features_to_use: list of str
    :param y_feature: feature to use as the "score"
    :type y_feature: str
    :returns: list of dict
    '''

    if inc_size < 1:
        raise ValueError('inc_size parameter should be positive integer')
    if y_feature in non_nlp_features_to_use:
        raise Exception('y_feature must be a feature that is not in the '
                        'non_nlp_features_to_use list')
    data = []
    i = 0
    while i < inc_size:
        try:
            review_doc = next(training_data_cursor)
        except StopIteration:
            i = inc_size
            continue
        review_doc_get = review_doc.get
        # Make dictionary of features
        features = {feat: val for feat, val
                    in BSON.decode(review_doc_get('nlp_features')).items()
                    if val and val != float("NaN")}
        features.update({feat: review_doc_get(feat)
                         for feat in non_nlp_features_to_use
                         if feat != 'achievement_progress'
                            and review_doc_get(feat)
                            and review_doc_get(feat) != float("NaN")})
        # Add in the rest of the features
        if 'achievement_progress' in non_nlp_features_to_use:
            features.update({feat: val for feat, val
                             in review_doc_get('achievement_progress').items()
                             if val and val != float("NaN")})
        data.append(dict(y=review_doc_get(y_feature),
                         id=review_doc_get(_id),
                         x=features))
        i += 1
    return data

In [15]:
def get_test_data(test_data_cursor,
                  non_nlp_features_to_use,
                  y_feature):
    '''
    Get a list of test data dictionaries to use in model testing.

    :param test_data_cursor: cursor for test documents
    :type test_data_cursor: pymongo.cursor.Cursor object
    :param non_nlp_features_to_use: list of non-NLP features to add into
                                    the feature dictionaries 
    :type non_nlp_features_to_use: list of str
    :param y_feature: feature to use as the "score"
    :type y_feature: str
    :returns: list of dict
    '''

    if y_feature in non_nlp_features_to_use:
        raise Exception('y_feature must be a feature that is not in the '
                        'non_nlp_features_to_use list')
    data = []
    for review_doc in test_data_cursor:
        review_doc_get = review_doc.get
        # Make dictionary of features
        features = {feat: val for feat, val
                    in BSON.decode(review_doc_get('nlp_features')).items()
                    if val and val != float("NaN")}
        features.update({feat: review_doc_get(feat)
                         for feat in non_nlp_features_to_use
                         if feat != 'achievement_progress'
                            and review_doc_get(feat)
                            and review_doc_get(feat) != float("NaN")})
        # Add in the rest of the features
        if 'achievement_progress' in non_nlp_features_to_use:
            features.update({feat: val for feat, val
                             in review_doc_get('achievement_progress').items()
                             if val and val != float("NaN")})
        data.append(dict(y=review_doc_get(y_feature),
                         id=review_doc_get(_id),
                         x=features))
    return data

In [16]:
def get_stats(_y_test, _y_preds):
    """
    Get some statistics about the model's performance.

    :param _y_test: test set values
    :type _y_test: np.array
    :param _y_train: predictions
    :type _y_train: np.array
    :returns: tuple
    """

    _pearsonr = pearsonr(y_test,
                           y_train_preds_1)
    _r2 = r2_score(_y_test,
                   _y_preds)
    _prec_micro = precision_score(_y_test,
                                  _y_preds,
                                  labels=classes,
                                  average='micro')
    _prec_macro = precision_score(_y_test,
                                  _y_preds,
                                  labels=classes,
                                  average='macro')
    _prec_weighted = precision_score(_y_test,
                                     _y_preds,
                                     labels=classes,
                                     average='weighted')
    _f1_micro = f1_score(_y_test,
                         _y_preds,
                         labels=classes,
                         average='micro')
    _f1_macro = f1_score(_y_test,
                         _y_preds,
                         labels=classes,
                         average='macro')
    _f1_weighted = f1_score(_y_test,
                            _y_preds,
                            labels=classes,
                            average='weighted')
    _acc = accuracy_score(_y_test,
                          _y_preds,
                          normalize=True)
    _conf_mat = confusion_matrix(_y_test,
                                 _y_preds,
                                 labels=classes)
    
    return (_pearsonr,
            _r2,
            _prec_micro,
            _prec_macro,
            _prec_weighted,
            _f1_micro,
            _f1_macro,
            _f1_weighted,
            _acc,
            _conf_mat)

In [17]:
def make_printable_confusion_matrix(y_true, y_pred):
    '''
    Produce a printable confusion matrix to use in the evaluation
    report.

    :param y_true: array-like of actual labels
    :type y_true: array-like
    :param y_pred: array-like of predicted labels
    :type y_pred: array-like
    :returns: str
    '''

    labels = [int(x) for x in sorted(or_(set(y_true),
                                         set(np.round(y_pred))))]
    conf_mat = confusion_matrix(y_true,
                                np.round(y_pred),
                                labels=labels).tolist()
    res = ('confusion_matrix (rounded predictions) (row=human, '
           'col=machine, labels={}):\n'
           .format(labels))
    res = '{}{}\n'.format(res,
                          '\t'.join([''] + [str(x) for x in labels]))
    for row, label in zip(conf_mat,
                          labels):
        res = '{}{}\n'.format(res,
                              '\t'.join([str(x) for x in [label] + row]))
    return res

In [None]:
test_data = get_test_data(test_cursor,
                          non_nlp_feature_types,
                          hours_feature)
# Sample test data point
print('id: ' + test_data[0]['id'])
print('total hours played value (binned): ' + str(test_data[0]['y']))
print('sample of features for review/reviewer: '
      + str(dict(list(test_data[0]['x'].items())[:100])))
test_ids = np.array([_data['id'] for _data in test_data])
test_ids[:10]
y_test = np.array([_data['y'] for _data in test_data])
print('y_test sample: {}'.format(y_test[:10]))
test_feature_dicts = [_data['x'] for _data in test_data]
print('test features sample: {}'.format(dict(list(test_feature_dicts[0].items())[:50])))

In [None]:
# Make vectorizer and Perceptron learner
v = DictVectorizer(sparse=True)
learner = Perceptron()

In [None]:
# Example of ParameterGrid
list(ParameterGrid(_DEFAULT_PARAM_GRIDS[Perceptron]))[:3]

In [None]:
class IncrementalLearning:
    '''
    Class for conducting incremental learning experiments with a
    parameter grid and a learner.
    '''

    def __init__(self, learner, param_grid: dict,
                 training_data_cursor: pymongo.cursor.Cursor,
                 test_data_cursor: pymongo.cursor.Cursor, round_size: int,
                 non_nlp_features_to_use: list, prediction_label: str,
                 test_data: list, rounds=0, feats_to_exclude=[]):
        '''
        Initialize class.

        :param learner: regression algorithm to use for learning
        :type learner: learner type
        :param param_grid: dictionary of parameters mapped to lists
                           of values
        :type param_grid: dict
        :param training_data_cursor: cursor for training documents
        :type training_data_cursor: pymongo.cursor.Cursor object
        :param test_data_cursor: cursor for test documents
        :type test_data_cursor: pymongo.cursor.Cursor object
        :param round_size: number of training documents to extract in
                           each round
        :type round_size: int
        :param non_nlp_features: list of non-NLP features to add into
                                 the feature dictionaries 
        :type non_nlp_features: list of str
        :param prediction_label: feature to predict
        :type prediction_label: str
        :param test_data: list of test samples
        :type test_data: list of dict
        :param rounds: number of rounds of learning (0 for as many as
                       possible)
        :type rounds: int
        :param feats_to_exclude: list of features to exclude
        :type feats_to_exclude: list of str
        :returns: list of dict
        '''

        # Make sure parameters make some sense
        if round_size < 1:
            raise ValueError('The round_size parameter should have a positive'
                             ' value.')
        if prediction_label in non_nlp_features:
            raise Exception('The prediction_label parameter ({}) cannot also '
                            'be in the list of non-NLP features:\n\n{}\n.'
                            .format(prediction_label,
                                    non_nlp_features))

        # Constants
        self.nlp_feats = 'nlp_features'
        self.achieve_prog = 'achievement_progress'
        self.nan = float("NaN")
        self._y = 'y'
        self._id = 'id_string'
        self._x = 'x'

        # Learner-related variables
        self.vec = None
        self.params_list = list(ParameterGrid(param_grid))
        self.learner_name = learner.__module__.rsplit('.', 1)[-1]
        self.learners = [learner(**kwparams) for kwparams in self.params_list]
        self.learner_stats = [[] for _ in self.learners]

        # Information about what features to use for what purposes
        self.non_nlp_features = non_nlp_features
        self.feats_to_exclude = feats_to_exclude
        self.prediction_label = prediction_label

        # Information about incremental learning
        self.round_size = round_size
        self.rounds = rounds
        self.round = 1
        self.NO_MORE_TRAINING_DATA = False

        # Training/test data variables
        self.training_cursor = training_data_cursor
        self.test_cursor = test_data_cursor
        self.test_data = self.get_test_data()
        self.test_ids = [_data[self._id] for _data in self.test_data]
        self.test_feature_dicts = [_data[self._x] for _data in self.test_data]
        self.y_test = np.array([_data[self._y] for _data in self.test_data])
        self.classes = np.unique(self.y_test)

        # Useful constants for use in make_printable_confusion_matrix
        self.tab_join = '\t'.join
        self.conf_mat_row_format = '{}{}\n'.format
        classes_string_list = [''] + [str(x) for x in self.classes]
        self.conf_mat_desc = \
            self.conf_mat_row_format('confusion_matrix (rounded predictions) '
                                     '(row=human, col=machine, labels={}):\n'
                                     .format(self.classes),
                                     self.tab_join(classes_string_list))

        # Do incremental learning experiments
        self.do_learning_rounds()
        self.learner_stats = [pd.DataFrame(learner_stats) for learner_stats
                              in self.learner_stats]

    def get_stats(self, y_pred):
        """
        Get some statistics about the model's performance on the test
        set.

        :param y_pred: predictions
        :type y_pred: np.array
        :returns: dict
        """

        return {'pearson_r': pearsonr(self.y_test,
                                      y_pred),
                'r2': r2_score(self.y_test,
                               y_pred),
                'precision_micro': precision_score(self.y_test,
                                                   y_pred,
                                                   labels=self.classes,
                                                   average='micro'),
                'precision_macro': precision_score(self.y_test,
                                                   y_pred,
                                                   labels=self.classes,
                                                   average='macro'),
                'precision_weighted': precision_score(self.y_test,
                                                      y_pred,
                                                      labels=self.classes,
                                                      average='weighted'),
                'f1_micro': f1_score(self.y_test,
                                     y_pred,
                                     labels=self.classes,
                                     average='micro'),
                'f1_macro': f1_score(self.y_test,
                                     y_pred,
                                     labels=self.classes,
                                     average='macro'),
                'f1_weighted': f1_score(self.y_test,
                                        y_pred,
                                        labels=self.classes,
                                        average='weighted'),
                'accuracy': accuracy_score(self.y_test,
                                           y_pred,
                                           normalize=True),
                'confusion_matrix': confusion_matrix(self.y_test,
                                                     y_pred,
                                                     labels=self.classes),
                'printable_confusion_matrix':
                    self.make_printable_confusion_matrix(y_pred),
                'uwk': kappa(self.y_test,
                             y_pred),
                'uwk_off_by_one': kappa(self.y_test,
                                        y_pred,
                                        allow_off_by_one=True),
                'qwk': kappa(self.y_test,
                             y_pred,
                             weights='quadratic'),
                'qwk_off_by_one': kappa(self.y_test,
                                        y_pred,
                                        weights='quadratic',
                                        allow_off_by_one=True),
                'lwk': kappa(self.y_test,
                             y_pred,
                             weights='linear'),
                'lwk_off_by_one': kappa(self.y_test,
                                        y_pred,
                                        weights='linear',
                                        allow_off_by_one=True)}

    def make_printable_confusion_matrix(self, y_pred):
        '''
        Produce a printable confusion matrix to use in the evaluation
        report.

        :param y_pred: array-like of predicted labels
        :type y_pred: array-like
        :returns: str
        '''

        conf_mat = confusion_matrix(self.y_test,
                                    np.round(y_pred),
                                    labels=self.classes).tolist()
        res = str(self.conf_mat_desc)
        for row, label in zip(conf_mat,
                              self.classes):
            res = self.conf_mat_row_format(res,
                                           self.tab_join([str(x) for x
                                                          in [label] + row]))
        return res

    def get_train_data_iteration(self):
        '''
        Get a list of training data dictionaries to use in model
        training.

        :returns: list of dict
        '''

        data = []
        i = 0
        while i < self.round_size:
            try:
                review_doc = next(self.training_cursor)
            except StopIteration:
                self.NO_MORE_TRAINING_DATA = True
                break
            review_doc_get = review_doc.get
            # Make dictionary of features
            features = {feat: val for feat, val
                        in BSON.decode(review_doc_get(self.nlp_feats)).items()
                        if val and val != self.nan}
            features.update({feat: review_doc_get(feat)
                             for feat in self.non_nlp_features
                             if not feat in self.feats_to_exclude
                                and review_doc_get(feat)
                                and review_doc_get(feat) != self.nan})
            # Add in the rest of the features
            if self.achieve_prog in self.non_nlp_features:
                features.update({feat: val for feat, val
                                 in review_doc_get(self.achieve_prog).items()
                                 if feat != self.prediction_label
                                    not feat in self.feats_to_exclude
                                    and val
                                    and val != self.nan})
            # Get the value of the y feature and skip if there is no
            # value or the value is equal to None
            if self.prediction_label in review_doc.keys():
                y_value = review_doc_get(self.prediction_label)
            elif self.prediction_label in review_doc.get(self.achieve_prog):
                y_value = (review_doc_get(self.achieve_prog)
                           .get(self.prediction_label))
            else:
                y_value = None
            if y_value == None:
                i += 1
                continue
            data.append(dict(y=y_value,
                             id=review_doc_get(self._id),
                             x=features))
            i += 1
        return data

    def get_test_data(self):
        '''
        Get a list of test data dictionaries to use in model
        evaluation.

        :returns: list of dict
        '''

        data = []
        for review_doc in self.test_cursor:
            review_doc_get = review_doc.get
            # Make dictionary of features
            features = {feat: val for feat, val
                        in BSON.decode(review_doc_get(self.nlp_feats)).items()
                        if val and val != self.nan}
            features.update({feat: review_doc_get(feat)
                             for feat in self.non_nlp_features
                             if not feat in self.feats_to_exclude
                                and review_doc_get(feat)
                                and review_doc_get(feat) != self.nan})
            # Add in the rest of the features
            if self.achieve_prog in self.non_nlp_features:
                features.update({feat: val for feat, val
                                 in review_doc_get(self.achieve_prog).items()
                                 if feat != self.prediction_label
                                    not feat in self.feats_to_exclude
                                    and val
                                    and val != self.nan})
            # Get the value of the y feature and skip if there is no
            # value or the value is equal to None
            if self.prediction_label in review_doc.keys():
                y_value = review_doc_get(self.prediction_label)
            elif self.prediction_label in review_doc.get(self.achieve_prog):
                y_value = (review_doc_get(self.achieve_prog)
                           .get(self.prediction_label))
            else:
                y_value = None
            if y_value == None:
                continue
            data.append(dict(y=y_value,
                             id=review_doc_get(self._id),
                             x=features))
        return data

    def learning_round(self):
        '''
        Do learning rounds.
        '''

        # Get some training data
        train_data = self.get_train_data_iteration()
        samples = len(train_data)
        if not samples:
            return
        train_ids = np.array([_data[self._id] for _data in train_data])
        y_train = np.array([_data[self._y] for _data in train_data])
        train_feature_dicts = [_data[self._x] for _data in train_data]

        # Set _vec if not already set and fit it it the training
        # features, which will only need to be done the first time
        if self.vec == None:
            self.vec = DictVectorizer(sparse=True)
            X_train = self.vec.fit_transform(train_feature_dicts)
        else:
            X_train = self.vec.transform(train_feature_dicts)

        # Transform the test features
        X_test = self.vec.transform(self.test_feature_dicts)

        # Update the various models with differing parameters
        for i, (learner, params) in enumerate(zip(self.learners,
                                                  self.params_list)):
            learner.partial_fit(X_train,
                                y_train,
                                classes=self.classes)
            y_test_preds = learner.predict(X_test)
            stats_dict = self.get_stats(y_test_preds)
            stats_dict.update({'learning_round': int(self.round),
                               'prediction_label': self.prediction_label,
                               'test_set_labels/test_set_predictions':
                                   list(zip(self.y_test,
                                            y_test_preds)),
                               'learner': self.learner_name,
                               'params': params,
                               'training_samples': samples})
            self.learner_stats[i].append(pd.Series(stats_dict))

        # Increment the round number
        self.round += 1

    def do_learning_rounds(self):
        '''
        Do rounds of learning.
        '''

        if self.rounds > 0:
            while self.round <= self.rounds:
                if self.NO_MORE_TRAINING_DATA:
                    break
                else:
                    self.learning_round()
        else:
            while True:
                if self.NO_MORE_TRAINING_DATA:
                    break
                self.learning_round()

## First Round of Learning

### Vectorize Features
- Vectorize the test set features
- Vectorize a small portion of the training features (the first 100) and all of the test features, partially train the model, and then repeat

In [None]:
train_data_1 = get_train_data_iteration(train_cursor,
                                        100,
                                        non_nlp_feature_types,
                                        hours_feature)

In [None]:
# Sample training data point
print('id: ' + train_data_1[0]['id'])
print('total hours played value (binned): ' + str(train_data_1[0]['y']))
print('sample of features for review/reviewer: '
      + str(dict(list(train_data_1[0]['x'].items())[:100])))

In [None]:
train_ids_1 = np.array([_data['id'] for _data in train_data_1])
y_train_1 = np.array([_data['y'] for _data in train_data_1])
train_feature_dicts_1 = [_data['x'] for _data in train_data_1]
classes = np.unique(y_train_1)

In [None]:
X_train_1 = v.fit_transform(train_feature_dicts_1)
X_test = v.transform(test_feature_dicts)

In [None]:
learner.get_params()

In [None]:
learner.partial_fit(X_train_1,
                    y_train_1,
                    classes=classes)

In [None]:
y_train_preds_1 = learner.predict(X_test)
[(y, y_pred) for y, y_pred in zip(y_test,
                                  y_train_preds_1)]

In [None]:
(pearsonr_1,
 r2_1,
 prec_micro_1,
 prec_macro_1,
 prec_weighted_1,
 f1_micro_1,
 f1_macro_1,
 f1_weighted_1,
 acc_1,
 conf_mat_1) = get_stats(y_test,
                         y_train_preds_1)

In [None]:
print('Some stats\n\n')
print('Pearson: {} (p = {})'.format(*pearsonr_1))
print('r2 score: {}'.format(r2_1))
print('Micro precision score: {}'.format(prec_micro_1))
print('Macro precision score: {}'.format(prec_macro_1))
print('Weighted precision score: {}'.format(prec_weighted_1))
print('Micro f1 score: {}'.format(f1_micro_1))
print('Macro f1 score: {}'.format(f1_macro_1))
print('Weighted f1 score: {}'.format(f1_weighted_1))
print('Accuracy score: {}'.format(acc_1))
print('\nConfusion matrix:\n\n{}'.format(conf_mat_1))

## Second Round of Learning

In [None]:
train_data_2 = get_train_data_iteration(train_cursor,
                                        100,
                                        non_nlp_feature_types,
                                        hours_feature)
# Sample training data point
print('id: ' + train_data_2[0]['id'])
print('total hours played value (binned): ' + str(train_data_2[0]['y']))
print('sample of features for review/reviewer: '
      + str(dict(list(train_data_2[0]['x'].items())[:100])))

In [None]:
train_ids_2 = np.array([_data['id'] for _data in train_data_2])
y_train_2 = np.array([_data['y'] for _data in train_data_2])
train_feature_dicts_2 = [_data['x'] for _data in train_data_2]

In [None]:
X_train_2 = v.transform(train_feature_dicts_2)

In [None]:
X_test = v.transform(test_feature_dicts)

In [None]:
learner.partial_fit(X_train_2,
                    y_train_2)

In [None]:
y_train_preds_2 = learner.predict(X_test)
[(y, y_pred) for y, y_pred in zip(y_test,
                                  y_train_preds_2)]

In [None]:
(pearsonr_2,
 r2_2,
 prec_micro_2,
 prec_macro_2,
 prec_weighted_2,
 f1_micro_2,
 f1_macro_2,
 f1_weighted_2,
 acc_2,
 conf_mat_2) = get_stats(y_test,
                         y_train_preds_2)
print('Some stats\n\n')
print('Pearson: {} (p = {})'.format(*pearsonr_2))
print('r2 score: {}'.format(r2_2))
print('Micro precision score: {}'.format(prec_micro_2))
print('Macro precision score: {}'.format(prec_macro_2))
print('Weighted precision score: {}'.format(prec_weighted_2))
print('Micro f1 score: {}'.format(f1_micro_2))
print('Macro f1 score: {}'.format(f1_macro_2))
print('Weighted f1 score: {}'.format(f1_weighted_2))
print('Accuracy score: {}'.format(acc_2))
print('\nConfusion matrix:\n\n{}'.format(conf_mat_2))

## Third Round of Learning

In [None]:
train_data_3 = get_train_data_iteration(train_cursor,
                                        100,
                                        non_nlp_feature_types,
                                        hours_feature)
print('id: ' + train_data_3[0]['id'])
print('total hours played value (binned): ' + str(train_data_3[0]['y']))
print('sample of features for review/reviewer: '
      + str(dict(list(train_data_3[0]['x'].items())[:100])))

In [None]:
train_ids_3 = np.array([_data['id'] for _data in train_data_3])
y_train_3 = np.array([_data['y'] for _data in train_data_3])
train_feature_dicts_3 = [_data['x'] for _data in train_data_3]

In [None]:
X_train_3 = v.transform(train_feature_dicts_3)

In [None]:
X_test = v.transform(test_feature_dicts)

In [None]:
learner.partial_fit(X_train_3,
                    y_train_3)

In [None]:
y_train_preds_3 = learner.predict(X_test)
[(y, y_pred) for y, y_pred in zip(y_test,
                                  y_train_preds_3)]

In [None]:
(pearsonr_3,
 r2_3,
 prec_micro_3,
 prec_macro_3,
 prec_weighted_3,
 f1_micro_3,
 f1_macro_3,
 f1_weighted_3,
 acc_3,
 conf_mat_3) = get_stats(y_test,
                         y_train_preds_3)
print('Some stats\n\n')
print('Pearson: {} (p = {})'.format(*pearsonr_3))
print('r2 score: {}'.format(r2_3))
print('Micro precision score: {}'.format(prec_micro_3))
print('Macro precision score: {}'.format(prec_macro_3))
print('Weighted precision score: {}'.format(prec_weighted_3))
print('Micro f1 score: {}'.format(f1_micro_3))
print('Macro f1 score: {}'.format(f1_macro_3))
print('Weighted f1 score: {}'.format(f1_weighted_3))
print('Accuracy score: {}'.format(acc_3))
print('\nConfusion matrix:\n\n{}'.format(conf_mat_3))

## Fourth Round of Learning

In [None]:
train_data_4 = get_train_data_iteration(train_cursor,
                                        100,
                                        non_nlp_feature_types,
                                        hours_feature)
print('id: ' + train_data_4[0]['id'])
print('total hours played value (binned): ' + str(train_data_4[0]['y']))
print('sample of features for review/reviewer: '
      + str(dict(list(train_data_4[0]['x'].items())[:100])))

In [None]:
train_ids_4 = np.array([_data['id'] for _data in train_data_4])
y_train_4 = np.array([_data['y'] for _data in train_data_4])
train_feature_dicts_4 = [_data['x'] for _data in train_data_4]

In [None]:
X_train_4 = v.transform(train_feature_dicts_4)

In [None]:
X_test = v.transform(test_feature_dicts)

In [None]:
learner.partial_fit(X_train_4,
                    y_train_4)

In [None]:
y_train_preds_4 = learner.predict(X_test)
[(y, y_pred) for y, y_pred in zip(y_test,
                                  y_train_preds_4)]

In [None]:
(pearsonr_4,
 r2_4,
 prec_micro_4,
 prec_macro_4,
 prec_weighted_4,
 f1_micro_4,
 f1_macro_4,
 f1_weighted_4,
 acc_4,
 conf_mat_4) = get_stats(y_test,
                         y_train_preds_4)
print('Some stats\n\n')
print('Pearson: {} (p = {})'.format(*pearsonr_4))
print('r2 score: {}'.format(r2_4))
print('Micro precision score: {}'.format(prec_micro_4))
print('Macro precision score: {}'.format(prec_macro_4))
print('Weighted precision score: {}'.format(prec_weighted_4))
print('Micro f1 score: {}'.format(f1_micro_4))
print('Macro f1 score: {}'.format(f1_macro_4))
print('Weighted f1 score: {}'.format(f1_weighted_4))
print('Accuracy score: {}'.format(acc_4))
print('\nConfusion matrix:\n\n{}'.format(conf_mat_4))

## Fifth Round of Learning