# Incremental Learning with scikit-learn

In [3]:
import numpy as np
from json import loads
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_selection import (chi2,
                                       SelectKBest)
from sklearn.naive_bayes import (BernoulliNB,
                                 MultinomialNB)
from sklearn.linear_model import (Perceptron,
                                  SGDRegressor,
                                  SGDClassifier,
                                  PassiveAggressiveRegressor,
                                  PassiveAggressiveClassifier)
from sklearn.decomposition import (IncrementalPCA,
                                   MiniBatchDictionaryLearning)
from sklearn.feature_extraction import DictVectorizer

In [6]:
! wc -l ../working/*

      1293 ../working/Arma_3.jsonlines
      3996 ../working/Counter_Strike.jsonlines
       705 ../working/Counter_Strike_Global_Offensive.jsonlines
       723 ../working/Dota_2.jsonlines
       950 ../working/Football_Manager_2015.jsonlines
       812 ../working/Garrys_Mod.jsonlines
       751 ../working/Grand_Theft_Auto_V.jsonlines
       706 ../working/Sid_Meiers_Civilization_5.jsonlines
       614 ../working/Team_Fortress_2.jsonlines
       625 ../working/The_Elder_Scrolls_V.jsonlines
       520 ../working/Warframe.jsonlines
     11695 total


In [7]:
# Open a feature file and play around with it
with open('../working/Counter_Strike.jsonlines') as f:
    data = [loads(_data) for _data in f.readlines()]

In [8]:
len(data)

3996

In [9]:
data[0].keys()

dict_keys(['x', 'id', 'y'])

In [10]:
# Split into training and test sets
train_data = data[:3000]
test_data = data[3000:]

In [11]:
train_ids = np.array([_data['id'] for _data in train_data])
test_ids = np.array([_data['id'] for _data in test_data])
test_ids[:10]

array(['4443676253924574002', '5221888560152401891', '5708178163360556957',
       '788745930948480249', '8472771801387632108', '7843154417522016335',
       '6639848025721547877', '6559516706440854906', '3385911070765323467',
       '1679667525108150455'], 
      dtype='<U19')

In [12]:
y_train = np.array([_data['y'] for _data in train_data])
y_test = np.array([_data['y'] for _data in test_data])
y_test[:10]

array([2, 1, 1, 1, 1, 1, 1, 2, 1, 1])

In [13]:
train_feature_dicts = [_data['x'] for _data in train_data]
test_feature_dicts = [_data['x'] for _data in test_data]
list(test_feature_dicts[0].items())[:10]

[('bind f', 1),
 ('fun playing', 1),
 ('idk why', 1),
 ('overlay .', 1),
 ('shooters )', 1),
 ('play this', 1),
 ('constantly', 1),
 ('klubpenguin1234abc :', 1),
 ('okay it', 1),
 ('wonderful', 1)]

## First Round of Learning

### Vectorize Features
- Vectorize the test set features
- Vectorize a small portion of the training features (the first 50), partially train the model, and then repeat

In [17]:
train_ids_1 = train_ids[:1000]
y_train_1 = y_train[:1000]
train_feature_dicts_1 = train_feature_dicts[:1000]

In [18]:
y_train_1.shape

(1000,)

In [19]:
classes = np.unique(y_train_1)
classes

array([1, 2, 3, 4])

In [41]:
v = DictVectorizer(sparse=True)

In [42]:
X_train_1 = v.fit_transform(train_feature_dicts_1)

In [21]:
v.vocabulary_

{'HUHHU': 23087,
 'r 3': 92713,
 'S in': 27665,
 'nuuu': 83271,
 'er. T': 53276,
 'yed t': 117225,
 'ag1c': 32061,
 'Alp': 19907,
 'oood ': 87198,
 'know ?': 71567,
 'influence on': 68223,
 'is..': 69328,
 'hardg': 62899,
 ' CS': 2066,
 'atte': 36029,
 'in:VMOD:2014': 67920,
 'map:ROOT:pro': 76626,
 'f l': 55508,
 'P M': 26504,
 'en be': 52244,
 'and equipments': 33787,
 "'s do": 9996,
 'at ea': 35652,
 'ry ve': 97123,
 'ng 8/': 81427,
 'olds ': 85734,
 'defu': 46895,
 ' (BT': 516,
 'r P': 92809,
 'S we': 27723,
 'ccla': 41219,
 "IX'": 23928,
 'Ri': 27441,
 'it ha': 69670,
 '(pr': 10881,
 ' So i': 3765,
 'S-st': 27802,
 'about pc': 31144,
 'f my': 55531,
 'be:VMOD:anything': 38163,
 'be:VC:make': 38092,
 'entertaining:AMOD:exciting': 52773,
 'Btw': 20254,
 'healthy': 63957,
 'developers': 47167,
 'tyl': 109524,
 'mpara': 78591,
 "any crap.don't": 34305,
 'check:ROOT:now': 41847,
 '_in': 30199,
 'to sl': 108279,
 'start:VMOD:as': 102408,
 'Legen': 24722,
 'nt: ': 83017,
 'lex w': 73554,

In [23]:
len(v.vocabulary_)

118046

In [25]:
type(X_train_1)

scipy.sparse.csr.csr_matrix

In [26]:
X_train_1.shape

(1000, 118046)

In [43]:
X_test = v.transform(test_feature_dicts)

In [49]:
X_test.shape

(996, 118046)

In [28]:
# Select 1000 features using K-best
#ch2 = SelectKBest(chi2, 1000)
#X_train_1 = ch2.fit_transform(X_train_1,
#                              y_train_1)
#X_test = ch2.fit_transform(X_test,
#                           y_test)

In [44]:
# Make SGDRegressor learner
minibatch_kmeans_clf = MiniBatchKMeans()

In [45]:
minibatch_kmeans_clf.get_params()

{'batch_size': 100,
 'compute_labels': True,
 'init': 'k-means++',
 'init_size': None,
 'max_iter': 100,
 'max_no_improvement': 10,
 'n_clusters': 8,
 'n_init': 3,
 'random_state': None,
 'reassignment_ratio': 0.01,
 'tol': 0.0,
 'verbose': 0}

In [46]:
minibatch_kmeans_clf = minibatch_kmeans_clf.partial_fit(X_train_1,
                                                        y_train_1)

In [47]:
y_train_1_preds = minibatch_kmeans_clf.predict(X_test)

In [50]:
[(y, y_pred) for y, y_pred in zip(y_train_1,
                                  y_train_1_preds)]

[(1, 5),
 (1, 5),
 (1, 5),
 (1, 5),
 (2, 4),
 (1, 1),
 (1, 5),
 (1, 5),
 (2, 5),
 (1, 1),
 (1, 5),
 (1, 5),
 (1, 1),
 (2, 5),
 (1, 5),
 (1, 0),
 (1, 5),
 (3, 5),
 (3, 4),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (4, 5),
 (1, 1),
 (1, 5),
 (2, 4),
 (1, 5),
 (1, 5),
 (1, 1),
 (1, 5),
 (1, 5),
 (1, 5),
 (1, 5),
 (1, 4),
 (1, 7),
 (1, 1),
 (1, 5),
 (1, 5),
 (1, 1),
 (1, 5),
 (1, 5),
 (1, 0),
 (1, 5),
 (1, 5),
 (1, 5),
 (2, 1),
 (2, 1),
 (1, 5),
 (2, 5),
 (2, 1),
 (1, 1),
 (2, 4),
 (1, 7),
 (1, 4),
 (1, 5),
 (1, 7),
 (3, 4),
 (1, 5),
 (1, 1),
 (2, 5),
 (1, 5),
 (3, 5),
 (3, 5),
 (1, 2),
 (1, 5),
 (1, 7),
 (3, 5),
 (1, 5),
 (1, 1),
 (1, 1),
 (1, 5),
 (4, 5),
 (1, 5),
 (1, 5),
 (1, 5),
 (1, 5),
 (3, 5),
 (1, 5),
 (1, 5),
 (1, 5),
 (1, 1),
 (3, 5),
 (1, 2),
 (1, 5),
 (3, 5),
 (1, 0),
 (2, 1),
 (1, 5),
 (1, 5),
 (1, 5),
 (3, 5),
 (1, 1),
 (1, 5),
 (2, 5),
 (1, 5),
 (1, 5),
 (1, 5),
 (1, 4),
 (1, 5),
 (1, 5),
 (1, 6),
 (1, 1),
 (1, 4),
 (1, 4),
 (1, 4),
 (3, 5),
 (3, 2),
 (1, 1),
 (2, 4),
 (1, 1),
 

## Second Round of Learning

In [229]:
train_ids_2 = train_ids[100:500]
y_2 = train_labels[100:500]
train_feature_dicts_2 = train_feature_dicts[100:500]

In [230]:
X_train_2 = v.fit_transform(train_feature_dicts_2)

In [231]:
X_test = v.fit_transform(test_feature_dicts)

In [233]:
# Select 1000 features using K-best
ch2 = SelectKBest(chi2, 1000)
X_train_2 = ch2.fit_transform(X_train_2,
                              y_2)
X_test = ch2.fit_transform(X_test,
                           test_labels)

In [234]:
sgd_reg = sgd_reg.partial_fit(X_train_2,
                              y_2)

In [235]:
y_2_preds = sgd_reg.predict(X_test)

In [236]:
[(y, y_pred) for y, y_pred in zip(test_labels,
                                  y_2_preds)]

[(2, 56886790.314545617),
 (1, -242415337788.38455),
 (1, -550413326121.13489),
 (1, -634923958223.9375),
 (1, -1410878925473.938),
 (1, -1131264076109.7231),
 (1, -246165261807.95328),
 (2, -408314895876.75403),
 (1, -683446634650.62024),
 (1, -1002306126554.3906),
 (1, -533146185945.24896),
 (1, -718814792106.73767),
 (1, -1193263659226.0879),
 (1, -699809177478.31567),
 (1, -1257246019504.7341),
 (1, -5007218969471.3789),
 (1, -56364512615.786591),
 (1, 110230490220.74554),
 (1, -2860289948488.3389),
 (1, -1976590946576.553),
 (3, -1209906780875.5093),
 (1, -1194752599237.002),
 (4, -909339879376.08911),
 (2, -15135347886.926998),
 (1, -1898294553641.2468),
 (1, -380853304840.60425),
 (1, -2134491405605.8557),
 (2, -173413318900.99435),
 (1, -776547017306.47449),
 (2, -1548412107959.1584),
 (1, -216624386574.30542),
 (1, 56886790.314545617),
 (1, -495909811838.69592),
 (1, -556731027609.12195),
 (1, -984786185028.6123),
 (1, -13933489296554.289),
 (2, -708121347122.18396),
 (1, -221

## Third Round of Learning

In [237]:
train_ids_3 = train_ids[500:1000]
y_3 = train_labels[500:1000]
train_feature_dicts_3 = train_feature_dicts[500:1000]

In [238]:
X_train_3 = v.transform(train_feature_dicts_3)

In [239]:
X_test = v.fit_transform(test_feature_dicts)

In [240]:
# Select 1000 features using K-best
ch2 = SelectKBest(chi2, 1000)
X_train_3 = ch2.fit_transform(X_train_3,
                              y_3)
X_test = ch2.fit_transform(X_test,
                           test_labels)

  chisq /= f_exp


In [241]:
sgd_reg = sgd_reg.partial_fit(X_train_3,
                              y_3)

In [242]:
y_3_preds = sgd_reg.predict(X_test)

In [243]:
[(y, y_pred) for y, y_pred in zip(test_labels,
                                  y_3_preds)]

[(2, 719644641.57894254),
 (1, -285433467743.84436),
 (1, -878462391668.88208),
 (1, -459406088213.92004),
 (1, -1983254606324.0962),
 (1, -1380487585307.3501),
 (1, -246744328118.24008),
 (2, -981082726417.18225),
 (1, -796335780327.78735),
 (1, -1520262914581.1023),
 (1, -810714233056.19128),
 (1, -1205277299620.9744),
 (1, -1549453545255.8691),
 (1, -999100660332.86487),
 (1, -2599977657016.7432),
 (1, -4565375827608.7344),
 (1, -61109349915.970848),
 (1, -276639587240.20148),
 (1, -3713101690441.0908),
 (1, -3356429368230.3984),
 (3, -1738947602561.7615),
 (1, -1868752580008.5647),
 (4, -1457707364918.948),
 (2, -23125826485.501469),
 (1, -2710371334617.4253),
 (1, -478240699358.82483),
 (1, -4242619403304.1763),
 (2, -410583442005.3158),
 (1, -1196048359200.281),
 (2, -2550958789964.9663),
 (1, -309659343659.30054),
 (1, 719644641.57894254),
 (1, -906569994745.47534),
 (1, -772086769666.61047),
 (1, -1586604365362.6633),
 (1, -19876130769011.883),
 (2, -1080085392994.0417),
 (1, -

## Fourth Round of Learning

In [244]:
train_ids_4 = train_ids[1000:2000]
y_4 = train_labels[1000:2000]
train_feature_dicts_4 = train_feature_dicts[1000:2000]

In [245]:
X_train_4 = v.transform(train_feature_dicts_4)

In [246]:
X_test = v.fit_transform(test_feature_dicts)

In [247]:
# Select 1000 features using K-best
ch2 = SelectKBest(chi2, 1000)
X_train_4 = ch2.fit_transform(X_train_4,
                              y_4)
X_test = ch2.fit_transform(X_test,
                           test_labels)

  chisq /= f_exp


In [248]:
sgd_reg = sgd_reg.partial_fit(X_train_4,
                              y_4)

In [249]:
y_4_preds = sgd_reg.predict(X_test)

In [250]:
[(y, y_pred) for y, y_pred in zip(test_labels,
                                  y_4_preds)]

[(2, 798889302.41719568),
 (1, -29676582836487.035),
 (1, 53366208297662.047),
 (1, -134704343342900.95),
 (1, 192501304232745.59),
 (1, -15909146813513.871),
 (1, -18306706273529.797),
 (2, -17974161629581.684),
 (1, 21994954771399.059),
 (1, 32699107841806.566),
 (1, -31965112878768.965),
 (1, 15759000333572.852),
 (1, 29574366533530.742),
 (1, 9252878055745.3613),
 (1, -35962043384823.0),
 (1, -499569704058839.81),
 (1, -9674243205765.7539),
 (1, 36030538836499.07),
 (1, 155135389316860.38),
 (1, -7659572383889.1982),
 (3, -16830145174544.154),
 (1, 31247442698424.137),
 (4, -20305081610067.461),
 (2, 1517806811617.0825),
 (1, -78632204890010.781),
 (1, 6094693798907.1309),
 (1, 134586159827339.3),
 (2, -47263801830158.008),
 (1, 17200476634009.098),
 (2, 194770371683233.47),
 (1, -15967470990565.178),
 (1, 798889302.41719568),
 (1, -55749889978802.109),
 (1, -9296499154241.0938),
 (1, 195816021857171.22),
 (1, 2088875086505295.8),
 (2, 486886852832.54877),
 (1, -121593931253651.17)