In [7]:
import re
import sys
import csv
import random

import numpy as np
import pandas as pd

import dask
import dask.dataframe as dd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.pipeline import Pipeline

from pathlib import Path

from tqdm import tqdm
from wordcloud import WordCloud, STOPWORDS

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.simplefilter("ignore")

In [16]:
# Locations of the training and held-out CSV files, relative to the
# notebook's working directory.
train_file, test_file = (
    'data/train_categorical_data.csv',
    'data/test_categorical_data.csv',
)

In [13]:
# Size the validation sample: count the physical lines of the test CSV
# (the header line is included in this count) and take 20% of that number.
with open(test_file, 'r') as f:
    # Stream the file so it never has to fit in memory.
    num_lines = sum(1 for _ in f)

print('Number of lines: {}'.format(num_lines))

# Number of rows to keep when sampling the test file.
sample_fraction = 0.2
s = int(round(num_lines * sample_fraction))

Number of lines: 1625303


In [14]:
# Load a random sample of `s` rows from the test CSV without reading the whole
# file: pick the line numbers to drop and let pandas skip them while parsing.
#
# `num_lines` counts every physical line, header included. Line 0 is the header
# and must be kept, so the data lines are 1 .. num_lines - 1 (0-indexed).
# Sampling from that exact range keeps exactly `s` data rows; an upper bound of
# num_lines + 1 would include a line index that does not exist in the file.
# NOTE(review): assumes one CSV record per physical line (no quoted newlines
# inside `news_content`) - confirm against the data.
n_data_lines = max(num_lines - 1, 0)
n_to_skip = max(n_data_lines - s, 0)

# A dedicated, seeded generator makes the validation sample reproducible across
# kernel restarts without touching the global `random` state.
sample_rng = random.Random(42)
skip = sorted(sample_rng.sample(range(1, num_lines), n_to_skip))
test_df = pd.read_csv(test_file, skiprows=skip)

In [None]:
# Out-of-core training: stream the training CSV in small chunks, update the
# classifier incrementally with partial_fit, and record the validation
# accuracy as a function of the number of training samples seen so far.
validation_scores = []
training_set_size = []

# Cadence knobs. Scoring runs against the whole validation matrix each time,
# so raise `eval_every_n_samples` (keep it a multiple of `chunksize`) to trade
# learning-curve resolution for speed.
chunksize = 100
eval_every_n_samples = 100
log_every_n_batches = 100

# Build the vectorizer and the classifier
h_vectorizer = HashingVectorizer(encoding='latin-1')
clf = PassiveAggressiveClassifier(C=1)

# Extract the features for the validation once and for all
text_validation = test_df['news_content'].tolist()
target_validation = test_df['category'].tolist()

# transform() is called without a prior fit() on the hashing vectorizer.
X_validation = h_vectorizer.transform(text_validation)
# The full label set is handed to every partial_fit call.
# NOTE(review): assumes `category` holds the integers 0..11 - confirm.
classes = np.arange(12)

n_samples = 0
reader = pd.read_csv(train_file, chunksize=chunksize)

for i, df in enumerate(reader):
    # Get batch
    texts_in_batch = df['news_content'].tolist()
    targets_in_batch = df['category'].tolist()

    n_samples += len(texts_in_batch)

    # Vectorize the text in the batch
    X_batch = h_vectorizer.transform(texts_in_batch)

    # Incrementally train the model on the new batch
    clf.partial_fit(X_batch, targets_in_batch, classes=classes)

    evaluated = n_samples % eval_every_n_samples == 0
    if evaluated:
        # Compute the validation score of the current state of the model
        score = clf.score(X_validation, target_validation)
        validation_scores.append(score)
        training_set_size.append(n_samples)

    # Only report a score computed in this very iteration: otherwise `score`
    # could be unbound (first chunk partial) or stale (partial final chunk)
    # while being printed next to the current `n_samples`.
    if evaluated and i % log_every_n_batches == 0:
        print("n_samples: {0}, score: {1:.4f}".format(n_samples, score))

n_samples: 100, score: 0.2019
n_samples: 10100, score: 0.2109
n_samples: 20100, score: 0.2205
n_samples: 30100, score: 0.2867
n_samples: 40100, score: 0.3091
n_samples: 50100, score: 0.2745
n_samples: 60100, score: 0.3356
n_samples: 70100, score: 0.3087
n_samples: 80100, score: 0.2987
n_samples: 90100, score: 0.2819
n_samples: 100100, score: 0.3144
n_samples: 110100, score: 0.2986
n_samples: 120100, score: 0.3180
n_samples: 130100, score: 0.3252
n_samples: 140100, score: 0.3296
n_samples: 150100, score: 0.3525
n_samples: 160100, score: 0.3531
n_samples: 170100, score: 0.3441
n_samples: 180100, score: 0.3484
n_samples: 190100, score: 0.3033
n_samples: 200100, score: 0.3575
n_samples: 210100, score: 0.3615
n_samples: 220100, score: 0.3645
n_samples: 230100, score: 0.3801
n_samples: 240100, score: 0.3742
n_samples: 250100, score: 0.3753
n_samples: 260100, score: 0.3381
n_samples: 270100, score: 0.3455
n_samples: 280100, score: 0.4107
n_samples: 290100, score: 0.4139
n_samples: 300100, sco

n_samples: 2450100, score: 0.3963
n_samples: 2460100, score: 0.3915
n_samples: 2470100, score: 0.4545
n_samples: 2480100, score: 0.4471
n_samples: 2490100, score: 0.4292
n_samples: 2500100, score: 0.3918
n_samples: 2510100, score: 0.4459
n_samples: 2520100, score: 0.4098
n_samples: 2530100, score: 0.4288
n_samples: 2540100, score: 0.4313
n_samples: 2550100, score: 0.4224
n_samples: 2560100, score: 0.4171
n_samples: 2570100, score: 0.4121
n_samples: 2580100, score: 0.4159
n_samples: 2590100, score: 0.4171
n_samples: 2600100, score: 0.4279
n_samples: 2610100, score: 0.4259
n_samples: 2620100, score: 0.4393
n_samples: 2630100, score: 0.4470
n_samples: 2640100, score: 0.4543
n_samples: 2650100, score: 0.4348
n_samples: 2660100, score: 0.4531
n_samples: 2670100, score: 0.4599
n_samples: 2680100, score: 0.4695
n_samples: 2690100, score: 0.4592
n_samples: 2700100, score: 0.4536
n_samples: 2710100, score: 0.4623
n_samples: 2720100, score: 0.4649
n_samples: 2730100, score: 0.4625
n_samples: 274

In [None]:
# Learning curve: validation score against the number of training samples
# consumed so far, drawn on an explicitly created figure/axes pair.
fig, ax = plt.subplots()
ax.plot(training_set_size, validation_scores)
ax.set_xlabel("Number of samples")
ax.set_ylabel("Validation score")
# Trailing semicolon keeps the notebook from echoing the Text object's repr.
ax.set_title('Progression of validation score with number of samples');