In [1]:
import numpy as np
import pandas as pd
import re
import pyprind

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from nltk.corpus import stopwords

# English stop words, stored as a set: tokenizer() performs one membership
# test per token, and a set lookup is O(1) instead of a scan over a list.
stop = set(stopwords.words('english'))

In [2]:
# One seed for the whole notebook: it feeds NumPy's global RNG here and the
# classifier's random_state further down, so there is a single value to change.
RANDOM_SEED = 1
np.random.seed(RANDOM_SEED)
# Print NumPy arrays with two decimal places.
np.set_printoptions(precision=2)

In [3]:
# Load only the two columns that are used, and drop rows without comment text.
data_file = '../data/train-balanced-sarcasm.csv'
# usecols avoids parsing columns that would be thrown away immediately; it
# keeps the file's own column order, so the final .loc re-selects explicitly to
# guarantee 'comment' comes first and 'label' second (stream_docs() reads the
# columns by position).
data = (
    pd.read_csv(data_file, usecols=['comment', 'label'])
    .dropna(subset=['comment'])
    .loc[:, ['comment', 'label']]
)

In [4]:
def tokenizer(text):
    """Split a raw document into lower-cased word tokens plus emoticons.

    Steps:
      1. Strip anything that looks like an HTML tag (``<...>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P``.
      3. Lower-case the text and collapse every run of non-word characters
         into a single space.
      4. Append the emoticons (with the ``-`` nose removed) as extra tokens.
      5. Drop tokens that appear in the module-level ``stop`` collection.

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    list of str
        The remaining tokens, words first, emoticons last.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Search the original-case text: the pattern spells its letter emoticons
    # with upper-case D / P, so it can never match once the text is lowered.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    words = re.sub(r'[\W]+', ' ', text.lower())
    # The explicit ' ' keeps the last word from fusing with the first emoticon
    # (e.g. "film:)"); surplus whitespace is harmless because of split().
    text = words + ' ' + ' '.join(emoticons).replace('-', '')
    return [w for w in text.split() if w not in stop]

def stream_docs(data):
    """Lazily yield one ``(text, label)`` pair per row of ``data``.

    ``data`` is iterated with ``itertuples()``, which places the row index at
    position 0; the text is therefore taken from position 1 (the first column)
    and the label from position 2 (the second column).
    """
    for record in data.itertuples():
        yield record[1], record[2]


In [5]:
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` ``(text, label)`` pairs from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs; it is advanced with
        ``next()``, so consecutive calls continue where the last one stopped.
    size : int
        Maximum number of documents to return.

    Returns
    -------
    (list, list) or (None, None)
        The documents and their labels. If the stream ends before ``size``
        pairs were read, the pairs collected so far are returned as a shorter
        batch rather than being thrown away. ``(None, None)`` is returned only
        when no document could be read at all.
    """
    docs, y = [], []
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            # Stream exhausted: keep whatever was gathered for this batch.
            break
        docs.append(text)
        y.append(label)
    if not docs:
        return None, None
    return docs, y

In [6]:
# Vectorizer that hashes the tokens produced by tokenizer() into a feature
# space of 2**21 columns; the notebook only ever calls transform() on it.
vect = HashingVectorizer(
    decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer=tokenizer,
)

In [7]:
# Linear classifier with logistic loss, trained incrementally via partial_fit().
# scikit-learn 1.1 renamed this loss from 'log' to 'log_loss' and 1.3 removed
# the old spelling, so the current name is used here.
clf = SGDClassifier(loss='log_loss', random_state=RANDOM_SEED, max_iter=1)

In [8]:
# Single-pass generator over the rows of `data`. The training and evaluation
# cells both draw from this same object, so each document is handed out once;
# re-run this cell to start again from the first row.
doc_stream = stream_docs(data=data)

In [9]:
# Progress bar sized to the 45 iterations of the training loop below; the two
# numbers must be kept in step by hand.
pbar = pyprind.ProgBar(45)

In [10]:
# Out-of-core training: feed the classifier up to 45 mini-batches of 1000
# documents each, pulled one after another from doc_stream.
# partial_fit() is told the complete set of label values on every call.
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    # get_minibatch() signals an exhausted stream with (None, None).
    if not X_train:
        break
    # Raw strings -> hashed feature matrix, then one incremental update.
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:02


In [11]:
# Evaluate on the next 5000 documents in the stream, i.e. rows the training
# loop has not consumed.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
# NOTE(review): the recorded run printed 0.389, which is below 0.5 on a
# two-class problem. Presumably the CSV rows are not in random order, so the
# first rows differ systematically from the ones held out here -- TODO confirm
# and consider shuffling before streaming.
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.389


In [12]:
# Now that the held-out batch has been scored, use it for one more incremental
# update of the model. No `classes` argument is passed on this call.
clf = clf.partial_fit(X_test, y_test)