In [1]:
import argparse
from base64 import b64decode

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

import csv

In [2]:
class DocItem:
    """Plain record for one document of the dataset.

    Instance fields, all set directly from the constructor arguments:

    * ``html`` - value passed as ``html`` (default ``''``)
    * ``mark`` - value passed as ``mark`` (default ``False``)
    * ``id``   - value passed as ``url_id`` (default ``0``)
    """

    def __init__(self, html='', mark=False, url_id=0):
        # Note the rename: the ``url_id`` argument is exposed as ``self.id``.
        self.html, self.mark, self.id = html, mark, url_id

In [3]:
def read_dataset(input_file_name):
    """Load a tab-separated dataset file into a list of ``DocItem`` objects.

    The first line of the file is treated as a header and discarded. Every
    following non-blank line must contain at least four tab-separated columns:

    0. integer document id, stored as ``DocItem.id``;
    1. integer label, converted with ``bool(int(...))`` into ``DocItem.mark``;
    2. the url column, which is not used here;
    3. the base64-encoded page, decoded as UTF-8 (undecodable bytes are
       replaced) into ``DocItem.html``.

    Blank lines, such as an empty line at the end of the file, are skipped
    rather than raising ``ValueError``. An empty last column yields an empty
    ``html`` string.

    Args:
        input_file_name: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        A list of ``DocItem`` in file order (empty for an empty file).

    Raises:
        ValueError: If the id or label column is not an integer.
        IndexError: If a non-blank row has fewer than four columns.
        binascii.Error: If the page column is not valid base64.
    """
    dataset = []
    with open(input_file_name, "r", encoding="utf-8") as input_file:
        next(input_file, None)  # discard the header row; harmless on an empty file
        for line in input_file:
            if not line.strip():
                continue  # blank line carries no record
            # Remove only the line terminator: a full strip() would also eat a
            # trailing tab and silently drop an empty last column.
            parts = line.rstrip('\r\n').split('\t')
            url_id = int(parts[0])
            mark = bool(int(parts[1]))
            # parts[2] is the url column; it is not needed here.
            html = b64decode(parts[3]).decode("utf-8", errors="replace")
            dataset.append(DocItem(html=html, mark=mark, url_id=url_id))
    return dataset

In [4]:
def get_data(vectorizer, data):
    """Build the feature matrix and label list for a collection of documents.

    Args:
        vectorizer: Object with a ``transform`` method that accepts a list of
            strings. It is only used for transforming here, never fitted.
        data: Iterable of objects exposing ``.html`` and ``.mark``. It is
            iterated twice (labels first, then texts).

    Returns:
        A tuple ``(X, y)`` where ``X`` is whatever ``vectorizer.transform``
        returns for the ``.html`` values and ``y`` is the list of ``.mark``
        values, both in input order.
    """
    labels = [item.mark for item in data]
    texts = [item.html for item in data]
    features = vectorizer.transform(texts)
    return features, labels

In [5]:
# Input files (tab-separated, parsed by read_dataset). Kept as named constants
# so the locations can be changed in one place.
TRAIN_PATH = 'kaggle_train_data_tab.csv'
TEST_PATH = 'kaggle_test_data_tab.csv'

train = read_dataset(TRAIN_PATH)
test = read_dataset(TEST_PATH)

# Feature extractor: TF-IDF over 1- to 3-grams with sublinear term-frequency
# scaling. Other configurations kept here for reference:
#   CountVectorizer()
#   TfidfVectorizer()
#   TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)
vectorizer = TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True)

In [6]:
%%time
# Fit the vectorizer on the training documents only; the test set is not
# used for fitting.
vectorizer.fit([document.html for document in train])

CPU times: user 3min 26s, sys: 6.33 s, total: 3min 33s
Wall time: 3min 35s


TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True)

In [7]:
%%time
# Vectorize the training documents and collect their labels.
X_train, y_train = get_data(vectorizer, train)

CPU times: user 1min 53s, sys: 1.47 s, total: 1min 55s
Wall time: 2min


In [8]:
%%time
# Vectorize the test documents with the vectorizer fitted on the training set.
# y_test is whatever the label column of the test file contains; it is not
# referenced by any of the visible cells below.
X_test, y_test = get_data(vectorizer, test)

CPU times: user 4min 27s, sys: 5.08 s, total: 4min 32s
Wall time: 4min 44s


In [9]:
# Sanity check: (number of test documents, number of features).
X_test.shape

(16039, 19390202)

In [10]:
# Sanity check: (number of training documents, number of features); the
# feature count should match the test matrix.
X_train.shape

(7044, 19390202)

In [11]:
# Linear classifier trained with stochastic gradient descent, library defaults
# otherwise. SGD shuffles the training data, so a fixed random_state is needed
# for the scores and predictions to be repeatable across runs.
clf = SGDClassifier(random_state=42)

In [12]:
# 3-fold cross-validated F1 on the training data; one score per fold.
fold_scores = cross_val_score(clf, X_train, y_train, scoring='f1', cv=3)
print(' score = ', fold_scores)

 score =  [0.98452469 0.98496516 0.98604993]


In [13]:
%%time
# Train the final model on the full training set.
clf.fit(X_train, y_train)

CPU times: user 5.75 s, sys: 106 ms, total: 5.85 s
Wall time: 5.53 s


SGDClassifier()

In [14]:
%%time
# Predict a label for every row of the test matrix.
y_tst = clf.predict(X_test)

CPU times: user 688 ms, sys: 73 µs, total: 688 ms
Wall time: 671 ms


In [15]:
# Write the submission file: a header row, then one (Id, Prediction) row per
# test document. The csv module requires the file to be opened with
# newline=''; without it the writer's own '\r\n' terminators are translated
# again on Windows and every row is followed by a blank line.
with open('my_submission.csv', 'w', newline='', encoding='utf-8') as fout:
    writer = csv.writer(fout)
    writer.writerow(['Id', 'Prediction'])
    # zip pairs each test document with the prediction at the same position.
    writer.writerows([doc.id, int(mark)] for doc, mark in zip(test, y_tst))