In [1]:
import re 
from nltk.corpus import stopwords
import codecs
from pathlib import Path

# Directory (relative to the notebook's working directory) that holds the raw
# and preprocessed TSV files used below.
DATA_PATH = Path('data/')
# English stop words from the NLTK `stopwords` corpus reader, materialised as a
# set so that membership tests are constant-time.
stopwords_set = set(stopwords.words('english'))

# Compiled once at definition time rather than on every call; raw strings keep
# the regex escapes (\[, \], \|) from being read as (invalid) string escapes.
_REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
_BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')


def text_prepare(text, stop_words=None):
    """Normalise a piece of text into lower-case, space-separated tokens.

    The text is lower-cased, each of the characters ``/(){}[]|@,;`` is replaced
    by a space, every remaining character other than ``0-9``, ``a-z``, space,
    ``#``, ``+`` and ``_`` is deleted, and finally stop words are dropped.

    Args:
        text: The string to normalise.
        stop_words: Optional container of tokens to remove. When omitted, the
            module-level ``stopwords_set`` is used.

    Returns:
        The surviving tokens joined by single spaces (possibly an empty
        string).
    """
    if stop_words is None:
        # Reuse the set built once at module level instead of reloading the
        # NLTK word list for every input line.
        stop_words = stopwords_set

    text = _REPLACE_BY_SPACE_RE.sub(' ', text.lower())
    text = _BAD_SYMBOLS_RE.sub('', text)
    # str.split() never yields empty tokens, so no extra emptiness check or
    # final strip is required.
    return ' '.join(token for token in text.split() if token not in stop_words)

def prepare_file(in_, out_):
    """Write a normalised copy of a tab-separated text file.

    Every line of ``in_`` is stripped, split on tab characters, each field is
    passed through ``text_prepare``, and the results are written to ``out_``
    as one tab-separated line.

    Args:
        in_: Path of the input file, read as UTF-8.
        out_: Path of the output file; it is created or overwritten and
            written as UTF-8.
    """
    # Context managers close both handles even when text_prepare raises; the
    # explicit output encoding matches the input instead of relying on the
    # platform default.
    with open(in_, encoding='utf8') as source, \
            open(out_, 'w', encoding='utf8') as target:
        for line in source:
            fields = line.strip().split('\t')
            print(*(text_prepare(q) for q in fields), sep='\t', file=target)

In [2]:
# Normalise the raw training file into the tab-separated file that the
# StarSpace commands below read.
prepare_file(
    DATA_PATH / 'train.tsv',
    DATA_PATH / 'starspace_train.tsv',
)

In [2]:
%%bash
# Train 100-dimensional StarSpace embeddings (unigrams, 5 epochs, cosine
# similarity) on data/starspace_train.tsv. The model is written to
# `starspace_embedding` in the current working directory.
# NOTE(review): -trainMode 3 with -fileFormat labelDoc presumably treats each
# tab-separated line as a group of related sentences - confirm against the
# StarSpace documentation.
starspace train -trainFile 'data/starspace_train.tsv' -model starspace_embedding \
-trainMode 3 \
-adagrad true \
-ngrams 1 \
-epoch 5 \
-dim 100 \
-similarity "cosine" \
-minCount 2 \
-verbose true \
-fileFormat labelDoc \
-negSearchLimit 10 \
-lr 0.05 \
-thread 4

Process is interrupted.


In [4]:
# 100 dimension
!starspace train -ngrams 1 -minCount 2 -fileFormat labelDoc -thread 4 -trainFile data/starspace_train100.tsv -model models/starspace_embedding -initRandSd 0.01 -adagrad true -lr 0.05 -epoch 5 -dim 100 -negSearchLimit 10 -trainMode 3 -similarity "cosine" -verbose true

Arguments: 
lr: 0.05
dim: 100
epoch: 5
maxTrainTime: 8640000
saveEveryEpoch: 0
loss: hinge
margin: 0.05
similarity: cosine
maxNegSamples: 10
negSearchLimit: 10
thread: 4
minCount: 2
minCountLabel: 1
label: __label__
ngrams: 1
bucket: 2000000
adagrad: 1
trainMode: 3
fileFormat: labelDoc
normalizeText: 0
dropoutLHS: 0
dropoutRHS: 0
Start to initialize starspace model.
Build dict from input file : data/starspace_train.tsv
Read 12M words
Number of words in dictionary:  95058
Number of labels in dictionary: 0
Loading data from file : data/starspace_train.tsv
Total number of examples loaded : 999740
Initialized model weights. Model size :
matrix : 95058 100
Training epoch 0: 0.05 0.01
Epoch: 100.0%  lr: 0.040020  loss: 0.008152  eta: 0h6m  tot: 0h1m37s  (20.0%)  tot: 0h0m4s  (0.9%)0.017044  eta: 0h8m  tot: 0h0m11s  (2.3%)0h8m  tot: 0h0m20s  (4.1%)0h7m  tot: 0h0m25s  (5.0%)0.012744  eta: 0h7m  tot: 0h0m27s  (5.4%)0.012498  eta: 0h7m  tot: 0h0m28s  (5.7%)0h7m  tot: 0h0m30s  (6.2%)0h7m  tot: 0h

In [7]:
# 300-dimensional embeddings, unigrams (-ngrams 1), 5 epochs.
# Reads data/starspace_train.tsv and writes the model to
# models/starspace_embedding300.
!starspace train -ngrams 1 -minCount 2 -fileFormat labelDoc -thread 4 -trainFile data/starspace_train.tsv -model models/starspace_embedding300 -initRandSd 0.01 -adagrad true -lr 0.05 -epoch 5 -dim 300 -negSearchLimit 10 -trainMode 3 -similarity "cosine" -verbose true


Arguments: 
lr: 0.05
dim: 300
epoch: 5
maxTrainTime: 8640000
saveEveryEpoch: 0
loss: hinge
margin: 0.05
similarity: cosine
maxNegSamples: 10
negSearchLimit: 10
thread: 4
minCount: 2
minCountLabel: 1
label: __label__
ngrams: 1
bucket: 2000000
adagrad: 1
trainMode: 3
fileFormat: labelDoc
normalizeText: 0
dropoutLHS: 0
dropoutRHS: 0
Start to initialize starspace model.
Build dict from input file : data/starspace_train.tsv
Read 12M words
Number of words in dictionary:  95058
Number of labels in dictionary: 0
Loading data from file : data/starspace_train.tsv
Total number of examples loaded : 999740
Initialized model weights. Model size :
matrix : 95058 300
Training epoch 0: 0.05 0.01
Epoch: 100.0%  lr: 0.040030  loss: 0.007539  eta: 0h19m  tot: 0h4m47s  (20.0%)m  tot: 0h0m9s  (0.6%)0.019305  eta: 0h25m  tot: 0h0m18s  (1.2%)0h25m  tot: 0h0m21s  (1.4%)  tot: 0h0m29s  (2.0%)0.015431  eta: 0h24m  tot: 0h0m36s  (2.4%)0h23m  tot: 0h1m8s  (4.6%)0h23m  tot: 0h1m9s  (4.7%)0.011746  eta: 0h23m  tot: 

In [1]:
# 300-dimensional embeddings with word bigrams (-ngrams 2), 8 epochs.
# Reads data/starspace_train.tsv and writes the model to
# models/starspace_embedding300_ngram2.
!starspace train -ngrams 2 -minCount 2 -fileFormat labelDoc -thread 4 -trainFile data/starspace_train.tsv -model models/starspace_embedding300_ngram2 -initRandSd 0.01 -adagrad true -lr 0.05 -epoch 8 -dim 300 -negSearchLimit 10 -trainMode 3 -similarity "cosine" -verbose true


Arguments: 
lr: 0.05
dim: 300
epoch: 8
maxTrainTime: 8640000
saveEveryEpoch: 0
loss: hinge
margin: 0.05
similarity: cosine
maxNegSamples: 10
negSearchLimit: 10
thread: 4
minCount: 2
minCountLabel: 1
label: __label__
ngrams: 2
bucket: 2000000
adagrad: 1
trainMode: 3
fileFormat: labelDoc
normalizeText: 0
dropoutLHS: 0
dropoutRHS: 0
Start to initialize starspace model.
Build dict from input file : data/starspace_train.tsv
Read 12M words
Number of words in dictionary:  95058
Number of labels in dictionary: 0
Loading data from file : data/starspace_train.tsv
Total number of examples loaded : 999831
Initialized model weights. Model size :
matrix : 2095058 300
Training epoch 0: 0.05 0.00625
Epoch: 100.0%  lr: 0.043751  loss: 0.008665  eta: 0h53m  tot: 0h7m35s  (12.5%)9405  eta: 1h4m  tot: 0h0m35s  (0.9%)0h59m  tot: 0h2m35s  (4.2%)0.011943  eta: 0h59m  tot: 0h2m47s  (4.5%)0h59m  tot: 0h2m59s  (4.8%)h58m  tot: 0h3m31s  (5.7%)  eta: 0h58m  tot: 0h3m36s  (5.8%)0h58m  tot: 0h3m45s  (6.1%)0.045690 

In [4]:
# Show the first lines of the preprocessed training file as a sanity check.
!head data/starspace_train.tsv

converting string list	convert google results object pure js python object
html 5 canvas javascript use making interactive drawing tool	event handling geometries threejs
sending array via ajax fails	getting list items unordered list php
insert cookiecollection cookiecontainer	c# create cookie string send
updating one element bound observable collection	wpf update changes list item list
mongodb error find	retrieve queried element object array mongodb collection
select2 displaying search results	use jquery ajax outside domain
using reduce merge multiple data frames passing arguments without defining function outside reduce syntax	r merge list data frames one data frame missing values row
adding prototype javascript object literal	javascript prototype work	javascript setting property undefined prototyped object
whats best way get directory assembly executing	dependency inversion principle important	dependency inversion compile time configured dependency injection aspnet mvc 4 sol

In [2]:
# 100-dimensional embeddings with word bigrams (-ngrams 2), 8 epochs.
# Reads data/starspace_train.tsv and writes the model to
# models/starspace_embedding100_ngram2.
!starspace train -ngrams 2 -minCount 2 -fileFormat labelDoc -thread 4 -trainFile data/starspace_train.tsv -model models/starspace_embedding100_ngram2 -initRandSd 0.01 -adagrad true -lr 0.05 -epoch 8 -dim 100 -negSearchLimit 10 -trainMode 3 -similarity "cosine" -verbose true


Arguments: 
lr: 0.05
dim: 100
epoch: 8
maxTrainTime: 8640000
saveEveryEpoch: 0
loss: hinge
margin: 0.05
similarity: cosine
maxNegSamples: 10
negSearchLimit: 10
thread: 4
minCount: 2
minCountLabel: 1
label: __label__
ngrams: 2
bucket: 2000000
adagrad: 1
trainMode: 3
fileFormat: labelDoc
normalizeText: 0
dropoutLHS: 0
dropoutRHS: 0
Start to initialize starspace model.
Build dict from input file : data/starspace_train.tsv
Read 12M words
Number of words in dictionary:  95058
Number of labels in dictionary: 0
Loading data from file : data/starspace_train.tsv
Total number of examples loaded : 999831
Initialized model weights. Model size :
matrix : 2095058 100
Training epoch 0: 0.05 0.00625
Epoch: 100.0%  lr: 0.043764  loss: 0.009110  eta: 0h18m  tot: 0h2m36s  (12.5%)0s  (0.0%)0h21m  tot: 0h0m35s  (2.7%)0h20m  tot: 0h0m42s  (3.3%)0h20m  tot: 0h0m47s  (3.7%)m  tot: 0h1m1s  (4.8%)0h20m  tot: 0h1m10s  (5.5%)0h19m  tot: 0h1m23s  (6.6%)  eta: 0h19m  tot: 0h1m48s  (8.6%)0h19m  tot: 0h1m55s  (9.2%)0

TODO: Should I train the embeddings on my own data?
- Option to try: `-trainMode 5`