# Classification with word embedding techniques

In [1]:
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tqdm.notebook import tqdm
import gensim
import nltk
import numpy as np
import os
import pandas as pd

## TF-IDF

### Load data

In [2]:
filenames = os.listdir(os.path.join('data', 'cbr-ilp-ir-son-int'))
corpus = []
for filename in filenames:
    with open(os.path.join('data', 'cbr-ilp-ir-son-int', filename), 'r', encoding='latin-1') as f:
        corpus.append(f.read())

### The data

In [3]:
corpus[0][:500]

"A cross-modal electronic travel aid device           \n\nF. Fontana, A. Fusiello, M. Gobbi, V. Murino,                       \nD. Rocchesso, L. Sartor, A. Panuccio                           \nDipartimen to di Informatica, University of Verona                  \nCa' Vignal 2, Strada Le Grazie 15, 37134 Verona, Italy                 \n{fontana,fusiello,m urino,rocchesso,panucciog}@sci.univr.it             \n\n\n\nAbstract.   \nThis paper describes the design of an Electronic Travel Aid        \ndevice, that w"

In [4]:
classes = [(filename.split('-')[0]).split('_')[0] for filename in filenames]
which_class = dict(zip(set(classes), range(5)))
y = np.array([which_class[name] for name in classes])

### Vectorize it

In [5]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
X = X.todense()
X = np.array(X)

### Split data

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=314)

### Train model

In [8]:
%%time
scale = preprocessing.StandardScaler().fit(X_train)
lr = LogisticRegressionCV(cv=5, random_state=314).fit(X_train, y_train)

CPU times: user 2min 36s, sys: 1.13 s, total: 2min 37s
Wall time: 59.7 s


### Evaluate

In [9]:
# Training data
accuracy_score(y_train, lr.predict(X_train))

1.0

In [10]:
# Test data
accuracy_score(y_test, lr.predict(X_test))

0.9781021897810219

## word2vec

### Vectorize it

In [11]:
tokenized_corpus = [word_tokenize(doc) for doc in corpus]

In [12]:
%%time
model = gensim.models.Word2Vec(tokenized_corpus)

CPU times: user 6.17 s, sys: 20.5 ms, total: 6.19 s
Wall time: 3.15 s


In [13]:
# Curiosity
model.wv.most_similar('novel')

[('provides', 0.9918190240859985),
 ('sketch', 0.9878435134887695),
 ('goal', 0.987826943397522),
 ('bootstrapping', 0.9877110719680786),
 ('question', 0.9874297380447388),
 ('good', 0.9862766265869141),
 ('give', 0.9858249425888062),
 ('evaluate', 0.9857773780822754),
 ('questions', 0.9849929809570312),
 ('technique', 0.9849549531936646)]

In [14]:
w2v = dict(zip(model.wv.index2word, model.wv.vectors))

In [15]:
vectorized_corpus = []
for doc in tqdm(tokenized_corpus):
    vectorized_doc = []
    for word in doc:
        if word in model.wv.index2word:
            vectorized_doc.append(w2v[word])
    vectorized_corpus.append(np.sum(vectorized_doc, axis=0))

HBox(children=(FloatProgress(value=0.0, max=681.0), HTML(value='')))




In [16]:
X = np.array(vectorized_corpus)

### Split data

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=314)

### Train model

In [18]:
%%time
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
lr = LogisticRegressionCV(cv=5, random_state=314, max_iter=10000).fit(X_train, y_train)

CPU times: user 25.5 s, sys: 348 ms, total: 25.9 s
Wall time: 6.71 s


### Evaluate

In [19]:
# Training data
accuracy_score(y_train, lr.predict(X_train))

0.9779411764705882

In [20]:
# Test data
accuracy_score(y_test, lr.predict(X_test))

0.9051094890510949