# Classification with word embedding techniques

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
import numpy as np
import os
import pandas as pd

## TF-IDF

### Load data

In [2]:
# Read every document of the corpus directory into memory as one string each.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# the document order (and hence the seeded train/test split) reproducible.
data_dir = os.path.join('data', 'cbr-ilp-ir-son-int')
filenames = sorted(os.listdir(data_dir))
corpus = []
for filename in filenames:
    # latin-1 maps every byte to a character, so decoding cannot fail
    with open(os.path.join(data_dir, filename), 'r', encoding='latin-1') as f:
        corpus.append(f.read())

### The data

In [3]:
# Preview the first 500 characters of the first document
corpus[0][:500]

"A cross-modal electronic travel aid device           \n\nF. Fontana, A. Fusiello, M. Gobbi, V. Murino,                       \nD. Rocchesso, L. Sartor, A. Panuccio                           \nDipartimen to di Informatica, University of Verona                  \nCa' Vignal 2, Strada Le Grazie 15, 37134 Verona, Italy                 \n{fontana,fusiello,m urino,rocchesso,panucciog}@sci.univr.it             \n\n\n\nAbstract.   \nThis paper describes the design of an Electronic Travel Aid        \ndevice, that w"

### Vectorize it

In [4]:
# Turn each document into a TF-IDF feature vector (one row per document).
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so the IDF weights also see the test documents; consider
# fitting on the training documents only.
vectorizer = TfidfVectorizer()
# toarray() returns a dense ndarray directly, avoiding the intermediate
# np.matrix (and extra copy) produced by todense() + np.array().
X = vectorizer.fit_transform(corpus).toarray()

In [5]:
# The class name is the part of the filename before the first '-', further
# cut at the first '_'.
classes = [filename.split('-')[0].split('_')[0] for filename in filenames]
# Map each distinct class name to an integer id. Sorting makes the mapping
# stable across runs (iteration order of a set of strings is not), and
# enumerate covers however many classes exist instead of assuming exactly 5.
which_class = {name: index for index, name in enumerate(sorted(set(classes)))}
# Integer label for every document, aligned with `filenames`.
y = np.array([which_class[name] for name in classes])

### Split data

In [6]:
# Hold out 20% of the documents for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=314,
)

### Train model

In [7]:
# Fit a cross-validated logistic regression (5 folds, fixed seed) on the training split.
lr = LogisticRegressionCV(
    cv=5,
    random_state=314,
).fit(X_train, y_train)

### Evaluate

In [8]:
# Training accuracy: fraction of training documents whose predicted label is correct
np.mean(lr.predict(X_train) == y_train)

1.0

In [9]:
# Test accuracy: fraction of held-out documents whose predicted label is correct
np.mean(lr.predict(X_test) == y_test)

0.9781021897810219