In [None]:
import numpy as np
import pandas as pd
import pickle
import tensorflow as tf
import gensim
from gensim.models import Word2Vec
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
import string
from sklearn.utils import shuffle
import multiprocessing
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics

In [None]:
# Load the news-headline dataset, keeping only the headline text and its category.
# `usecols` makes pandas parse just these two columns instead of the whole file.
# Categories: b = business, t = science and technology, e = entertainment, m = health
df = pd.read_csv('uci-news-aggregator.csv', usecols=['TITLE', 'CATEGORY'])
df = df[['TITLE', 'CATEGORY']]  # usecols keeps the file's column order; pin ours explicitly

In [None]:
# Shuffle the rows so that samples are not grouped by category (feeding a learner
# long runs of a single category can leave it stuck in a poor local minimum).
# A fixed random_state keeps the row order, and everything derived from it,
# identical across "Restart & Run All" executions.
df = shuffle(df, random_state = 42)
df = df.reset_index(drop = True)
df.head()

In [None]:
def TreatText(data):
    """Clean and tokenize the 'TITLE' column of `data`.

    Each title is processed in a single pass:
      1. every character that is not an ASCII letter is replaced by a space
         (drops digits, punctuation, link fragments, ...);
      2. the text is lower-cased and tokenized with `word_tokenize`;
      3. English stop words are removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'TITLE' column of strings. Any index is accepted; rows are
        processed in their current order.

    Returns
    -------
    pandas.DataFrame
        The same `data` object, whose 'TITLE' column now holds a list of tokens
        per row (the column is overwritten in place).
    """
    stops = set(stopwords.words("english"))  # a set gives O(1) membership tests
    non_letters = re.compile("[^a-zA-Z]")    # compiled once instead of once per row

    # Iterate over the column values directly: indexing with data['TITLE'][i] is a
    # label-based lookup and only works when the index happens to be 0..n-1.
    data['TITLE'] = [
        [w for w in word_tokenize(non_letters.sub(" ", title).lower()) if w not in stops]
        for title in data['TITLE']
    ]
    return data

In [None]:
# Replace every headline in 'TITLE' with its cleaned list of tokens.
df = TreatText(data = df)

In [None]:
# Category label of each headline, in the same row order as df.
labels = np.array(df.loc[:, 'CATEGORY'])

In [None]:
# Word2Vec parameters (passed to the Word2Vec constructor as size, min_count,
# window, workers and seed respectively).
dim_vec = 300
min_count = 10
window = 4
num_workers = multiprocessing.cpu_count()
# tf.set_random_seed() returns None, so its result must not be used as the seed:
# keep the integer itself and seed TensorFlow as a separate side effect.
seed = 42
tf.set_random_seed(seed)

In [None]:
# Train the Word2Vec model on the tokenized headlines.
# sg = 0 -> CBOW, sg = 1 -> skip-gram
modelo = Word2Vec(
    sentences = df["TITLE"],
    size = dim_vec,
    window = window,
    min_count = min_count,
    sg = 1,
    workers = num_workers,
    seed = seed,
)

In [None]:
# Number of distinct words kept in the Word2Vec vocabulary.
print("Tamanho do vocabulário do Word2Vec: ", len(modelo.wv.vocab))

In [None]:
# Examples of the semantic relations Word2Vec is able to capture.
# The queries go through `modelo.wv` (the trained word vectors); calling them on
# the model object itself is deprecated.
print(modelo.wv.most_similar('samsung')) # words most similar to 'samsung'
print(modelo.wv.similarity('google', 'microsoft')) # similarity between two words
print(modelo.wv.most_similar(positive = ['show', 'movie'], negative = ['home'], topn = 3)) # similarity given positive and negative examples

In [None]:
def meanVector(model,phrase):
    """Return the mean word vector of a tokenized phrase.

    Parameters
    ----------
    model
        Trained embedding model exposing `model.wv`, which must provide `vocab`
        (membership test), `vector_size` and `wv[word]` lookup.
    phrase : iterable of str
        Tokens of the phrase. They are expected to be tokenized already (as the
        'TITLE' column is after TreatText), so they are not tokenized again.

    Returns
    -------
    numpy.ndarray
        1-D array of length `model.wv.vector_size`: the element-wise mean of the
        vectors of the tokens found in the vocabulary, or all zeros when no
        token is in the vocabulary (including an empty phrase).
    """
    word_vectors = model.wv
    vocab = word_vectors.vocab
    known_words = [word for word in phrase if word in vocab]

    if not known_words:
        # Size the fallback from the model itself (not a notebook global) and
        # return an ndarray so both branches yield the same type.
        return np.zeros(word_vectors.vector_size)

    return np.mean([word_vectors[word] for word in known_words], axis=0)

In [None]:
def createFeatures(base):
    """Build one feature vector per row of `base`.

    Parameters
    ----------
    base : pandas.DataFrame
        Frame whose 'TITLE' column holds a list of tokens per row.

    Returns
    -------
    list
        One `meanVector(modelo, tokens)` result per row, in row order. Uses the
        notebook-level Word2Vec model `modelo`.
    """
    # Iterate over the column values rather than base['TITLE'][i]: the latter is a
    # label-based lookup that breaks whenever the index is not exactly 0..n-1.
    return [meanVector(modelo, tokens) for tokens in base['TITLE']]

In [None]:
# From here on `df` is no longer a DataFrame: it becomes the list of per-headline
# feature vectors returned by createFeatures.
df = createFeatures(base = df)

In [None]:
# Hold out 30% of the first 100,000 samples for testing, then fit a
# linear-kernel SVM on the rest and predict the held-out labels.
X_train, X_test, y_train, y_test = train_test_split(
    df[:100000],
    labels[:100000],
    test_size=0.3,
    random_state=109,
)
clf = svm.SVC(kernel='linear').fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Fraction of held-out headlines whose category was predicted correctly.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))