# Tutorial 3 - Práctica

- Modelos de lenguaje
- Word embedding
- Clasificación de textos (Bag-of-words/TF-IDF vs. Doc2Vec)

# 1. Datasets brutos

Tenemos a nuestra disposición datasets de dos medios:

- **CNN Chile** (https://www.cnnchile.com/): Es un medio de prensa ubicado en Chile. El dueño es *WarnerMedia News & Sports* (un conglomerado de multinacionales de Estados Unidos). 

- **Cadena SER (España)** (https://cadenaser.com/): Es un medio de prensa ubicado en España. El dueño es el *Grupo Prisa* (un grupo de empresas de comunicación de España).


En su forma bruta, ambos datasets toman la forma de archivo CSV con la estructura siguiente:
- ID, country, media_outlet, url, title, body, date

El dataset **CNN Chile** contiene 16.472 noticias.

El dataset **Cadena SER** contiene 14.378 noticias.

# 2. Preparación del dataset CNN Chile

A partir del dataset CNN Chile bruto queremos extraer la categoría de la noticia a partir de la columna URL.

In [None]:
import pandas as pd

DATASET_CSV="datasets/CNNCHILE_RAW.csv"

# Malformed CSV rows are skipped (with a warning) instead of aborting the load.
# pandas >= 1.3 spells this `on_bad_lines`; the older `error_bad_lines` flag
# was removed in pandas 2.0, so try the new keyword first and fall back to the
# old one when the installed pandas does not know it.
try:
    df = pd.read_csv(DATASET_CSV, sep=',', on_bad_lines='warn')
except TypeError:
    df = pd.read_csv(DATASET_CSV, sep=',', error_bad_lines=False)

# Drop the unnamed first column of the CSV (the row ID).
df = df.drop(columns=['Unnamed: 0'])
# Convert the 'date' column to datetime values.
df['date'] = pd.to_datetime(df['date'])

df

In [None]:
# Display the URL stored at index label 0 to inspect its structure.
df['url'][0]

In [None]:
import re

url = df['url'][0]

# Capture groups: (scheme)://(host)/(first path segment)<any char>(next segment).
# The pattern is a raw string so that \w, \- and \. reach the regex engine
# unchanged instead of being parsed as (invalid) string escape sequences.
obj = re.findall(r'(\w+)://([\w\-\.]+)/([\w\-]+).([\w\-]+)', url)
obj

In [None]:
# Create the 'category' column, initialised to an empty string in every row.
df['category'] = ''

In [None]:
# Capture groups: (scheme)://(host)/(first path segment)<any char>(next segment).
# Raw string so the regex escapes are not parsed as string escapes.
url_pattern = r'(\w+)://([\w\-\.]+)/([\w\-]+).([\w\-]+)'

# The category is the third capture group (column 2 of the extracted frame),
# i.e. the first path segment of the URL. The extraction runs on the whole
# column at once instead of one row at a time. URLs that do not match the
# pattern (or are missing) yield NaN, which is replaced by '' so the column
# keeps its "no category" default instead of raising an error.
df['category'] = df['url'].str.extract(url_pattern, expand=True)[2].fillna('')

- ¿Cuáles son las categorías del medio?

In [None]:
from pandasql import sqldf

# List the distinct values of the 'category' column; the SQL statement is run
# against the `df` DataFrame through pandasql.
q="""SELECT DISTINCT category FROM df;"""
result=sqldf(q)
result

- ¿Cuántas noticias hay por cada categoría?

In [None]:
# Count the rows of each category, largest category first.
q="""SELECT category, count(*) FROM df GROUP BY category ORDER BY count(*) DESC;"""
result=sqldf(q)
result

- Guardamos solamente las categorías que tienen más de 2000 noticias

In [None]:
# Keep only the rows whose category is one of the seven listed below;
# the result becomes the working dataset df_CNN.
q="""SELECT * FROM df WHERE category IN ('pais','deportes','tendencias','tecnologias','cultura','economia','mundo');"""
df_CNN=sqldf(q)
df_CNN

In [None]:
# Discard rows whose body is not longer than 5 characters.
q="""SELECT * FROM df_CNN WHERE length(body)>5"""
df_CNN=sqldf(q)
df_CNN

# 3. Modelos de lenguaje: CNN Chile

In [None]:
import spacy

# Load the small Spanish spaCy pipeline used below to tokenise article bodies.
nlp = spacy.load("es_core_news_sm")

In [None]:
df_CNN_deportes = df_CNN[df_CNN['category']=='deportes']

# Keep only real strings: a missing body (None or NaN) cannot be tokenised.
bodies = [text for text in df_CNN_deportes['body'] if isinstance(text, str)]

# One token list per article (despite the name, an entry is a whole article,
# not a single sentence). nlp.pipe processes the texts as a batched stream,
# which replaces the per-row nlp() call and the per-row progress print.
sentences = [[token.text for token in doc] for doc in nlp.pipe(bodies)]

In [None]:
# Number of token lists collected (one per processed article).
len(sentences)

In [None]:
# Tokens of the first processed article.
sentences[0]

In [None]:
from nltk import bigrams, trigrams
from collections import Counter, defaultdict

# Nested mapping (w1, w2) -> {w3: count}; entries that were never seen start at 0.
model = defaultdict(lambda: defaultdict(lambda: 0))

# Count every trigram of every token list, padded on both sides.
for tokens in sentences:
    for first, second, third in trigrams(tokens, pad_right=True, pad_left=True):
        model[(first, second)][third] += 1

# Turn the counts of each two-word context into relative frequencies.
for followers in model.values():
    total_count = float(sum(followers.values()))
    for word in followers:
        followers[word] /= total_count

In [None]:
# Print the next-word probabilities stored for three two-word contexts.
# NOTE: `model` is a defaultdict, so looking up a context that was never seen
# inserts an empty entry for it and prints {}.
print(dict(model["la","persona"]))
print("\n")
print(dict(model["persona","más"]))
print("\n")
print(dict(model["más","importante"]))

In [None]:
import random

# starting words
text = ["durante","el"]
sentence_finished = False

while not sentence_finished:
    # Next-word distribution for the last two generated words. `.get` is used
    # instead of indexing so that an unseen context does not insert an empty
    # entry into the defaultdict.
    candidates = model.get(tuple(text[-2:]))

    if not candidates:
        # No known continuation for this context: stop here. Without this
        # check nothing would ever be appended and the loop would never end.
        break

    # Draw the next word with probability proportional to its stored weight.
    next_word = random.choices(list(candidates), weights=list(candidates.values()))[0]
    text.append(next_word)

    # Two consecutive padding markers (None) mean the end of the text.
    if text[-2:] == [None, None]:
        sentence_finished = True

print (' '.join([t for t in text if t]))

# 4. Word Embedding: CNN Chile

In [None]:
from gensim.models import word2vec

In [None]:
#training word2vec

# 200-dimensional vectors, hierarchical softmax enabled (hs=1).
# gensim >= 4.0 names the dimensionality parameter `vector_size`; gensim 3.x
# names it `size` and rejects the new keyword with a TypeError, so try the
# current name first and fall back to the old one.
try:
    model2 = word2vec.Word2Vec(sentences, vector_size=200, hs=1)
except TypeError:
    model2 = word2vec.Word2Vec(sentences, size=200, hs=1)

In [None]:
# Similarity between 'hombre' and 'mujer' in the model trained on 'deportes'.
model2.wv.similarity('hombre','mujer')

In [None]:
# Similarity between 'hombre' and 'deporte' in the 'deportes' model.
model2.wv.similarity('hombre','deporte')

In [None]:
# Similarity between 'mujer' and 'deporte' in the 'deportes' model.
model2.wv.similarity('mujer','deporte')

In [None]:
# Ten words most similar to 'mujer' in the 'deportes' model.
model2.wv.most_similar(positive=['mujer'],topn=10)

In [None]:
# Ten words most similar to 'hombre' in the 'deportes' model.
model2.wv.most_similar(positive=['hombre'],topn=10)

In [None]:
# Ten words most similar to the combination of 'mujer' and 'deporte'.
model2.wv.most_similar(positive=['mujer','deporte'],topn=10)

In [None]:
# Ten words most similar to the combination of 'hombre' and 'deporte'.
model2.wv.most_similar(positive=['hombre','deporte'],topn=10)

In [None]:
# Analogy-style query: 'hombre' + 'deporte' - 'mujer', ten best matches.
model2.wv.most_similar(positive=['hombre','deporte'], negative=['mujer'],topn=10)

In [None]:
# Analogy-style query: 'mujer' + 'deporte' - 'hombre', ten best matches.
model2.wv.most_similar(positive=['mujer','deporte'], negative=['hombre'],topn=10)

In [None]:
df_CNN_pais = df_CNN[df_CNN['category']=='pais']

# Keep only real strings: a missing body (None or NaN) cannot be tokenised.
bodies_pais = [text for text in df_CNN_pais['body'] if isinstance(text, str)]

# One token list per article of the 'pais' category. nlp.pipe processes the
# texts as a batched stream, which replaces the per-row nlp() call and the
# per-row progress print.
sentences_pais = [[token.text for token in doc] for doc in nlp.pipe(bodies_pais)]

In [None]:
#training word2vec

# 200-dimensional vectors, hierarchical softmax enabled (hs=1).
# gensim >= 4.0 names the dimensionality parameter `vector_size`; gensim 3.x
# names it `size` and rejects the new keyword with a TypeError, so try the
# current name first and fall back to the old one.
try:
    model3 = word2vec.Word2Vec(sentences_pais, vector_size=200, hs=1)
except TypeError:
    model3 = word2vec.Word2Vec(sentences_pais, size=200, hs=1)

In [None]:
# Similarity between 'hombre' and 'mujer' in the model trained on 'pais'.
model3.wv.similarity('hombre','mujer')

In [None]:
# Similarity between 'hombre' and 'deporte' in the 'pais' model.
model3.wv.similarity('hombre','deporte')

In [None]:
# Similarity between 'mujer' and 'deporte' in the 'pais' model.
model3.wv.similarity('mujer','deporte')

In [None]:
# Ten words most similar to 'mujer' in the 'pais' model.
model3.wv.most_similar(positive=['mujer'],topn=10)

In [None]:
# Ten words most similar to 'hombre' in the 'pais' model.
model3.wv.most_similar(positive=['hombre'],topn=10)

In [None]:
# Ten words most similar to 'violencia' in the 'pais' model.
model3.wv.most_similar(positive=['violencia'],topn=10)

In [None]:
# Similarity between 'violencia' and 'policía' in the 'pais' model.
model3.wv.similarity('violencia','policía')

In [None]:
# Similarity between 'violencia' and 'mujer' in the 'pais' model.
model3.wv.similarity('violencia','mujer')

In [None]:
# Similarity between 'violencia' and 'hombre' in the 'pais' model.
model3.wv.similarity('violencia','hombre')

In [None]:
# Similarity between 'violencia' and 'policial' in the 'pais' model.
model3.wv.similarity('violencia','policial')

In [None]:
# Ten words most similar to the combination of 'violencia' and 'policía'.
model3.wv.most_similar(positive=['violencia','policía'],topn=10)

In [None]:
# Twenty words most similar to the combination of 'violencia' and 'manifestante'.
model3.wv.most_similar(positive=['violencia','manifestante'],topn=20)

In [None]:
# Thirty words most similar to the combination of 'violencia' and 'manifestación'.
model3.wv.most_similar(positive=['violencia','manifestación'],topn=30)

In [None]:
# Similarity between two sets of words. The query goes through `model3.wv`,
# like every other query on this model: the method lives on the word vectors,
# and calling it on the model object itself no longer works in gensim 4.
model3.wv.n_similarity(['violencia','manifestación'],['manifestantes'])

In [None]:
# Similarity between two sets of words. The query goes through `model3.wv`,
# like every other query on this model: the method lives on the word vectors,
# and calling it on the model object itself no longer works in gensim 4.
model3.wv.n_similarity(['violencia','manifestación'],['policía'])

- Para comparar con un modelo Word2Vec genérico: https://github.com/dccuchile/spanish-word-embeddings

In [None]:
from gensim.models.keyedvectors import KeyedVectors

# Pre-trained vectors in word2vec text format, read from the working directory.
wordvectors_file_vec = 'fasttext-sbwc.3.6.e20.vec'
# Upper bound on how many vectors are read from the file (passed as `limit`).
cantidad = 20000
wordvectors = KeyedVectors.load_word2vec_format(wordvectors_file_vec, limit=cantidad)

In [None]:
# `wordvectors` is already a KeyedVectors object, so it is queried directly:
# its `.wv` attribute was only a deprecated alias of the object itself and no
# longer exists in gensim 4.
wordvectors.similarity('hombre','mujer')

In [None]:
# `wordvectors` is already a KeyedVectors object, so it is queried directly:
# its `.wv` attribute was only a deprecated alias of the object itself and no
# longer exists in gensim 4.
wordvectors.similarity('violencia','policía')

In [None]:
# Similarity of 'mujer' and of 'hombre' to 'poder' in the generic vectors.
# `wordvectors` is already a KeyedVectors object, so it is queried directly:
# its `.wv` attribute was only a deprecated alias of the object itself and no
# longer exists in gensim 4.
print("GENERAL:")
print(wordvectors.similarity('mujer','poder'))
print(wordvectors.similarity('hombre','poder'))

# 5. Clasificación de textos (bag-of-words vs. Doc2Vec)

## 5.1 Bag-of-words: Count y TF-IDF

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [None]:
import spacy
import string
from spacy.lang.es.stop_words import STOP_WORDS
from spacy.lang.es import Spanish

# Punctuation characters to discard. Stored as a set so that `in` is an
# exact-token test (against a plain string it would be a substring test).
punctuations = set(string.punctuation)

# Spanish stop words shipped with spaCy. This was previously an empty string,
# which meant that no stop word was ever removed.
stop_words = STOP_WORDS

# Spanish language object used to tokenise the text.
# NOTE(review): Spanish() builds a blank pipeline (no trained tagger, parser
# or NER is loaded here); whether `lemma_` is filled in depends on the
# installed spaCy version, hence the fallback in spacy_tokenizer.
parser = Spanish()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Tokenise *sentence* and return its normalised, filtered tokens.

    Each token is replaced by its lowercased, stripped lemma. When the lemma
    is empty or is the "-PRON-" placeholder, the lowercased token text is used
    instead. Empty strings, stop words and single punctuation characters are
    dropped.

    Args:
        sentence: The text to tokenise.

    Returns:
        A list of strings, in the order they occur in *sentence*.
    """
    tokens = []
    for word in parser(sentence):
        lemma = word.lemma_
        if not lemma or lemma == "-PRON-":
            # No usable lemma: fall back to the surface form so the token is
            # not lost.
            normalised = word.lower_.strip()
        else:
            normalised = lemma.lower().strip()

        # Skip whitespace-only tokens, stop words and punctuation.
        if normalised and normalised not in stop_words and normalised not in punctuations:
            tokens.append(normalised)

    return tokens

In [None]:
# Bag-of-words features: unigram counts, tokenised with spacy_tokenizer.
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))
bow_vector

In [None]:
# TF-IDF features, tokenised with the same spacy_tokenizer.
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [None]:
from sklearn.model_selection import train_test_split

X = df_CNN['body'] # the features we want to analyze
ylabels = df_CNN['category'] # the labels, or answers, we want to test against

# 70 % train / 30 % test. The seed is fixed so that re-running the notebook
# gives the same partition, which keeps the reported scores reproducible and
# comparable between classifiers.
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3, random_state=42)


In [None]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression

# One independent logistic-regression classifier per feature representation.
modelLR1, modelLR2 = LogisticRegression(), LogisticRegression()

# Pipeline 1: raw token counts (bag of words) -> logistic regression.
pipe1 = Pipeline([
    ('preprocessing', bow_vector),
    ('regression-ML', modelLR1),
])

# Pipeline 2: TF-IDF weighted bag of words -> logistic regression.
pipe2 = Pipeline([
    ('preprocessing', tfidf_vector),
    ('regression-ML', modelLR2),
])

# Fit both pipelines on the training split, announcing each one when it is done.
for done_message, pipeline in (("modelo #1", pipe1), ("modelo #2", pipe2)):
    pipeline.fit(X_train,y_train)
    print(done_message)

In [None]:
from sklearn import metrics
# Predicting with a test dataset
# Predicted categories for the test split: pipe1 = counts, pipe2 = TF-IDF.
predicted1 = pipe1.predict(X_test)
predicted2 = pipe2.predict(X_test)

In [None]:
# Evaluate the bag-of-words (count) classifier on the test split.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Confusion matrix. The result gets its own name: assigning it to
# `confusion_matrix` would replace the imported function with an array.
cm = confusion_matrix(y_test, predicted1)
print(cm)

# Per-class report of the same predictions.
print(classification_report(y_test, predicted1))

In [None]:
# Evaluate the TF-IDF classifier on the test split.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Confusion matrix. The result gets its own name: assigning it to
# `confusion_matrix` would replace the imported function with an array.
cm = confusion_matrix(y_test, predicted2)
print(cm)

# Per-class report of the same predictions.
print(classification_report(y_test, predicted2))

## 5.2 Doc2Vec (extensión de Word2Vec para representar documentos)

Artículo de Doc2Vec (2014): https://cs.stanford.edu/~quocle/paragraph_vector.pdf

In [None]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import utils
import csv
from tqdm import tqdm
import multiprocessing

import nltk
from nltk.corpus import stopwords

In [None]:
def tokenize_text(text, language="spanish"):
    """Split *text* into lowercased word tokens of at least two characters.

    The text is first split into sentences and each sentence into words with
    NLTK. The tokeniser language is passed explicitly because NLTK defaults to
    its English models, while the articles handled here are in Spanish.

    Args:
        text: The document to tokenise.
        language: Name of the NLTK tokeniser language to use.

    Returns:
        A list of lowercased tokens in document order; tokens shorter than two
        characters are dropped.
    """
    return [
        word.lower()
        for sent in nltk.sent_tokenize(text, language=language)
        for word in nltk.word_tokenize(sent, language=language)
        if len(word) >= 2
    ]

In [None]:
from sklearn.model_selection import train_test_split

X = df_CNN['body'] # the features we want to analyze
ylabels = df_CNN['category'] # the labels, or answers, we want to test against

# 70 % train / 30 % test. The seed is fixed so that re-running the notebook
# gives the same partition, which keeps the reported scores reproducible and
# comparable between classifiers.
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3, random_state=42)

In [None]:
# Size of the test split.
X_test.shape
#y_train[9]

In [None]:
# Integer tag assigned to each category name.
tags_index = {'cultura': 1 , 'deportes': 2, 'economia': 3, 'mundo': 4, 'pais': 5, 'tecnologias': 6, 'tendencias':7}

In [None]:
# Integer tag assigned to each category name.
tags_index = {'cultura': 1 , 'deportes': 2, 'economia': 3, 'mundo': 4, 'pais': 5, 'tecnologias': 6, 'tendencias':7}


def _tag_documents(texts, labels):
    """Return one TaggedDocument per (text, label) pair.

    Each text is tokenised with tokenize_text and tagged with the integer code
    of its category. A label missing from tags_index raises KeyError instead
    of silently producing a document tagged None.
    """
    return [
        TaggedDocument(words=tokenize_text(text), tags=[tags_index[label]])
        for text, label in zip(texts, labels)
    ]


# Iterate over the splits themselves rather than over hard-coded row counts,
# so the cell keeps working when the dataset size or the split ratio changes.
train_documents = _tag_documents(X_train, y_train)
test_documents = _tag_documents(X_test, y_test)

In [None]:
# Use one worker thread per available CPU core.
cores = multiprocessing.cpu_count()

# NOTE(review): dm=1 selects gensim's distributed-memory training mode, while
# the variable is named "dbow" - confirm which of the two is intended.
model_dbow = Doc2Vec(
    dm=1,
    vector_size=200,
    negative=5,
    hs=0,
    min_count=2,
    sample = 0,
    workers=cores,
    alpha=0.025,
    min_alpha=0.001,
)

# Build the vocabulary from the training documents (tqdm shows the progress).
model_dbow.build_vocab(list(tqdm(train_documents)))

# Shuffle the training documents before the training passes.
train_documents  = utils.shuffle(train_documents)

# 30 passes over the shuffled training documents.
model_dbow.train(
    train_documents,
    total_examples=len(train_documents),
    epochs=30,
)

def vector_for_learning(model, input_docs):
    """Infer a vector for every tagged document and pair it with its label.

    Args:
        model: Object exposing ``infer_vector(words, epochs=...)``.
        input_docs: Iterable of documents with ``words`` and ``tags``
            attributes; the first tag is used as the label.

    Returns:
        A ``(targets, feature_vectors)`` pair of equally long tuples. Both are
        empty when *input_docs* is empty.
    """
    # `epochs` is the current name of the number of inference passes; the
    # older `steps` keyword was removed in gensim 4.
    pairs = [(doc.tags[0], model.infer_vector(doc.words, epochs=20)) for doc in input_docs]
    if not pairs:
        # zip(*[]) yields nothing and could not be unpacked into two names.
        return (), ()
    targets, feature_vectors = zip(*pairs)
    return targets, feature_vectors

# Save the trained Doc2Vec model to a file in the working directory.
model_dbow.save('./Doc2Vec_model.d2v')

In [None]:
# Replace the text splits with Doc2Vec features: for each set,
# vector_for_learning returns the labels and the inferred document vectors.
# NOTE: this rebinds X_train / X_test / y_train / y_test.
y_train, X_train = vector_for_learning(model_dbow, train_documents)
y_test, X_test = vector_for_learning(model_dbow, test_documents)

# Logistic regression on the document vectors (single job, C=1e5).
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)

In [None]:
# Predicted labels for the Doc2Vec test vectors.
predicted3 = logreg.predict(X_test)

In [None]:
# Evaluate the Doc2Vec + logistic-regression classifier on the test split.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Confusion matrix. The result gets its own name: assigning it to
# `confusion_matrix` would replace the imported function with an array.
cm = confusion_matrix(y_test, predicted3)
print(cm)

# Per-class report of the same predictions.
print(classification_report(y_test, predicted3))

# 6. Ideas para practicar

- #1: Preparar el dataset Cadena SER
- #2: Comparar los embeddings: CNN Chile vs. Cadena SER vs. General (--> ¿Cómo se podría sistematizar?)
- #3: Visualizar la evolución de los embeddings en el tiempo
- #4: Optimizar el modelo de clasificación por temática (otros algoritmos, mejores preprocesamientos, etc.)
- ...