# Análisis del Sentimiento de las noticias económicas en Chile 2020

In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import unidecode

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

En primer lugar, se cargan los datos.

In [3]:
# Load the labelled texts from the published CSV.
data = pd.read_csv('https://raw.githubusercontent.com/percepcioneseconomicas/publicaciones/main/sentiment_data/sentiment_data.csv')
# Recode label 2.0 as 0 with a single .loc assignment. The previous chained
# form (data['response'][mask] = 0) assigns through an intermediate object,
# which triggers SettingWithCopyWarning and does not modify `data` at all
# when pandas Copy-on-Write is enabled.
data.loc[data['response'] == 2.0, 'response'] = 0
# Rows with a missing value in any column are dropped before the two
# columns of interest are selected.
data = data.dropna().reset_index(drop=True)
data = data[['response', 'texto']]
data.head()

Unnamed: 0,response,texto
0,1.0,"En cuanto a las colocaciones, estas alcanzaron..."
1,0.0,En el año 2009 el mundo enfrentó una crisis fi...
2,1.0,Es para mí motivo de gran satisfacción compart...
3,1.0,Si tuviera que resumir la gestión 2012 de Cruz...
4,1.0,El crecimiento anteriormente mencionado redund...


In [None]:
print('Dimensiones de los datos: \n', data.shape, '\n')
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
data.info()

In [None]:
# Share of rows in each category of the target variable, rounded to 2 decimals
data['response'].value_counts(normalize=True).round(2)

In [None]:
# Word count of every text (whitespace-separated tokens). Built directly as
# a list instead of using a comprehension only for its append() side effect.
x = [len(e.split()) for e in data['texto']]
print('Número de palabras de la noticia más larga: \n',  max(x), '\n')
print('Número de palabras de la noticia más corta: \n',  min(x), '\n')

In [None]:
# Random text: draw k from [0, number of texts) and print the text labelled k
k = np.random.randint(0, len(data['texto']))
print('Noticia %d:' % k, '\n', data['texto'][k])

# Preprocesamiento de los datos

In [None]:
# Stopword list, read from the local headerless file 'spanish.txt'
# into a one-column frame and then flattened to a plain list.
sw = pd.read_csv('spanish.txt', names=['stopwords'], header=None)
stopwords = sw['stopwords'].to_list()

In [None]:
# Text-cleaning function applied to every document.
def preprocess(s):
    """Normalise one raw text into a space-joined string of tokens.

    Steps, in order: lowercase, remove digit runs, remove the punctuation
    characters listed in the pattern below, tokenise with NLTK, drop tokens
    present in the module-level ``stopwords`` list, and transliterate the
    remaining tokens to ASCII with ``unidecode``.

    Args:
        s: Raw text as a ``str``.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    s = s.lower()
    s = re.sub(r'[0-9]+', '', s)
    # Same character set as the previous non-raw pattern, written as a raw
    # string with explicit escapes: the hyphen is a literal (it used to be
    # covered by the accidental range ",-."), "[" and "]" are escaped, and
    # the backslash itself is still NOT part of the set.
    s = re.sub(r'[!"#$%&()*+,\-./:;<=>¿?@\[\]^_`{|}~\t—’‘“”]', '', s)
    tokens = nltk.tokenize.word_tokenize(s)
    # Set membership is O(1) per token; testing against the list scanned
    # every stopword for every token.
    stopword_set = set(stopwords)
    # Stopwords are matched before transliteration, as in the original order.
    tokens = [unidecode.unidecode(t) for t in tokens if t not in stopword_set]
    return ' '.join(tokens)

In [None]:
# Run the cleaning function over every text, keeping the original order.
pdata = list(map(preprocess, data['texto']))

In [None]:
# Print the preprocessed text at position k
print('Noticia %d :' % k, '\n', pdata[k])

In [None]:
# Word counts per text (CountVectorizer defaults, at most 3000 features)
vect = CountVectorizer(max_features=3000)
vdat = vect.fit_transform(pdata)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
data1 = pd.DataFrame(vdat.toarray(), columns=vect.get_feature_names_out())
data1.head(1)

In [None]:
# Word and bigram counts per text (ngram_range=(1, 2), at most 3000 features)
vect = CountVectorizer(max_features=3000, ngram_range=(1, 2))
vdat = vect.fit_transform(pdata)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
data2 = pd.DataFrame(vdat.toarray(), columns=vect.get_feature_names_out())
data2.head(1)

In [None]:
# Relative word frequencies per text: each count divided by its row total.
data1sum = data1.sum(axis=1)
# A text containing none of the retained words has a row total of 0, and
# 0/0 would fill its whole row with NaN (which LogisticRegression rejects).
# Dividing such rows by 1 keeps them at all zeros; other rows are unchanged.
data3 = data1.divide(data1sum.replace(0, 1), axis=0)
data3.head(1)

In [None]:
# Relative word and bigram frequencies per text: counts divided by row total.
data2sum = data2.sum(axis=1)
# A text containing none of the retained terms has a row total of 0, and
# 0/0 would fill its whole row with NaN (which LogisticRegression rejects).
# Dividing such rows by 1 keeps them at all zeros; other rows are unchanged.
data4 = data2.divide(data2sum.replace(0, 1), axis=0)
data4.head(1)

In [None]:
# TF-IDF of words per text (TfidfVectorizer defaults, at most 3000 features)
vect = TfidfVectorizer(max_features=3000)
vdat = vect.fit_transform(pdata)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
data5 = pd.DataFrame(vdat.toarray(), columns=vect.get_feature_names_out())
data5.head(1)

In [None]:
# TF-IDF of words and bigrams per text (ngram_range=(1, 2), at most 3000
# features). The previous comment was copied from the word-only cell.
vect = TfidfVectorizer(max_features=3000, ngram_range=(1, 2))
vdat = vect.fit_transform(pdata)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
data6 = pd.DataFrame(vdat.toarray(), columns=vect.get_feature_names_out())
data6.head(1)

# Separación de la muestra

In [None]:
# Define the target variable y and the feature matrix X used to draw the samples
y = data['response']
X = data1

In [None]:
# First split: 35% of the rows are held out, stratified on the label.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.35, stratify=y, random_state=123)
# Second split: 20% of the held-out rows become the test set, the rest the
# validation set (again stratified, same seed).
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.2, stratify=y_holdout, random_state=123)

In [None]:
# Absolute size of each sample, then its share of the total in percent.
samples = np.array([len(y_train), len(y_val), len(y_test)])
print(samples)
print((samples / samples.sum() * 100).round())


In [None]:
# Row labels of each sample, kept so other feature sets can be sliced identically.
i_train, i_val, i_test = y_train.index, y_val.index, y_test.index

# Funciones y dataframes útiles

In [None]:
# Helper that collects the evaluation metrics of one model.
def get_metrics(modelo, y, y_pred, y_pred_proba, cv_score=None):
    """Return a one-row DataFrame with the evaluation metrics of a model.

    Args:
        modelo: Label used as the index of the returned row.
        y: True labels.
        y_pred: Predicted labels, compared with ``y`` for accuracy and F1.
        y_pred_proba: Predicted scores passed to ``roc_auc_score``.
        cv_score: Value for the 'CV-Score' column. When omitted, it is read
            from ``best_score_`` of the module-level ``searcher``, which is
            what this function always did before the parameter existed.

    Returns:
        DataFrame indexed by ``modelo`` with the columns 'CV-Score',
        'Accuracy', 'AUC' and 'F1 Score'.
    """
    if cv_score is None:
        # Backward-compatible default: rely on the global search object.
        cv_score = searcher.best_score_
    return pd.DataFrame(
        {
            'CV-Score': cv_score,
            'Accuracy': accuracy_score(y, y_pred),
            'AUC': roc_auc_score(y, y_pred_proba),
            'F1 Score': f1_score(y, y_pred),
        },
        index=[modelo],
    )

In [None]:
def gen_X_train(X):
    """Slice ``X`` into its train, validation and test rows.

    The rows are selected by label using the module-level ``i_train``,
    ``i_val`` and ``i_test`` indexes.

    Args:
        X: Feature DataFrame whose index contains those labels.

    Returns:
        Tuple ``(X_train, X_val, X_test)``.
    """
    return tuple(X.loc[labels] for labels in (i_train, i_val, i_test))

In [None]:
def save_preds(pred_val, pred_test, modelo, y_pred_proba_val, y_pred_proba_test):
    """Store one model's predicted probabilities as a column named ``modelo``.

    Both frames are modified in place; nothing is returned.

    Args:
        pred_val: DataFrame receiving the validation predictions.
        pred_test: DataFrame receiving the test predictions.
        modelo: Column name to write in both frames.
        y_pred_proba_val: Values for the validation column.
        y_pred_proba_test: Values for the test column.
    """
    targets = (
        (pred_val, y_pred_proba_val),
        (pred_test, y_pred_proba_test),
    )
    for frame, probabilities in targets:
        frame[modelo] = probabilities

In [None]:
# Empty DataFrames that will accumulate the metrics and the tuned parameters
results, parametros = pd.DataFrame(), pd.DataFrame()

In [None]:
# Mapping from each feature set's name to its DataFrame
datos = {
    'data1': data1,
    'data2': data2,
    'data3': data3,
    'data4': data4,
    'data5': data5,
    'data6': data6,
}
keys = list(datos)
values = list(datos.values())

# Modelos primera ronda

In [None]:
# First round: tune one logistic regression per feature set with a grid
# search scored on F1.
model = LogisticRegression(random_state=123)
# NOTE(review): the estimator keeps its default solver. With scikit-learn's
# 'lbfgs' default the 'l1' candidates cannot be fitted, so effectively only
# 'l2' can be selected — confirm whether an l1-capable solver was intended.
parameters = {'C': np.logspace(-4, 4, 20),
              'penalty': ['l1', 'l2']}
searcher = GridSearchCV(estimator=model,
                        param_grid=parameters,
                        scoring='f1',
                        n_jobs=-1,
                        verbose=1)

for key, value in datos.items():
    X_train, X_val, X_test = gen_X_train(value)

    searcher.fit(X_train, y_train)

    print(key, "Best CV params", searcher.best_params_)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # replacement. The still-empty accumulator is skipped rather than
    # concatenated, since concatenating empty frames is deprecated.
    param_row = pd.DataFrame(searcher.best_params_, index=[key])
    parametros = param_row if parametros.empty else pd.concat([parametros, param_row])

    best_model = searcher.best_estimator_
    y_pred_train = best_model.predict(X_train)
    y_pred_proba_train = best_model.predict_proba(X_train)[:, 1]
    # Accuracy, AUC and F1 below are computed on the training rows; the
    # 'CV-Score' column is the cross-validated figure.
    metric_row = get_metrics(key, y_train, y_pred_train, y_pred_proba_train)
    results = metric_row if results.empty else pd.concat([results, metric_row])

# Resultados primera ronda

In [None]:
# For every metric column, show the feature set (row label) with the highest
# value, then display the full metrics table.
print('Best models:')
print(results.idxmax(), '\n')
results

In [None]:
# Save the first-round tuned parameters to 'parametros_1.csv' and display them
parametros.to_csv('parametros_1.csv')
parametros

# Ajuste de los modelos a la muestra completa

In [None]:
# Empty DataFrames that will hold the validation and test predictions
pred_val, pred_test = pd.DataFrame(), pd.DataFrame()

In [None]:
# Refit one logistic regression per feature set with its tuned parameters and
# store the predicted probability of class 1 for the validation and test rows.
for key, value in datos.items():
    X_train, X_val, X_test = gen_X_train(value)

    # Look the tuned values up by column label. The previous positional
    # form (parametros.loc[key][0]) relies on integer keys falling back to
    # positions on a string-labelled Series, which pandas has deprecated.
    tuned_C = parametros.loc[key, 'C']
    tuned_penalty = parametros.loc[key, 'penalty']
    model = LogisticRegression(random_state=123,
                               C=tuned_C,
                               penalty=tuned_penalty)

    model.fit(X_train, y_train)

    y_pred_proba_val = model.predict_proba(X_val)[:, 1]
    y_pred_proba_test = model.predict_proba(X_test)[:, 1]
    save_preds(pred_val, pred_test, key, y_pred_proba_val, y_pred_proba_test)

In [None]:
# Label the prediction rows with the validation / test row labels
pred_val.index = i_val
pred_test.index = i_test

# Segunda ronda: Stacking

In [None]:
# The first-round predicted probabilities become the features of the stacking model
X_val = pred_val
X_test = pred_test

In [None]:
# Fresh empty DataFrames for the second-round parameters, metrics and predictions
parametros, results, pred_test = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

In [None]:
# Second round: tune a logistic regression on the validation rows and
# evaluate it on the test rows.
modelo = 'LR'
model = LogisticRegression(random_state=123)
# NOTE(review): the estimator keeps its default solver. With scikit-learn's
# 'lbfgs' default the 'l1' candidates cannot be fitted, so effectively only
# 'l2' can be selected — confirm whether an l1-capable solver was intended.
parameters = {'C': np.logspace(-4, 4, 20),
              'penalty': ['l1', 'l2']}
searcher = GridSearchCV(estimator=model,
                        param_grid=parameters,
                        scoring='f1',
                        n_jobs=-1,
                        verbose=1)
searcher.fit(X_val, y_val)
print("Best CV params", searcher.best_params_)
# Index the row by this model's name. The original used `key`, a variable
# left over from an earlier loop, so the row carried an unrelated label.
# DataFrame.append was removed in pandas 2.0; pd.concat replaces it, and
# the still-empty accumulator is skipped rather than concatenated.
param_row = pd.DataFrame(searcher.best_params_, index=[modelo])
parametros = param_row if parametros.empty else pd.concat([parametros, param_row])

best_model = searcher.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
pred_test[modelo] = y_pred_proba

metric_row = get_metrics(modelo, y_test, y_pred, y_pred_proba)
results = metric_row if results.empty else pd.concat([results, metric_row])
results.round(4)

In [None]:
# Save the second-round tuned parameters to 'parametros_2.csv' and display them
parametros.to_csv('parametros_2.csv')
parametros

# Ajuste del mejor modelo a la muestra completa

In [None]:
# Refit the stacking model with the first stored parameter row and predict
# the probability of class 1 for the test rows.
modelo = 'LR'
stack_C = parametros['C'].iat[0]
stack_penalty = parametros['penalty'].iat[0]
model = LogisticRegression(C=stack_C, penalty=stack_penalty, random_state=123)
model.fit(X_val, y_val)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Optimizar threshold

In [None]:
# Helper that turns probabilities into labels
def to_labels(y_pred_proba, threshold):
    """Map scores to integer labels: 1 where the score is >= ``threshold``, else 0.

    Args:
        y_pred_proba: Array-like supporting ``>=`` and ``.astype``.
        threshold: Cut-off compared against every score.

    Returns:
        Integer array with the same shape as ``y_pred_proba``.
    """
    at_or_above = y_pred_proba >= threshold
    return at_or_above.astype('int')

In [None]:
# Candidate thresholds to test: 300 evenly spaced values from 0 to 1
thresholds = np.linspace(0, 1, 300)

In [None]:
# Threshold optimisation based on the F1 score: score every candidate and
# keep the index of the best one (np.argmax returns the first maximum).
# NOTE(review): the threshold is selected against y_test, so any score
# reported afterwards on y_test with this threshold is optimistic — consider
# selecting it on held-out validation predictions instead.
scores = [f1_score(y_test, to_labels(y_pred_proba, t)) for t in thresholds]
ix = np.argmax(scores)
print(modelo, 'Threshold=%.4f, F-Score=%.4f' % (thresholds[ix], scores[ix]))

In [None]:
# Final labels at the selected threshold, as a one-column frame that shares
# the row labels of y_test.
y_pred = pd.DataFrame(
    {'y_pred': to_labels(y_pred_proba, thresholds[ix])},
    index=y_test.index,
)

In [None]:
# '%.4f' already formats to four decimals. The extra .round(4) was redundant
# and raises AttributeError whenever accuracy_score returns a built-in
# float, which has no .round() method.
print('Accuracy=%.4f' % accuracy_score(y_test, y_pred))

In [None]:
# Side-by-side table of true and predicted labels, with a flag that is True
# where the two agree.
pred = pd.concat([y_test, y_pred], axis=1)
pred['Accuracy'] = pred['response'].eq(pred['y_pred'])
pred