# Estimación de categorías - Página12

In [1]:
# Install gdown, which the notebook uses to download the dataset files from Google Drive
!pip install gdown --quiet

In [2]:
# Importo bibliotecas
import gdown
import sys
import warnings
import pprint
import gc
import re
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# Download the punkt tokenizer models required by nltk's word_tokenize
nltk.download('punkt')

# Pretty printer used to display prediction arrays further down
pp = pprint.PrettyPrinter(indent=4, compact=True)

# Free memory
gc.collect()

# Hide warnings
# NOTE(review): this silences every warning, including library deprecation warnings
warnings.filterwarnings('ignore')

# Raise the interpreter's recursion limit
sys.setrecursionlimit(30000)

# Plot theme
sns.set_theme(style="whitegrid")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# Download one serialized dataset per news section (Sociedad, Economia, El Mundo) from Google Drive.
# Each call writes the file to the working directory under the given name.
gdown.download('https://drive.google.com/uc?id=1qLM1mV45A9-hUI9dWb4SwtqCmfmAA35R', 'Sociedad.sav', quiet=False)
gdown.download('https://drive.google.com/uc?id=19KLD-8nRba8XWwcw6YjChwB4KSZ10g6v', 'Economia.sav', quiet=False)
gdown.download('https://drive.google.com/uc?id=1juCpm4wAXHFXH3IBVQ3FrkfQ7I8r9BLn', 'ElMundo.sav', quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1qLM1mV45A9-hUI9dWb4SwtqCmfmAA35R
To: /content/Sociedad.sav
824MB [00:05, 143MB/s]
Downloading...
From: https://drive.google.com/uc?id=19KLD-8nRba8XWwcw6YjChwB4KSZ10g6v
To: /content/Economia.sav
839MB [00:23, 36.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1juCpm4wAXHFXH3IBVQ3FrkfQ7I8r9BLn
To: /content/ElMundo.sav
814MB [00:16, 50.9MB/s]


'ElMundo.sav'

In [4]:
# Load the three sections (loading all three takes roughly 5 minutes)
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load files that come from a trusted source.
society_section = joblib.load('Sociedad.sav')
economy_section = joblib.load('Economia.sav')
world_section = joblib.load('ElMundo.sav')

In [5]:
# Stack the three sections row-wise into a single frame.
# Row labels are kept as they were in each section (no ignore_index), so they may repeat.
news = pd.concat([society_section, economy_section, world_section], axis=0)

In [6]:
# Drop the per-section temporaries now that they are combined in `news`
del society_section
del economy_section
del world_section

# Free memory
gc.collect()

6804536

In [7]:
# Preview the first rows of the combined dataset
news.head()

Unnamed: 0,url,paragraph,date,topic
0,https://www.pagina12.com.ar//370534-fue-a-cort...,"""Sin embargo, para sorpresa de la demandante (...",24 de septiembre de 2021 - 12:24,sociedad
1,https://www.pagina12.com.ar//370530-lanzaron-u...,“El Plan fue presentado en varias jurisdiccion...,24 de septiembre de 2021 - 12:04,sociedad
2,https://www.pagina12.com.ar//370527-orden-de-a...,El FBI pidió que cualquier persona que tenga i...,24 de septiembre de 2021 - 11:56,sociedad
3,https://www.pagina12.com.ar//370567-julio-y-gi...,"""Esta muestra es un tributo con gran cariño de...",24 de septiembre de 2021 - 15:20,sociedad
4,https://www.pagina12.com.ar//370349-yemen-un-g...,La investigación se abocó al primer descenso d...,24 de septiembre de 2021 - 01:34,sociedad


In [8]:
# The `date` column holds Spanish text such as "24 de septiembre de 2021 - 12:24".
# Sorting those strings directly orders them lexicographically ("1 de abril ..." <
# "10 de abril ..." < "2 de abril ..."), not chronologically, so the dates are parsed
# into timestamps first and the rows are sorted on the parsed value.
SPANISH_MONTHS = {
    'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6,
    'julio': 7, 'agosto': 8, 'septiembre': 9, 'setiembre': 9, 'octubre': 10,
    'noviembre': 11, 'diciembre': 12,
}
SPANISH_DATE_PATTERN = re.compile(r'(\d{1,2}) de (\w+) de (\d{4})(?:\s*-\s*(\d{1,2}):(\d{2}))?')


def parse_spanish_date(raw_date):
    """
    Parse a date written as "<day> de <month name> de <year> - <HH>:<MM>".

    The time part is optional and defaults to 00:00.

    :param raw_date: Date text (any value is converted with str()).
    :return A pandas Timestamp, or pd.NaT when the text cannot be parsed.
    """
    match = SPANISH_DATE_PATTERN.search(str(raw_date))
    if match is None:
        return pd.NaT
    day, month_name, year, hour, minute = match.groups()
    month = SPANISH_MONTHS.get(month_name.lower())
    if month is None:
        return pd.NaT
    try:
        return pd.Timestamp(year=int(year), month=month, day=int(day),
                            hour=int(hour or 0), minute=int(minute or 0))
    except ValueError:
        # Out-of-range day/hour/minute values
        return pd.NaT


# Stable sort on the parsed timestamp; rows whose date cannot be parsed are placed last.
# The helper column is dropped so the frame keeps its original columns.
news = (
    news.assign(_parsed_date=pd.to_datetime(news['date'].map(parse_spanish_date)))
        .sort_values(by='_parsed_date', kind='mergesort')
        .drop(columns='_parsed_date')
)
news.head()

Unnamed: 0,url,paragraph,date,topic
938,https://www.pagina12.com.ar//184479-el-comedia...,Ucrania irá a segunda vuelta. El comediante uc...,1 de abril de 2019 - 01:28,el-mundo
943,https://www.pagina12.com.ar//184346-la-ultrade...,PáginaI12 en FranciaDesde ParísYa no son un gr...,1 de abril de 2019 - 01:40,el-mundo
940,https://www.pagina12.com.ar//184481-hacer-memo...,Cientos de brasileños salieron ayer a las call...,1 de abril de 2019 - 10:42,el-mundo
942,https://www.pagina12.com.ar//184483-una-presid...,PáginaI12 en PerúDesde LimaLlegó de improviso ...,1 de abril de 2019 - 11:30,el-mundo
936,https://www.pagina12.com.ar//184586-tragedia-e...,Al menos veinte personas murieron en una termi...,1 de abril de 2019 - 14:47,el-mundo


In [9]:
# Spanish stop-word lists, with and without accents (presumably one word per row)
stopwords_es = pd.read_csv('https://drive.google.com/uc?export=download&id=1prdR9zCvSQnEIZoLf5B0TiS1tUhMLClC', header = None)
stopwords_es_sin_acentos = pd.read_csv('https://drive.google.com/uc?export=download&id=1QDLZXPDnJ1XbHJRukDgqxi5P0AKtssd_', header = None)

# Flatten both lists into a set of lower-cased words.
# `word in stopwords` must test the words themselves: on a DataFrame the `in` operator
# checks the column labels instead, so nothing would ever be recognised as a stop word.
stopwords = set(
    pd.concat([stopwords_es, stopwords_es_sin_acentos])
      .iloc[:, 0]
      .dropna()
      .astype(str)
      .str.strip()
      .str.lower()
)

In [10]:
def remove_stop_words(text):
  """
    Filter stop words out of a list of tokens.

    Each token is lower-cased and looked up in the module-level ``stopwords``
    with the ``in`` operator; tokens that are found are dropped.

    NOTE(review): ``stopwords`` has to be a container of words (set, list, ...).
    ``in`` applied to a pandas DataFrame checks column labels, not cell values.

    Attributes
    ----------
    text: list
      list of words (tokens) to filter

    Returns
    -------
    list
      the tokens that are not stop words, in their original order and casing
  """
  kept_tokens = []
  for token in text:
    if token.lower() in stopwords:
      continue
    kept_tokens.append(token)
  return kept_tokens

In [11]:
# Spanish stemmer shared by every stem_words call; created on first use.
_SPANISH_STEMMER = None


def stem_words(tokens):
    """
    Apply the Spanish Snowball stemmer to a sequence of tokens.

    The stemmer is created once and reused, because this function runs once per
    document when it is used as part of a vectorizer's tokenizer.

    :param tokens: A sequence of tokens.
    :return A list with the stem of each token, in the same order.
    """
    global _SPANISH_STEMMER
    if _SPANISH_STEMMER is None:
        _SPANISH_STEMMER = SnowballStemmer("spanish")
    return [_SPANISH_STEMMER.stem(word) for word in tokens]

In [12]:
def clean_short_words(text):
  """
    Drop every token whose length is 1 or less.

    Attributes
    ----------
    text: list
      tokens to filter

    Returns
    -------
    list
      the tokens with more than one character, in their original order
  """
  long_enough = filter(lambda word: len(word) > 1, text)
  return list(long_enough)

In [13]:
def preprocess_text(text):
  """
    Turn a document into a list of cleaned, stemmed tokens.

    Steps, in order: Spanish word tokenization, stop-word removal,
    removal of one-character tokens, stemming.

    Attributes
    ----------
    text: str
      document to process

    Returns
    -------
    list
      the stemmed tokens that survived the filters
  """
  tokens = word_tokenize(text, language='spanish')
  return stem_words(clean_short_words(remove_stop_words(tokens)))

In [14]:
def create_train_test_directories(X_train, X_test, y_train, y_test):
  """
    Write the train and test sets to one tab-separated CSV file per topic.

    Features and labels are joined column-wise, then the rows of each topic
    ('sociedad', 'el-mundo', 'economia') are written to
    ``<Topic>_Training.csv`` and ``<Topic>_Testing.csv`` in the working directory.

    Attributes
    ----------
    X_train, X_test: pd.DataFrame
      feature columns of each split
    y_train, y_test: pd.Series
      labels of each split; the joined frame must expose them as ``topic``
  """
  train_dataset = pd.concat([X_train, y_train], axis=1)
  test_dataset = pd.concat([X_test, y_test], axis=1)

  # (topic value, training file, testing file)
  outputs_by_topic = (
      ('sociedad', 'Sociedad_Training.csv', 'Sociedad_Testing.csv'),
      ('el-mundo', 'ElMundo_Training.csv', 'ElMundo_Testing.csv'),
      ('economia', 'Economia_Training.csv', 'Economia_Testing.csv'),
  )

  for topic, train_path, test_path in outputs_by_topic:
    train_rows = train_dataset[train_dataset.topic == topic]
    train_rows.to_csv(train_path, sep='\t', encoding='utf-8')

    test_rows = test_dataset[test_dataset.topic == topic]
    test_rows.to_csv(test_path, sep='\t', encoding='utf-8')

In [15]:
# Split into training and validation sets
# Random-shuffle variant, kept for reference:
# X_train, X_test, y_train, y_test = train_test_split(news.loc[:, news.columns != 'topic'], news.topic, test_size=.10, random_state=10)
# shuffle=False keeps the current row order of `news`: the leading 90% of the rows become
# the training set and the trailing 10% the test set (random_state has no effect here).
X_train, X_test, y_train, y_test = train_test_split(news.loc[:, news.columns != 'topic'], news.topic, test_size=.10, random_state=10, shuffle=False)

# Save the training and validation sets as per-topic CSV files for the submission
create_train_test_directories(X_train, X_test, y_train, y_test)

# Keep only the paragraph text, converted to NumPy unicode arrays
X_train = X_train.paragraph.values.astype('U')
X_test = X_test.paragraph.values.astype('U')

# Free memory
del news
gc.collect()

100

In [16]:
# Minimum number of documents a term must appear in to be kept (integer => absolute count).
MIN_DF=3
# Maximum share of documents a term may appear in to be kept (float => proportion, i.e. 80%).
MAX_DF=0.8
# Smallest number of consecutive tokens taken as one n-gram
MIN_NGRAMS=1
# Largest number of consecutive tokens taken as one n-gram
MAX_NGRAMS=2

# TF-IDF features over unigrams and bigrams, using preprocess_text as the tokenizer.
# Lower-casing and accent stripping are applied by the vectorizer before tokenization.
tfidf_vectorizer = TfidfVectorizer(tokenizer=preprocess_text, lowercase=True, strip_accents='unicode', decode_error='ignore', 
                                   ngram_range=(MIN_NGRAMS, MAX_NGRAMS), min_df=MIN_DF, max_df=MAX_DF)

In [17]:
class DenseTransformer():
    """
    Pipeline step that converts a sparse feature matrix into a dense array.

    Needed before estimators that do not accept sparse input.
    NOTE(review): densifying a large TF-IDF matrix can require a lot of memory.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X, y=None, **fit_params):
        """
        Return X as a dense ``numpy.ndarray``.

        ``toarray()`` is used instead of ``todense()`` because the latter returns
        ``numpy.matrix``, which recent scikit-learn versions refuse as input.
        Input that is already dense is passed through ``np.asarray``.
        """
        if hasattr(X, 'toarray'):
            return X.toarray()
        return np.asarray(X)

In [18]:
# Stratified 5-fold cross-validation.
# shuffle is left at its default (False), so no random_state is passed: it has no effect
# without shuffling, and scikit-learn >= 0.24 raises a ValueError for that combination.
stratified_kfolds = StratifiedKFold(n_splits=5)

# Base estimators of the stacking ensemble
estimators = [
              ('dt', DecisionTreeClassifier()),
              ('nb', GaussianNB()),
              ('svc', LinearSVC())
]

# Hyperparameters to tune, addressed as <estimator name>__<parameter>.
# NOTE(review): this grid has 14 * 14 * 50 = 9800 combinations, each fitted once per fold.
estimators_hyperparams = {
    'dt__max_depth': list(range(2,16)), 'dt__min_samples_split': list(range(2,16)),
    'nb__var_smoothing': np.logspace(0,-9, num=50)
}

# Grid search with cross-validation over a stacking classifier built from the
# estimators above, with logistic regression as the final estimator
model_cv = GridSearchCV(estimator = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression()),
                        param_grid = estimators_hyperparams,
                        scoring = 'roc_auc_ovo_weighted',
                        cv = stratified_kfolds)

In [19]:
# Full pipeline: TF-IDF features -> dense matrix -> grid-searched stacking classifier
ml_pipeline = Pipeline(steps=[
                              ('preprocessor', tfidf_vectorizer),
                              ('to_dense', DenseTransformer()), 
                              ('grid_search_cv', model_cv)
])

In [20]:
# Free RAM before training
gc.collect()

253

In [None]:
# Fit the whole pipeline (vectorizer, densifier and grid search) on the training paragraphs
ml_pipeline.fit(X_train, y_train)

In [None]:
# Predict the topic of every paragraph in the test set
y_pred = ml_pipeline.predict(X_test)

In [None]:
# Print the predicted labels
pp.pprint(y_pred)

In [None]:
# Print the true labels of the test set
pp.pprint(y_test)

In [None]:
# Confusion matrix, with rows and columns ordered as in `labels`
labels = ["sociedad", "el-mundo", "economia"]
cnf_matrix = confusion_matrix(y_test, y_pred, labels=labels)

In [None]:
# Plot the confusion matrix as a heatmap (rows: true topic, columns: predicted topic).
# The matrix computed earlier is named `cnf_matrix`.
df_cm = pd.DataFrame(cnf_matrix, index=labels, columns=labels)
plt.figure(figsize = (10,7))
# fmt='d' prints the counts as plain integers instead of the default 2-significant-digit format
ax = sns.heatmap(df_cm, annot=True, fmt='d')
ax.set(xlabel='Predicted topic', ylabel='True topic', title='Confusion matrix');

In [None]:
# Accuracy
accuracy_score(y_test, y_pred)

In [None]:
# Precision, averaged over the classes weighted by their support
precision_score(y_test, y_pred, average='weighted')

In [None]:
# Recall, averaged over the classes weighted by their support
recall_score(y_test, y_pred, average='weighted')