<a href="https://colab.research.google.com/github/paucaroscanoa/ApiBookAuthor/blob/master/Caso_de_estudio_1_4_Agrupamiento_espectral_de_noticias.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Caso de estudio 1.4: Agrupamiento espectral - Agrupación de noticias

In [None]:
# Install the MITIE Python bindings directly from the upstream Git repository.
!pip3 install git+https://github.com/mit-nlp/MITIE.git
# Download the archive with the pre-trained MITIE models.
!wget https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2.tar.bz2
# Unpack the archive; the model-loading cell expects MITIE-models/english/ner_model.dat.
!tar jxf MITIE-models-v0.2.tar.bz2

In [None]:
import csv
import os
import time
from urllib.parse import urljoin

import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Machine learning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import cluster

# Web scraping
from bs4 import BeautifulSoup

# NLP (MITIE named-entity recognition)
from mitie import *
# Confirm that every import above resolved before loading the model.
print('Bibliotecas importadas com sucesso!\n')

# Load the English named-entity extractor from the unpacked MITIE model folder.
print("Carregando o modelo NER...")
ner_model_file = 'MITIE-models/english/ner_model.dat'
ner = named_entity_extractor(ner_model_file)

# Show which entity tags the loaded model can emit.
ner_tags = ner.get_possible_ner_tags()
print("\nEtiquetas de saída do modelo NER:", ner_tags)

# Genera la base de datos (Web Scraping)

In [None]:
UK_news_url = 'https://www.theguardian.com/uk'
# Archived snapshots that can be used instead of the live page:
#UK_news_url = 'https://web.archive.org/web/20230319001320/http://www.theguardian.com/uk'
#UK_news_url = 'http://web.archive.org/web/20230127223105/https://www.theguardian.com/uk'

# Download the front page that links to the individual topics. A timeout keeps
# the cell from hanging forever, and raise_for_status() stops us from parsing
# an HTTP error page as if it were the real front page.
front_page = requests.get(UK_news_url, timeout=30)
front_page.raise_for_status()
html_data = front_page.text
soup = BeautifulSoup(html_data, 'html.parser')

# START OF UPDATE IF NEEDED ---------------------
# Update the class name (and the slice) below if the site's markup changes.
topic_elements = soup.find_all(class_='dcr-4hq641')[1:8]
# END OF UPDATE IF NEEDED ---------------------

# Build both lists in a single pass so that topics[i] always matches url_topics[i].
url_topics = []
topics = []
for el in topic_elements:
    link = el.find('a')
    # Skip elements without a usable link instead of crashing on None['href'].
    if link is None or not link.get('href'):
        continue
    # urljoin handles relative ("/uk/sport") and already-absolute hrefs alike.
    url_topics.append(urljoin('https://www.theguardian.com', link['href']))
    topics.append(el.text.strip('\n').replace(' ', '_'))

if not topics:
    print('No topics found: the CSS class used to locate them probably needs updating.')

for i in range(len(topics)):
    print('Topic {}: {} ({})'.format(i+1,topics[i],url_topics[i]))

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
def save_to_txt(filename, content, directory="Data"):
    '''
    Write ``content`` to ``<directory>/<filename>.txt``, replacing any existing file.

    Parameters
    ----------
    filename : str
        File name without the ``.txt`` extension.
    content : object
        Value to store. It is written with ``print``, so it is converted with
        ``str()`` and followed by a trailing newline.
    directory : str, optional
        Target folder (default ``"Data"``). It is created if it does not exist.
    '''
    os.makedirs(directory, exist_ok=True)
    path = os.path.join(directory, "{}.txt".format(filename))
    # Explicit UTF-8: the entity-extraction cell decodes these files as UTF-8,
    # so the platform's default encoding must not leak into the files.
    with open(path, "w", encoding="utf-8") as f:
        print(content, file=f)

# Create the folder where the scraped articles are saved. exist_ok=True makes
# the cell safe to re-run: os.mkdir raises FileExistsError on a second run.
os.makedirs('Data/', exist_ok=True)

In [None]:
# Function to request and check if url exists
def make_request_with_retry(url, max_retries=3, timeout=30, delay=5):
    """
    GET ``url``, retrying when the request fails.

    Parameters
    ----------
    url : str
        Address to download.
    max_retries : int, optional
        Maximum number of attempts (default 3).
    timeout : float, optional
        Seconds to wait for the server on each attempt (default 30), so a
        stalled connection cannot block the notebook indefinitely.
    delay : float, optional
        Seconds to pause between two attempts (default 5).

    Returns
    -------
    requests.Response or None
        The response of the first attempt that succeeds with a non-error HTTP
        status, or ``None`` when every attempt failed. Callers must check for
        ``None`` before using the result.
    """
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            print(f"Error making request to {url}: {e}")
            # Pause only if another attempt follows; sleeping after the last
            # failure would just delay the caller.
            if attempt < max_retries:
                time.sleep(delay)
    return None

In [None]:
article_titles = []
article_contents = []
article_topics = []
articles_per_topic = 15
n = 1  # 1-based counter used in the Data/ file names (title-n, article-n, topic-n)

for topic, url_topic in zip(topics, url_topics):
    print('\n{}:'.format(topic))

    # Fetch the topic page with the same retry helper used for the articles,
    # so a transient network error does not abort the whole scraping run.
    topic_response = make_request_with_retry(url_topic)
    if topic_response is None:
        print('Failed to load topic page {}'.format(url_topic))
        continue
    topic_soup = BeautifulSoup(topic_response.text, 'html.parser')

    # START OF UPDATE IF NEEDED ---------------------
    article_links = [el.find('a') for el in topic_soup.find_all(class_='dcr-c7jt3v')]
    # END OF UPDATE IF NEEDED ---------------------

    # Ignore elements without a usable link; urljoin copes with both relative
    # and already-absolute hrefs. dict.fromkeys drops repeated URLs while
    # keeping the page order, so the same article is not downloaded twice.
    url_articles = list(dict.fromkeys(
        urljoin('https://www.theguardian.com', link['href'])
        for link in article_links
        if link is not None and link.get('href')
    ))

    collected = article_topics.count(topic)
    for url_article in url_articles:
        if collected >= articles_per_topic:
            break

        response = make_request_with_retry(url_article)
        if response is None:
            # Every retry failed: skip this article and try the next link.
            print('Failed to extract article from {}'.format(url_article))
            continue
        article_soup = BeautifulSoup(response.text, 'html.parser')

        # START OF UPDATE IF NEEDED ---------------------
        title_element = article_soup.find(class_='dcr-cohhs3')
        content = ' '.join([el.text for el in article_soup.find_all('p')])
        # END OF UPDATE IF NEEDED ---------------------

        if title_element is None:
            # Pages without the expected headline element are skipped.
            print('Failed to extract article from {}'.format(url_article))
            continue
        title = title_element.text.strip('\n')

        # The same story can be listed under several topics; keep it once.
        if title in article_titles:
            continue

        article_titles.append(title)
        article_contents.append(content)
        article_topics.append(topic)
        save_to_txt('title-{}'.format(n), title)
        save_to_txt('article-{}'.format(n), content)
        save_to_txt('topic-{}'.format(n), topic)
        print('{}'.format(title))
        n += 1
        collected += 1

        # Progress report every 10 stored articles.
        if len(article_titles) % 10 == 0:
            print('Article count: {}'.format(len(article_titles)))

    if collected < articles_per_topic:
        print('Only {} articles found in "{}"'.format(collected, topic))

df = pd.DataFrame({'topic':article_topics,'title':article_titles,'content':article_contents})
df

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Horizontal bar chart with the number of scraped articles per topic.
topic_sizes = df.groupby('topic').size()
bar_colors = sns.palettes.mpl_palette('Dark2')
ax = topic_sizes.plot(kind='barh', color=bar_colors)

# Hide the top and right borders of the plot area.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

# Importación de la base de datos

Una vez que tenemos la base de datos guardada en la carpeta deseada, podemos usar el código del caso de estudio para importar la información.

In [None]:
def _read_flat_lowercase(path):
    """Return the text stored in ``path`` with newlines removed and lower-cased."""
    # Explicit UTF-8: the entity-extraction cell decodes these same files as UTF-8.
    with open(path, 'r', encoding='utf-8') as myfile:
        return myfile.read().replace('\n', '').lower()


# Total number of articles to process (one row of df per saved article).
N = df.shape[0]

# Files are numbered from 1 to N, so position i-1 of every list refers to article i.
# Article bodies:
corpus = [_read_flat_lowercase('Data/article-' + str(i) + '.txt') for i in range(1, N+1)]
# Original topic of each article:
topics_array = [_read_flat_lowercase('Data/topic-' + str(i) + '.txt') for i in range(1, N+1)]
# Article titles:
titles_array = [_read_flat_lowercase('Data/title-' + str(i) + '.txt') for i in range(1, N+1)]

# Generación de atributos

Para generar los atributos de cada instancia (artículo):

1. Enlazamos todos los corpus de texto de artículos para determinar todas las palabras únicas que se utilizan en el conjunto de datos.
2. Buscamos el subconjunto de las entidades del modelo NER que se encuentra entre las palabras únicas que se utilizan en el conjunto de datos (determinado en el paso 1).

In [None]:
#vetor de subconjunto de entidades
entity_text_array = []
for i in range(1, N+1):
    #carregue o arquivo de texto con o conteúdo do artigo e converta-o em uma lista de palavras
    tokens = tokenize(load_entire_file(('Data/article-' + str(i) + '.txt')))
    #extraia todas as entidades conhecidas do modelo ner mencionado neste artigo
    entities = ner.extract_entities(tokens)
    #extraia as palavras de entidades reais adicione-as ao vetor
    for e in entities:
        range_array = e[0]
        tag = e[1]
        score = e[2]
        score_text = "{:0.3f}".format(score)
        entity_text = " ".join(tokens[j].decode("utf-8") for j in range_array)
        entity_text_array.append(entity_text.lower())
#elimine as entidades duplicadas que foram detectadas
#entity_text_array = np.unique(entity_text_array)
entity_text_array = list(set(entity_text_array))


Ahora que ya tenemos la lista de todas las entidades utilizadas en la base de datos, podemos representar cada artículo como un vector que contiene la puntuación de [TF-IDF](https://en.wikipedia.org/wiki/Tf–idf) para cada entidad almacenada en el `entity_text_array`. Esta tarea se puede realizar fácilmente con la librería [scikit-learn](http://scikit-learn.org/stable/) de Python

In [None]:
# Many entities span several words ("new york"). With the default unigram
# analyzer those vocabulary entries can never match, so their columns would be
# all zeros. Allow n-grams up to the length of the longest entity.
max_entity_words = max((len(entity.split()) for entity in entity_text_array), default=1)

# NOTE(review): stop words are removed before n-grams are built, so entities
# containing a stop word (e.g. "bank of england") may still go unmatched.
# NOTE(review): scikit-learn appears to skip max_df pruning when a fixed
# vocabulary is supplied -- confirm against the installed version.
vect = TfidfVectorizer(sublinear_tf=True, max_df=0.5, analyzer='word',
                       stop_words='english', vocabulary=entity_text_array,
                       ngram_range=(1, max_entity_words))
# One row per article, one column per entity in entity_text_array.
corpus_tf_idf = vect.fit_transform(corpus)

Ahora que tenemos los artículos representados por sus atributos (puntuaciones de TF-IDF), podemos llevar a cabo el agrupamiento espectral de los mismos usando la librería `scikit-learn` de nuevo

In [None]:
#Altere n_clusters para o número de grupos desejados
n_clusters = 8
#Agrupamento espectral
spectral = cluster.SpectralClustering(n_clusters= n_clusters,
                                      eigen_solver='arpack',
                                      affinity="nearest_neighbors",
                                      n_neighbors = 10)
spectral.fit(corpus_tf_idf)

Finalmente las siguientes líneas de código permiten ver el output en el siguiente formato (una línea por artículo):

<br>

__no. artículo, tema, cluster, título__

In [None]:
# labels_ only exists once the estimator has been fitted.
if hasattr(spectral, 'labels_'):
    cluster_assignments = spectral.labels_.astype(int)
    # One line per article: index, original topic, assigned cluster, title.
    for article_idx, cluster_id in enumerate(cluster_assignments):
        print(article_idx, topics_array[article_idx], cluster_id, titles_array[article_idx])

In [None]:
# Attach the cluster assigned to every article.
df['predictions'] = cluster_assignments

# One indicator column per cluster, then count the articles of each topic
# that landed in each cluster.
cluster_indicators = pd.get_dummies(df, columns=['predictions'])
cluster_indicators = cluster_indicators.drop(['title','content'],axis=1)
predictions_df = cluster_indicators.groupby(['topic']).sum()
predictions_df