<a href="https://colab.research.google.com/github/marcondesc/mcws_wasa/blob/main/TF_IDF_N_Gramas_Redes_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


* Pré-processamento de Textos e Representação com Bag-of-Words
* Medida TFIDF
* N-gramas
* Redes k-NN e Agrupamento



# Instalando e Importando Bibliotecas


### Pandas e Numpy

In [1]:
import pandas as pd
import numpy as np

### NLTK
* Apoio ao pré-processamento de textos (tokenização, stopwords, radicalização)

In [2]:
import nltk

# Download the NLTK resources this notebook asks for: the 'punkt' tokenizer
# models, the 'stopwords' corpus and the 'rslp' stemmer package.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('rslp')

from nltk.tokenize import word_tokenize
# Explicit import instead of `import *`: PorterStemmer is the only name from
# nltk.stem.porter that the notebook references, and a wildcard import hides
# where names come from.
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Unzipping stemmers/rslp.zip.


### Sklearn
* Construção do Modelo Espaço-Vetorial
* Medidas de Similaridade

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import kneighbors_graph

### Networkx e Plotly
* Construção de Redes k-NN
* Visualização Interativa de Grafos

In [4]:
import plotly.graph_objects as go
import networkx as nx
from networkx.algorithms import community

### Métodos de apoio

In [70]:
def remove_stopwords(text,stop_words):
  """Lower-case and tokenize ``text``, dropping stop words and non-word tokens.

  Args:
    text: Value to tokenize; it is coerced with ``str()`` before processing.
    stop_words: Iterable of hashable words to discard. Tokens are lower-cased
      before the comparison.

  Returns:
    List with the surviving tokens, in the order they appear in ``text``.
  """
  # A list is scanned linearly on every membership test; converting once to a
  # set makes each per-token lookup constant time.
  if not isinstance(stop_words, (set, frozenset)):
    stop_words = set(stop_words)

  # everything to lower case before tokenizing
  tokens = word_tokenize(str(text).lower())

  # drop stop words, pure digits, special characters and punctuation
  return [
      word for word in tokens
      if word not in stop_words and word.isalnum() and not word.isdigit()
  ]

def stemming(tokens,stemmer):
  """Return the stem of each token, keeping the input order.

  ``stemmer`` is any object exposing a ``stem(word)`` method.
  """
  stems = []
  for token in tokens:
    stems.append(stemmer.stem(token))
  return stems

def meu_tokenizador(doc, stop_words=nltk.corpus.stopwords.words('english'), stemmer=PorterStemmer()):
  """Tokenize ``doc``: filter stop words, then stem what is left.

  The defaults are evaluated once, when this ``def`` runs: the English
  stop-word list from NLTK and a single ``PorterStemmer`` instance.

  Returns the list of stemmed tokens.
  """
  return stemming(remove_stopwords(doc,stop_words),stemmer)


def get_cluster_descriptors(VSM, df_documentos, cluster_id, max_terms=3, text_column='Title'):
  """Summarize one cluster by its size and its highest-weighted terms.

  Args:
    VSM: Fitted vectorizer exposing ``get_feature_names_out()`` and
      ``transform()``; the transformed result must offer ``toarray()``.
    df_documentos: DataFrame with a ``cluster`` column and the text column.
    cluster_id: Value of ``cluster`` that selects the documents to summarize.
    max_terms: Maximum number of descriptors to return.
    text_column: Name of the column holding the text passed to ``VSM``
      (defaults to ``'Title'``, the column previously hard-coded here).

  Returns:
    Tuple ``(num_docs, descriptors)``: the number of documents in the cluster
    and up to ``max_terms`` terms with a positive summed weight, ordered from
    the largest sum to the smallest. An empty cluster yields ``(0, [])``.
  """
  # Select the cluster's texts once; the count and the weights both use them.
  cluster_texts = df_documentos.loc[df_documentos['cluster'] == cluster_id, text_column]
  num_docs = len(cluster_texts)
  if num_docs == 0:
    # Nothing to vectorize, so skip the vectorizer call altogether.
    return 0, []

  term_weights = pd.DataFrame({
      'word': VSM.get_feature_names_out(),
      'tfidf_sum': VSM.transform(cluster_texts).toarray().sum(axis=0),
  }).sort_values(by='tfidf_sum', ascending=False)

  # Terms whose summed weight is zero never occur in this cluster.
  descriptors = term_weights.loc[term_weights['tfidf_sum'] > 0, 'word'].head(max_terms).to_list()

  return num_docs, descriptors

# Usando TFIDF na ponderação dos termos


### Sobre a base textual
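* Amostra textual sobre projetos de pesquisa que receberam investimento da NSF (National Science Foundation)
* **Nota:** a célula de leitura abaixo carrega o arquivo `scopus_1_418_docs.csv` (e não a base NSF) e utiliza apenas a coluna `Title` como texto dos documentos.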

https://archive.ics.uci.edu/ml/datasets/NSF+Research+Award+Abstracts+1990-2003

### Lendo a base textual

In [53]:
# Read the exported CSV and keep only the 'Title' column as the corpus.
# (An earlier, commented-out variant of this cell read 'nsf_data.csv'.)
# NOTE(review): the path is absolute and looks Colab-specific — adjust when
# running elsewhere.
df = pd.read_csv('/content/scopus_1_418_docs.csv').reset_index(drop=True)
df_documentos = df['Title']

### Modelo Espaço Vetorial + TFIDF
* Ponderação TFIDF
* Corte por DF (Document Frequency)

In [54]:
# Unigram TF-IDF model using the custom tokenizer, with a document-frequency
# cut-off of 3 (min_df).
VSM = TfidfVectorizer(
    tokenizer=meu_tokenizador,
    min_df=3,
)
X = VSM.fit_transform(df_documentos)



In [55]:
# Display the repr of the fitted document-term matrix (shape, dtype, stored elements).
X

<452x268 sparse matrix of type '<class 'numpy.float64'>'
	with 2702 stored elements in Compressed Sparse Row format>

In [56]:
# Rank the vocabulary by the column-wise sum of the TF-IDF weights in X.
df_word_tfidfs = pd.DataFrame({
    'word': VSM.get_feature_names_out(),
    'tfidf_sum': X.toarray().sum(axis=0),
}).sort_values(by='tfidf_sum', ascending=False)
df_word_tfidfs.head(50)

Unnamed: 0,word,tfidf_sum
70,digit,50.075094
251,transform,37.361065
110,govern,36.880715
157,manag,16.360523
62,data,13.149181
199,public,12.45229
131,innov,11.245073
127,inform,10.876736
166,model,10.736095
246,technolog,10.272529


# Analisando N-gramas

#### Bigramas

In [57]:
# Refit the vectorizer on bigrams only (ngram_range=(2, 2)).
VSM = TfidfVectorizer(
    tokenizer=meu_tokenizador,
    min_df=3,
    ngram_range=(2,2),
)
X = VSM.fit_transform(df_documentos)

# Rank the bigrams by the column-wise sum of their TF-IDF weights.
df_bigrams_tfidfs = pd.DataFrame({
    'word': VSM.get_feature_names_out(),
    'tfidf_sum': X.toarray().sum(axis=0),
}).sort_values(by='tfidf_sum', ascending=False)
df_bigrams_tfidfs.head(50)



Unnamed: 0,word,tfidf_sum
33,digit transform,80.01394
25,digit govern,18.649013
42,govern digit,12.735056
87,transform govern,8.516999
9,case studi,7.770124
73,smart citi,6.637325
69,public servic,6.504847
89,transform public,6.470297
53,inform system,6.323796
57,literatur review,6.222763


#### Trigramas

In [58]:
# Refit the vectorizer on trigrams only (ngram_range=(3, 3)).
VSM = TfidfVectorizer(
    tokenizer=meu_tokenizador,
    min_df=3,
    ngram_range=(3,3),
)
X = VSM.fit_transform(df_documentos)

# Rank the trigrams by the column-wise sum of their TF-IDF weights.
df_trigrams_tfidfs = pd.DataFrame({
    'word': VSM.get_feature_names_out(),
    'tfidf_sum': X.toarray().sum(axis=0),
}).sort_values(by='tfidf_sum', ascending=False)
df_trigrams_tfidfs.head(50)



Unnamed: 0,word,tfidf_sum
6,digit transform govern,10.0
10,govern digit transform,9.286314
18,systemat literatur review,8.0
8,digit transform public,6.95411
11,higher educ institut,5.0
9,digit transform strategi,4.0
14,manag digit transform,3.400034
7,digit transform project,3.159415
5,digit transform conceptu,3.0
1,adopt digit transform,3.0


#### Unigramas + Bigramas + Trigramas

In [59]:
# Single model covering unigrams, bigrams and trigrams (ngram_range=(1, 3)).
VSM = TfidfVectorizer(
    tokenizer=meu_tokenizador,
    min_df=3,
    ngram_range=(1,3),
)
X = VSM.fit_transform(df_documentos)



# Gerando nossa rede k-NN

In [60]:
# Bigram TF-IDF representation used to build the k-NN graph; note the lower
# document-frequency cut-off (min_df=2) compared with the cells above.
VSM = TfidfVectorizer(
    tokenizer=meu_tokenizador,
    min_df=2,
    ngram_range=(2,2),
)
X = VSM.fit_transform(df_documentos)

In [61]:
# k-NN adjacency over the rows of X: 3 neighbours per document, cosine metric.
A = kneighbors_graph(X, n_neighbors=3, metric="cosine")

In [62]:
# Build a NetworkX graph from the k-NN adjacency matrix A.
G = nx.Graph(A)

### Identificando estruturas na rede usando label propagation

In [63]:
# Number the communities found by label propagation from 0 upwards and store
# that number in the 'cluster' attribute of every member node.
for cluster_id, members in enumerate(community.label_propagation_communities(G)):
  for doc_id in members:
    G.nodes[doc_id]['cluster'] = cluster_id

In [64]:
# df_documentos is still a Series here, so its index supplies the node ids
# used to look up each document's cluster in G.
L_clusters = [G.nodes[doc_index]['cluster'] for doc_index in df_documentos.index]

# Promote the Series to a DataFrame so the cluster label can sit next to the title.
df_documentos = pd.DataFrame(df_documentos)
df_documentos['cluster'] = L_clusters
df_documentos

Unnamed: 0,Title,cluster
0,Development of an assessment model for industr...,0
1,Web-enabled supply chain management: Key antec...,1
2,It consumerization and the transformation of i...,2
3,E-government in Canada: Transformation for the...,3
4,Digital transformation in latecomer industries...,4
...,...,...
447,"Standing Conference of Eastern, Central, and S...",0
448,"Lecture Notes in Informatics (LNI), Proceeding...",48
449,From territories to tourist areas: Ending some...,0
450,PERIKLIS - electronic democracy in the 21st ce...,0


In [84]:
# Re-attach the cluster labels, then list the documents from the highest
# cluster id down to the lowest.
df_documentos['cluster'] = L_clusters
df_documentos_ordenados = df_documentos.sort_values('cluster', ascending=False)
df_documentos_ordenados

Unnamed: 0,Title,cluster
448,"Lecture Notes in Informatics (LNI), Proceeding...",48
444,ACM International Conference Proceeding Series,48
419,ACM International Conference Proceeding Series,48
434,ACM International Conference Proceeding Series,48
413,"19th IFIP WG 6.11 Conference on e-Business, e-...",47
...,...,...
349,C﻿hief digital officers: the state of the art ...,0
259,The Norwegian COVID-19 Tracing App Experiment:...,0
351,COGNITIVE RISKS,0
131,The Enterprise Imaging Value Proposition,0


### Selecionando os termos com maiores TFIDF (Term Frequency-Inverse Document Frequency) de um cluster

In [66]:
# Inspect the documents assigned to cluster 17.
df_documentos.loc[df_documentos['cluster'] == 17]

Unnamed: 0,Title,cluster
29,Business performance management models based o...,17
191,Data analysis model design of health service m...,17
428,Integrated data analytics and visualization fo...,17


In [71]:
# Document count and top descriptors for cluster 17, using the default max_terms.
get_cluster_descriptors(VSM, df_documentos, 17)

(3, ['perform manag', 'model base', 'data analyt'])

In [72]:
qtd_topics = 15

# One [cluster, num_docs, descriptors] record per distinct cluster label.
cluster_rows = [
    [cluster, *get_cluster_descriptors(VSM, df_documentos, cluster)]
    for cluster in df_documentos.cluster.unique()
]

df_descriptors = pd.DataFrame(cluster_rows)
df_descriptors.columns = ['cluster','num_docs','descriptors']

# Show the qtd_topics largest clusters.
df_descriptors.sort_values(by='num_docs',ascending=False).head(qtd_topics)

Unnamed: 0,cluster,num_docs,descriptors
0,0,151,"[emerg technolog, big data, design implement]"
10,10,41,"[digit transform, transform innov, respons digit]"
16,16,21,"[govern digit, digit transform, case studi]"
22,22,13,"[digit govern, govern transform, govern transit]"
1,1,11,"[suppli chain, digit busi, dynam capabl]"
2,2,10,"[transform govern, digit transform, govern soc..."
19,19,10,"[digit transform, transform conceptu, adapt go..."
8,8,9,"[inform system, improv govern, 18th european]"
15,15,9,"[transform public, digit transform, public ser..."
12,12,8,"[framework digit, transform strategi, digit tr..."


#### Mantendo apenas os documentos dos clusters selecionados
* O objetivo aqui é ressaltar os tópicos/temas mais relevantes da base de dados, conforme a estrutura da rede k-NN

In [73]:
# Labels of the qtd_topics clusters with the most documents.
top_clusters = df_descriptors.sort_values(by='num_docs',ascending=False).head(qtd_topics)
selected_clusters = top_clusters.cluster.to_list()

# Work on a copy of G and drop every node whose cluster was not selected.
G2 = G.copy()
G2.remove_nodes_from(
    [node for node in G.nodes() if G.nodes[node]['cluster'] not in selected_clusters]
)

In [74]:
# Compute node coordinates for the visualization (fixed seed) and store them
# in each node's 'pos' attribute.
pos = nx.spring_layout(G2, seed=42)
nx.set_node_attributes(G2, pos, 'pos')

In [76]:
# Build the cluster -> descriptor-text lookup once instead of filtering
# df_descriptors again for every document. setdefault keeps the first row
# found for a cluster, as the former per-row `.to_list()[0]` lookup did.
descriptor_by_cluster = {}
for cluster, descriptors in zip(df_descriptors['cluster'], df_descriptors['descriptors']):
  descriptor_by_cluster.setdefault(cluster, str(descriptors))

# Attach the hover text (cluster descriptors + document title) to every
# document that is still a node of G2.
for index, title in df_documentos['Title'].items():
  if index in G2.nodes:
    cluster_descriptor = descriptor_by_cluster[G2.nodes[index]['cluster']]
    G2.nodes[index]['Title'] = cluster_descriptor+"<br>"+str(title)

##### Código para visualização interativa do grafo.
Fonte: https://plotly.com/python/network-graphs/

In [81]:
def show_graph(G):
  """Draw ``G`` as an interactive Plotly figure and show it.

  Every node is expected to carry three attributes: 'pos' (an ``(x, y)``
  pair), 'Title' (text shown on hover) and 'cluster' (value used as the
  marker color). Nothing is returned; the figure is displayed with
  ``fig.show()``.
  """
  ### EDGES
  # Each edge adds (start, end, None) to the coordinate lists.
  edge_x, edge_y = [], []
  for source, target in G.edges():
    x0, y0 = G.nodes[source]['pos']
    x1, y1 = G.nodes[target]['pos']
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

  # edge color and style
  edge_trace = go.Scatter(
      x=edge_x, y=edge_y,
      mode='lines',
      hoverinfo='none',
      line=dict(width=2, color='#888'))

  ### NODES
  # Coordinates, hover text and cluster label of every node, in node order.
  node_x, node_y = [], []
  for node in G.nodes():
    x, y = G.nodes[node]['pos']
    node_x.append(x)
    node_y.append(y)
  node_text = [G.nodes[node]['Title'] for node in G.nodes()]
  node_labels = [G.nodes[node]['cluster'] for node in G.nodes()]

  # node style; markers are colored by cluster label
  node_trace = go.Scatter(
      x=node_x, y=node_y,
      mode='markers',
      hoverinfo='text',
      text=node_text,
      marker=dict(
          size=10,
          line_width=2,
          color=node_labels))

  # Hide grid, zero line and tick labels on both axes.
  hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
  layout = go.Layout(
      showlegend=False,
      hovermode='closest',
      margin=dict(b=20,l=5,r=5,t=40),
      xaxis=dict(hidden_axis),
      yaxis=dict(hidden_axis))

  fig = go.Figure(data=[edge_trace, node_trace], layout=layout)
  fig.show()

### Visualização

In [82]:
# Render the filtered graph G2 (only the selected clusters) interactively.
show_graph(G2)