<a href="https://colab.research.google.com/github/mayarachew/NLP/blob/main/Exerc%C3%ADcio_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recuperação de Textos

## Importação de bibliotecas

In [1]:
import nltk
import pandas as pd
import numpy as np
import re
import math

from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('reuters')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Definição do dataset

In [2]:
cats = reuters.categories()
print("Reuters has %d categories:\n%s" % (len(cats), cats))

fileids = reuters.fileids()

Reuters has 90 categories:
['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [3]:
categories = []
text = []

for file in fileids:
    categories.append(reuters.categories(file))
    text.append(reuters.raw(file))

raw_df = pd.DataFrame({'ids':fileids, 'categories':categories, 'text':text})
raw_df

Unnamed: 0,ids,categories,text
0,test/14826,[trade],ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...
1,test/14828,[grain],CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...
2,test/14829,"[crude, nat-gas]",JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...
3,test/14832,"[corn, grain, rice, rubber, sugar, tin, trade]",THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n ...
4,test/14833,"[palm-oil, veg-oil]",INDONESIA SEES CPO PRICE RISING SHARPLY\n Ind...
...,...,...,...
10783,training/999,"[interest, money-fx]",U.K. MONEY MARKET SHORTAGE FORECAST REVISED DO...
10784,training/9992,[earn],KNIGHT-RIDDER INC &lt;KRN> SETS QUARTERLY\n Q...
10785,training/9993,[earn],TECHNITROL INC &lt;TNL> SETS QUARTERLY\n Qtly...
10786,training/9994,[earn],NATIONWIDE CELLULAR SERVICE INC &lt;NCEL> 4TH ...


## Pré-processamento

In [4]:
def preprocessing(text):
    # regex
    pp_text = re.sub(r':\)|\(:', 'happy', text)
    pp_text = re.sub(r':\(|\):', 'sad', pp_text)
    pp_text = re.sub(r':D', 'excited', pp_text)
    pp_text = re.sub(r'D:', 'sorrowful', pp_text)
    pp_text = re.sub(r'#', '', pp_text)
    pp_text = re.sub(r'\$[\d.,]*', 'price', pp_text)
    pp_text = re.sub(r'[^ ]*(\’m)', 'I am', pp_text)
    pp_text = re.sub(r'\’t', ' not', pp_text)
    pp_text = re.sub(r'its', ' it’s', pp_text)
    pp_text = re.sub(r'http:[^ ]*', '', pp_text)
    pp_text = re.sub(r'@[^ ]*', '', pp_text)
    pp_text = re.sub(r'!', ' important ', pp_text)
    pp_text = re.sub(r'\.\.\.', ' etc ', pp_text)
    pp_text = re.sub(r'[^\w\s]', '', pp_text)
    pp_text = re.sub(r'  *', ' ', pp_text)
    pp_text = re.sub(r'\n', ' ', pp_text)
    pp_text = pp_text.lower()

    # tokenization
    pp_token_text = word_tokenize(pp_text)

    # remove stopwords
    pp_token_text = [word for word in pp_token_text if word not in stopwords.words('english')]

    # lemmatization
    wl = WordNetLemmatizer()
    pp_token_text = [wl.lemmatize(word) for word in pp_token_text]
    return pp_text

In [5]:
texts_list = []
corpus = ''
num_samples = 100

for text in raw_df['text'][:num_samples]:
  pp_text = preprocessing(text)
  texts_list.append(pp_text)
  corpus += pp_text+''

tokens = corpus.split()
vocab = sorted(set(tokens))
all_vocab = sorted(tokens)

df = pd.DataFrame()
df['ids'] = raw_df['ids'][:num_samples]
df['categories'] = raw_df['categories'][:num_samples]
df['text'] = texts_list

df.head(3)

Unnamed: 0,ids,categories,text
0,test/14826,[trade],asian exporters fear damage from usjapan rift ...
1,test/14828,[grain],china daily says vermin eat 712 pct grain stoc...
2,test/14829,"[crude, nat-gas]",japan to revise longterm energy demand downwar...


## Definição do Term Frequency (BoW ponderado)

In [6]:
tf = {}
N = {}
category_dict = dict()

for idx, text in enumerate(df['text']):
  category_dict[f'sentenca {idx+1}'] = df['categories'][idx]
  tf[f'sentenca {idx+1}'] = dict()
  N[f'sentenca {idx+1}'] = len(text.split())
  for word in text.split():
    if word in tf[f'sentenca {idx+1}']:
      tf[f'sentenca {idx+1}'][word] += 1
    else:
      tf[f'sentenca {idx+1}'][word] = 1

for text in tf:
  for word in tf[text]:
    tf[text][word] /= N[text]

## Definição do Inverse Document Frequency

In [7]:
bow_contagem = dict()

for idx, text in enumerate(df['text']):
  bow_contagem[f'sentenca {idx+1}'] = dict()
  for word in text.split():
    if word in bow_contagem[f'sentenca {idx+1}']:
        bow_contagem[f'sentenca {idx+1}'][word] += 1
    else:
        bow_contagem[f'sentenca {idx+1}'][word] = 1

idf = {}

for word in vocab:
  idf[word] = 0
  # count how many documents have this word
  for text in bow_contagem:
    if word in bow_contagem[text]:
      idf[word] += 1

for word in idf:
    idf[word] = math.log(len(df['text'])/idf[word])

## Criação do vetor TF-IDF

In [8]:
tfidf = {}

for text in tf:
    tfidf[text] = dict()
    for word in tf[text]:
        tfidf[text][word] = tf[text][word]*idf[word]

tfidf = pd.DataFrame().from_records(tfidf).fillna(0).T

tfidf.head(3)

Unnamed: 0,asian,exporters,fear,damage,from,usjapan,rift,mounting,trade,friction,...,suger,allotted,dairy,cereal,moving,jp,morgan,128,granted,westgermany
sentenca 1,0.012918,0.019377,0.006459,0.009836,0.004071,0.006459,0.006459,0.006459,0.041363,0.005487,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sentenca 10,0.0,0.0,0.0,0.0,0.005759,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sentenca 100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Cálculo da acurácia

In [9]:
def cosine_similarity(vetor1, vetor2):
    inner_prod = 0

    for i,valor in enumerate(vetor1):
        inner_prod += valor*vetor2[i]

    norma_vetor1 = math.sqrt(sum([x**2 for x in vetor1])) 
    norma_vetor2 = math.sqrt(sum([x**2 for x in vetor2])) 
    
    return inner_prod/(norma_vetor1*norma_vetor2)

In [10]:
all_top_10_lists = []

for idx, row in tfidf.iterrows():
  top_list = []
  for idx_to_compare, row_to_compare in tfidf.iterrows():
    top_list.append({'original_idx':idx,
                     'original_category':category_dict[idx],
                     'idx':idx_to_compare, 
                     'category': category_dict[idx_to_compare],
                     'cosine similarity':cosine_similarity(tfidf.loc[idx].to_numpy(), tfidf.loc[idx_to_compare].to_numpy())})
  top_list = sorted(top_list, key=lambda d: d['cosine similarity'], reverse=True) 
  all_top_10_lists.append(top_list[1:11])

all_top_10_lists[0]

[{'category': ['carcass',
   'corn',
   'grain',
   'livestock',
   'oilseed',
   'rice',
   'soybean',
   'trade'],
  'cosine similarity': 0.37554316738869936,
  'idx': 'sentenca 15',
  'original_category': ['trade'],
  'original_idx': 'sentenca 1'},
 {'category': ['trade'],
  'cosine similarity': 0.29071959423829363,
  'idx': 'sentenca 43',
  'original_category': ['trade'],
  'original_idx': 'sentenca 1'},
 {'category': ['trade'],
  'cosine similarity': 0.2902351538083828,
  'idx': 'sentenca 28',
  'original_category': ['trade'],
  'original_idx': 'sentenca 1'},
 {'category': ['trade'],
  'cosine similarity': 0.28631936592880064,
  'idx': 'sentenca 39',
  'original_category': ['trade'],
  'original_idx': 'sentenca 1'},
 {'category': ['bop', 'trade'],
  'cosine similarity': 0.2710753329175149,
  'idx': 'sentenca 19',
  'original_category': ['trade'],
  'original_idx': 'sentenca 1'},
 {'category': ['groundnut'],
  'cosine similarity': 0.19548618497450362,
  'idx': 'sentenca 60',
  'ori

In [11]:
all_top_10 = pd.DataFrame()

for i in range(len(df)):
  new_df = pd.DataFrame.from_dict(all_top_10_lists[i])
  all_top_10 = pd.concat([all_top_10, new_df])

print('Top 10 textos mais semelhantes ao texto \'sentença 1:\'')
all_top_10.head(10)

Top 10 textos mais semelhantes ao texto 'sentença 1:'


Unnamed: 0,original_idx,original_category,idx,category,cosine similarity
0,sentenca 1,[trade],sentenca 15,"[carcass, corn, grain, livestock, oilseed, ric...",0.375543
1,sentenca 1,[trade],sentenca 43,[trade],0.29072
2,sentenca 1,[trade],sentenca 28,[trade],0.290235
3,sentenca 1,[trade],sentenca 39,[trade],0.286319
4,sentenca 1,[trade],sentenca 19,"[bop, trade]",0.271075
5,sentenca 1,[trade],sentenca 60,[groundnut],0.195486
6,sentenca 1,[trade],sentenca 33,"[interest, money-fx]",0.194726
7,sentenca 1,[trade],sentenca 53,"[dlr, money-fx]",0.161649
8,sentenca 1,[trade],sentenca 34,"[cpi, gnp]",0.158633
9,sentenca 1,[trade],sentenca 10,[acq],0.140618


In [12]:
original_idx_list = all_top_10['original_idx'].unique()
original_idx_list

array(['sentenca 1', 'sentenca 10', 'sentenca 100', 'sentenca 11',
       'sentenca 12', 'sentenca 13', 'sentenca 14', 'sentenca 15',
       'sentenca 16', 'sentenca 17', 'sentenca 18', 'sentenca 19',
       'sentenca 2', 'sentenca 20', 'sentenca 21', 'sentenca 22',
       'sentenca 23', 'sentenca 24', 'sentenca 25', 'sentenca 26',
       'sentenca 27', 'sentenca 28', 'sentenca 29', 'sentenca 3',
       'sentenca 30', 'sentenca 31', 'sentenca 32', 'sentenca 33',
       'sentenca 34', 'sentenca 35', 'sentenca 36', 'sentenca 37',
       'sentenca 38', 'sentenca 39', 'sentenca 4', 'sentenca 40',
       'sentenca 41', 'sentenca 42', 'sentenca 43', 'sentenca 44',
       'sentenca 45', 'sentenca 46', 'sentenca 47', 'sentenca 48',
       'sentenca 49', 'sentenca 5', 'sentenca 50', 'sentenca 51',
       'sentenca 52', 'sentenca 53', 'sentenca 54', 'sentenca 55',
       'sentenca 56', 'sentenca 57', 'sentenca 58', 'sentenca 59',
       'sentenca 6', 'sentenca 60', 'sentenca 61', 'sentenca 62',


In [13]:
# Criação da tabela de acurácia
acc_table = pd.DataFrame(np.zeros((1, 10)), columns=['top_1','top_2','top_3','top_4','top_5','top_6','top_7','top_8','top_9','top_10'])

for original_idx in original_idx_list:
  new_df = all_top_10.loc[all_top_10['original_idx'] == original_idx]
  original_category = new_df['original_category'][0][0]

  if original_category in new_df['category'][0]:
    acc_table['top_1'] += 1
  if original_category in new_df['category'][1]:
    acc_table['top_2'] += 1
  if original_category in new_df['category'][2]:
    acc_table['top_3'] += 1
  if original_category in new_df['category'][3]:
    acc_table['top_4'] += 1
  if original_category in new_df['category'][4]:
    acc_table['top_5'] += 1
  if original_category in new_df['category'][5]:
    acc_table['top_6'] += 1
  if original_category in new_df['category'][6]:
    acc_table['top_7'] += 1
  if original_category in new_df['category'][7]:
    acc_table['top_8'] += 1
  if original_category in new_df['category'][8]:
    acc_table['top_9'] += 1
  if original_category in new_df['category'][9]:
    acc_table['top_10'] += 1

acc_table

Unnamed: 0,top_1,top_2,top_3,top_4,top_5,top_6,top_7,top_8,top_9,top_10
0,71.0,60.0,50.0,48.0,45.0,42.0,45.0,42.0,33.0,35.0
