#Import Libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize relies on NLTK's Punkt sentence-tokenizer data.
# Older NLTK versions load it from the 'punkt' resource; recent releases
# (3.9 and later) look for 'punkt_tab' instead and raise LookupError when
# only 'punkt' is installed, so both are requested here.
nltk.download('punkt') #Tokenizer
nltk.download('punkt_tab') #Tokenizer tables used by newer NLTK

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# https://www.kaggle.com/shivamkushwaha/bbc-full-text-document-classification
# Fetch the CSV used below (presumably an export of the Kaggle dataset linked
# above -- TODO confirm). -nc (no-clobber) skips the download when the file is
# already present, so this cell can be re-run safely.
!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv


File ‘bbc_text_cls.csv’ already there; not retrieving.



In [3]:
# Read the downloaded corpus into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='bbc_text_cls.csv')
data.head(n=5)

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [4]:
# Build the vocabulary and integer-encode every document in a single pass.
word2idx = {}        # token -> integer id, assigned in order of first appearance
tokenized_docs = []  # one list of token ids per document, in corpus order
idx = 0              # next unused token id
for doc in data['text']:
  doc_as_ints = []
  # lowercase before tokenizing so that case variants share one id
  for token in word_tokenize(doc.lower()):
    token_id = word2idx.get(token)
    if token_id is None:
      # first time this token is seen: give it the next free id
      token_id = word2idx[token] = idx
      idx += 1
    doc_as_ints.append(token_id)
  tokenized_docs.append(doc_as_ints)



In [5]:
# Reverse lookup (integer id -> token), the inverse of word2idx.
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

In [6]:
# Corpus dimensions: N = number of documents, V = number of distinct terms.
N, V = len(data['text']), len(word2idx)

In [7]:
# Term-frequency matrix: tf[d, t] is the number of times term t occurs in document d.
tf = np.zeros(shape=(N, V))
for i, doc in enumerate(tokenized_docs):
  # np.add.at is unbuffered, so a term id that repeats within the document
  # is counted once per occurrence (plain fancy-index += would count it once).
  np.add.at(tf[i], np.asarray(doc, dtype=np.intp), 1)


In [10]:
# Inverse document frequency: log(N / df), where df[t] is the number of
# documents in which term t occurs at least once.
doc_freqs = (tf > 0).sum(axis=0)
idf = np.log(N / doc_freqs)

In [11]:
# TF-IDF: scale each term-count column by that term's idf weight
# (idf has one entry per term and broadcasts across the document rows).
tf_idf = np.multiply(tf, idf)

In [12]:
# Seed NumPy's global random number generator.
np.random.seed(seed=123)

In [16]:
#pick a random document, show top 5 terms according to the tf-idf score
# NOTE: the draw comes from NumPy's global RNG, so every run of this cell
# selects a new document; the selection only repeats when the seed is reset
# immediately before this cell runs.
i = np.random.choice(N)
row = data.iloc[i]

print("Label: ", row['labels'])
# only the first line of the article text is shown
print("Text: \n", row['text'].split('\n')[0])
print("Top 5 terms:")
scores = tf_idf[i]
# negate so that argsort's ascending order puts the highest scores first
indices = (-scores).argsort()[:5]
# The loop variable is deliberately not called `idx`: that module-level name is
# the vocabulary's next-free-id counter and must not be overwritten here.
for top_term in indices:
  print(idx2word[top_term], scores[top_term])

Label:  tech
Text: 
 IBM puts cash behind Linux push
Top 5 terms:
linux 42.47693537441576
ibm 32.00412585069834
workplace 17.747258176116855
software 16.813424496971464
programs 15.345244734769798
