## Importing libraries

In [1]:
import numpy as np
import pandas as pd

import nltk
from nltk import word_tokenize

In [2]:
# Fetch the Punkt tokenizer data (presumably what word_tokenize relies on —
# confirm against the installed NLTK version). The call reports "already
# up-to-date" and returns True when the package is present locally.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nirajan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Importing dataset

In [3]:
# Load the BBC text-classification CSV; the path is relative to the
# directory the notebook is run from.
df = pd.read_csv('datasets/bbc_text_cls.csv')

In [4]:
# Preview the first rows: each row holds one article in `text` and its
# category in `labels`.
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [5]:
# (number of documents, number of columns)
df.shape

(2225, 2)

## TF (Count Vectorizer)

In [6]:
# Build the vocabulary and encode every document as a list of integer ids.
# A token receives the next free id the first time it is seen, so ids follow
# first-occurrence order across the corpus.
word2idx = {}
tokenized_docs = []
for text in df['text']:
    # len(word2idx) is evaluated before the lookup, so an unseen token is
    # stored under the current vocabulary size (the next unused id), while a
    # known token simply returns the id it already has.
    doc_as_int = [
        word2idx.setdefault(token, len(word2idx))
        for token in word_tokenize(text.lower())
    ]
    tokenized_docs.append(doc_as_int)

# Next unused id, i.e. the number of distinct tokens seen.
current_count = len(word2idx)

In [7]:
# Inspect the token -> id mapping (ids were handed out in first-seen order).
# NOTE(review): this renders the entire dictionary; consider showing only a
# small slice to keep the notebook output short.
word2idx

{'ad': 0,
 'sales': 1,
 'boost': 2,
 'time': 3,
 'warner': 4,
 'profit': 5,
 'quarterly': 6,
 'profits': 7,
 'at': 8,
 'us': 9,
 'media': 10,
 'giant': 11,
 'timewarner': 12,
 'jumped': 13,
 '76': 14,
 '%': 15,
 'to': 16,
 '$': 17,
 '1.13bn': 18,
 '(': 19,
 '£600m': 20,
 ')': 21,
 'for': 22,
 'the': 23,
 'three': 24,
 'months': 25,
 'december': 26,
 ',': 27,
 'from': 28,
 '639m': 29,
 'year-earlier': 30,
 '.': 31,
 'firm': 32,
 'which': 33,
 'is': 34,
 'now': 35,
 'one': 36,
 'of': 37,
 'biggest': 38,
 'investors': 39,
 'in': 40,
 'google': 41,
 'benefited': 42,
 'high-speed': 43,
 'internet': 44,
 'connections': 45,
 'and': 46,
 'higher': 47,
 'advert': 48,
 'said': 49,
 'fourth': 50,
 'quarter': 51,
 'rose': 52,
 '2': 53,
 '11.1bn': 54,
 '10.9bn': 55,
 'its': 56,
 'were': 57,
 'buoyed': 58,
 'by': 59,
 'one-off': 60,
 'gains': 61,
 'offset': 62,
 'a': 63,
 'dip': 64,
 'bros': 65,
 'less': 66,
 'users': 67,
 'aol': 68,
 'on': 69,
 'friday': 70,
 'that': 71,
 'it': 72,
 'owns': 73,
 '8

In [8]:
# Reverse lookup (id -> token). Iterating a dict yields its keys in insertion
# order, and tokens were inserted in increasing id order, so position k in
# this list is the token whose id is k.
idx2word = list(word2idx)

In [9]:
# Spot-check the forward mapping: token -> id.
word2idx['filed']

487

In [10]:
# Spot-check the reverse mapping: the id looked up for 'filed' must map back
# to the same token. The id is derived from word2idx rather than hard-coded,
# so the check stays valid if the tokenizer or the dataset changes.
idx2word[word2idx['filed']]

'filed'

In [11]:
# Number of documents in the corpus (one per DataFrame row).
N = len(df)
N

2225

In [12]:
# Vocabulary size: the number of distinct tokens collected in word2idx.
V = len(word2idx)
V

34762

In [13]:
# Dense term-frequency matrix: tf[d, t] is how many times token id t occurs
# in document d.
tf = np.zeros((N, V))
for doc_idx, token_ids in enumerate(tokenized_docs):
    # Count each distinct id once per document instead of incrementing per
    # token; the explicit integer dtype keeps an empty document valid as an
    # index array.
    ids, counts = np.unique(np.asarray(token_ids, dtype=np.intp), return_counts=True)
    tf[doc_idx, ids] = counts

In [14]:
# Inspect the count matrix (rows: documents, columns: token ids).
tf

array([[1., 4., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 1., 1.]])

## IDF

In [15]:
# Document frequency: for every token id, the number of documents in which
# it appears at least once.
doc_freq = (tf > 0).sum(axis=0)
doc_freq

array([ 12, 204, 127, ...,   1,   1,   1])

In [16]:
# Inverse document frequency, unsmoothed: idf[t] = ln(N / doc_freq[t]).
# Every vocabulary entry was collected from these same documents, so
# doc_freq is at least 1 for each id and the division is safe.
idf = np.log(N/doc_freq)
idf

array([5.22260554, 2.3893922 , 2.86332511, ..., 7.70751219, 7.70751219,
       7.70751219])

## TF-IDF

In [17]:
# Weight the raw counts by IDF; idf has one entry per token id and is
# broadcast across the document rows of tf.
tf_idf = np.multiply(tf, idf)

In [18]:
# Inspect the TF-IDF matrix (same layout as tf: documents x token ids).
tf_idf

array([[5.22260554, 9.5575688 , 2.86332511, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 2.86332511, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 7.70751219, 7.70751219,
        7.70751219]])

In [19]:
# Pick one document at random and show its label, its first line, and the
# five terms with the highest TF-IDF weight. No seed is set, so a different
# document is chosen on each run.
i = np.random.choice(N)
row = df.iloc[i]
first_line = row['text'].partition("\n")[0]
print("Label:", row['labels'])
print("Text:", first_line)
print("Top 5 terms:")

# Negating the scores turns the ascending argsort into a descending ranking.
scores = tf_idf[i]
indices = np.argsort(-scores)

for term_id in indices[:5]:
    print(idx2word[term_id])

Label: tech
Text: Junk e-mails on relentless rise
Top 5 terms:
email
spam
systems
traffic
denial


### Exercise: use CountVectorizer to form the counts instead

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
# scikit-learn vectorizer with its default settings (its tokenization differs
# from word_tokenize, as the vocabulary shown further down illustrates).
count_vec = CountVectorizer()

In [22]:
# Learn the vocabulary from the raw texts and build the document-term count
# matrix in one step.
X = count_vec.fit_transform(df['text'])

In [23]:
# Convert to a dense NumPy array for display.
# NOTE(review): this materialises the full documents x vocabulary array just
# to print it — confirm the memory cost is acceptable for this corpus.
X.toarray()

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=int64)

In [24]:
# Array of vocabulary terms as returned by the fitted vectorizer (the
# CountVectorizer counterpart of idx2word).
feature_name = count_vec.get_feature_names_out()
feature_name

array(['00', '000', '0001', ..., 'zutons', 'zvonareva', 'zvyagintsev'],
      dtype=object)

In [25]:
# CountVectorizer's counterpart of word2idx: a dict mapping each term to its
# integer feature index. Unlike word2idx, the ids shown here are not in
# first-seen order.
count_vec.vocabulary_

{'ad': 1750,
 'sales': 23062,
 'boost': 4349,
 'time': 26665,
 'warner': 28502,
 'profit': 20825,
 'quarterly': 21204,
 'profits': 20832,
 'at': 2994,
 'us': 27901,
 'media': 17064,
 'giant': 11826,
 'timewarner': 26677,
 'jumped': 14933,
 '76': 1249,
 'to': 26730,
 '13bn': 184,
 '600m': 1105,
 'for': 11102,
 'the': 26462,
 'three': 26566,
 'months': 17692,
 'december': 7736,
 'from': 11377,
 '639m': 1136,
 'year': 29256,
 'earlier': 9183,
 'firm': 10842,
 'which': 28749,
 'is': 14510,
 'now': 18557,
 'one': 18837,
 'of': 18726,
 'biggest': 3963,
 'investors': 14406,
 'in': 13801,
 'google': 12066,
 'benefited': 3816,
 'high': 13002,
 'speed': 24836,
 'internet': 14299,
 'connections': 6653,
 'and': 2429,
 'higher': 13005,
 'advert': 1885,
 'said': 23041,
 'fourth': 11224,
 'quarter': 21203,
 'rose': 22793,
 '11': 106,
 '1bn': 461,
 '10': 68,
 '9bn': 1466,
 'its': 14571,
 'were': 28694,
 'buoyed': 4868,
 'by': 4980,
 'off': 18730,
 'gains': 11534,
 'offset': 18758,
 'dip': 8384,
 'bros

### Exercise (hard): use SciPy's `csr_matrix` instead
### Note: you cannot use `X[i, j] += 1` here

In [26]:
from scipy.sparse import csr_matrix

In [27]:
# Build the term-frequency matrix from (row, col, value) triplets: one
# triplet with value 1 per token occurrence.
rows = [doc_idx for doc_idx, token_ids in enumerate(tokenized_docs) for _ in token_ids]
cols = [token_id for token_ids in tokenized_docs for token_id in token_ids]
data = [1] * len(cols)

# Repeated (row, col) pairs are summed by the constructor, which turns the
# per-occurrence ones into per-document counts.
sparse_counts = csr_matrix((data, (rows, cols)), shape=(N, V))

# Densified so the result can be compared directly with the `tf` array
# built earlier.
tf_csr = sparse_counts.toarray()

In [28]:
# Inspect the counts built via csr_matrix (already converted to a dense
# array in the previous cell).
tf_csr

array([[1, 4, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 1, 1]], dtype=int32)