# 2nd Exercise

Prepared by: **Hardian Lawi**

In [0]:
import re
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.linear_model import LogisticRegression

# Implement Tf-Idf manually

In this section, we will implement the Tf-Idf method manually to better understand the process. Also, instead of using a huge public dataset, we will start with a small corpus to check our implementation.

In [0]:
# Toy corpus of four short sentences, used to check the manual implementation.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

## Tokenize corpus

In [3]:
# A token is any maximal run of word characters, so punctuation is dropped.
rx = re.compile(r'\w+')

# Lowercase every document first so that tokens are case-insensitive.
tokenized_corpus = [rx.findall(doc) for doc in map(str.lower, corpus)]
tokenized_corpus

[['this', 'is', 'the', 'first', 'document'],
 ['this', 'document', 'is', 'the', 'second', 'document'],
 ['and', 'this', 'is', 'the', 'third', 'one'],
 ['is', 'this', 'the', 'first', 'document']]

Store some useful variables for later use

In [4]:
# Vocabulary = every distinct token seen anywhere in the corpus.
vocabs = set().union(*tokenized_corpus)
vocab_size = len(vocabs)
no_docs = len(tokenized_corpus)

# Give each token a unique integer id, assigned in sorted token order.
token2id = {token: idx for idx, token in enumerate(sorted(vocabs))}
token2id

{'and': 0,
 'document': 1,
 'first': 2,
 'is': 3,
 'one': 4,
 'second': 5,
 'the': 6,
 'third': 7,
 'this': 8}

## Term-frequency

$\text{tf}(t, d)$ is the number of times a term $t$ occurs in a given document $d$.

In [5]:
def generate_tf(tokenized_corpus, token2id):
  """Build the raw term-frequency matrix of a tokenized corpus.

  Args:
    tokenized_corpus: sized sequence of documents, each a sequence of tokens.
    token2id: mapping from token to the column index it occupies.

  Returns:
    Integer array of shape (len(tokenized_corpus), len(token2id)) whose
    entry [i, j] is the number of times the token with id j occurs in
    document i. Tokens that are not keys of `token2id` are ignored, so the
    function can also encode documents containing unseen words.
  """
  tf = np.zeros((len(tokenized_corpus), len(token2id)), dtype=int)
  for row, tokens in enumerate(tokenized_corpus):
    for token, count in Counter(tokens).items():
      col = token2id.get(token)
      if col is None:
        # Out-of-vocabulary token: there is no column to record it in.
        continue
      tf[row, col] = count
  return tf
    
# Term-frequency matrix of the toy corpus: one row per document, one column per token id.
tf = generate_tf(tokenized_corpus, token2id)
tf

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

## Inverse document frequency

$$\text{idf}(t) = \log{\frac{n}{\text{df}(t)}} + 1$$

where $n$ is the total number of documents in the document set, and $\text{df}(t)$ is the number of documents in the document set that contain term $t$.

In [6]:
# Document frequency: for every token, the number of documents that contain it.
df = np.zeros((1, vocab_size), dtype=int)
for doc_tokens in tokenized_corpus:
  # De-duplicate so that a token counts at most once per document.
  for token in set(doc_tokens):
    df[0, token2id[token]] += 1

df

array([[1, 3, 2, 4, 1, 1, 4, 1, 4]])

In [7]:
# idf(t) = log(n / df(t)) + 1 (natural logarithm), computed for the whole
# vocabulary in one vectorized expression. `df` has shape (1, vocab_size),
# so `idf` keeps that shape and broadcasts against the rows of `tf` later.
idf = np.log(no_docs / df) + 1

idf

array([[2.38629436, 1.28768207, 1.69314718, 1.        , 2.38629436,
        2.38629436, 1.        , 2.38629436, 1.        ]])

## Tf-Idf representation of corpus

$$\text{tf-idf}(t, d) = \text{tf}(t, d) \times \text{idf}(t)$$

In [8]:
# Broadcasting: the (1, vocab_size) idf row is multiplied into every document row of tf.
tfidf = tf * idf
tfidf

array([[0.        , 1.28768207, 1.69314718, 1.        , 0.        ,
        0.        , 1.        , 0.        , 1.        ],
       [0.        , 2.57536414, 0.        , 1.        , 0.        ,
        2.38629436, 1.        , 0.        , 1.        ],
       [2.38629436, 0.        , 0.        , 1.        , 2.38629436,
        0.        , 1.        , 2.38629436, 1.        ],
       [0.        , 1.28768207, 1.69314718, 1.        , 0.        ,
        0.        , 1.        , 0.        , 1.        ]])

## Final Touch

To keep the numbers from exploding when dealing with a huge corpus, we can scale each resulting vector by its Euclidean norm:

$$v_{norm} =  \frac{v}{||v||_2} = \frac{v}{\sqrt{v_1^2 + v_2^2 + \cdots + v_d^2}}$$

where $v \in \mathbb{R}^d$ for some $d$ and $v_i$ is the $i$-th element of $v$.

In [9]:
# L2-normalize row by row: each document vector is divided by its own Euclidean norm.
# keepdims=True keeps the norms as a column so the division broadcasts per row.
row_norms = np.sqrt(np.sum(tfidf ** 2, axis=1, keepdims=True))
tfidf = tfidf / row_norms
tfidf

array([[0.        , 0.46941728, 0.61722732, 0.3645444 , 0.        ,
        0.        , 0.3645444 , 0.        , 0.3645444 ],
       [0.        , 0.65782665, 0.        , 0.25543054, 0.        ,
        0.60953246, 0.25543054, 0.        , 0.25543054],
       [0.53248519, 0.        , 0.        , 0.22314313, 0.53248519,
        0.        , 0.22314313, 0.53248519, 0.22314313],
       [0.        , 0.46941728, 0.61722732, 0.3645444 , 0.        ,
        0.        , 0.3645444 , 0.        , 0.3645444 ]])


## Verify with scikit-learn

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# smooth_idf=False turns off scikit-learn's idf smoothing, so its idf is expected
# to be the plain log(n / df) + 1 used in the manual implementation above.
vectorizer = TfidfVectorizer(smooth_idf=False)
# Convert the fitted result to a dense array so it can be compared with `tfidf`.
sci_tfidf = vectorizer.fit_transform(corpus).toarray()
sci_tfidf

array([[0.        , 0.46941728, 0.61722732, 0.3645444 , 0.        ,
        0.        , 0.3645444 , 0.        , 0.3645444 ],
       [0.        , 0.65782665, 0.        , 0.25543054, 0.        ,
        0.60953246, 0.25543054, 0.        , 0.25543054],
       [0.53248519, 0.        , 0.        , 0.22314313, 0.53248519,
        0.        , 0.22314313, 0.53248519, 0.22314313],
       [0.        , 0.46941728, 0.61722732, 0.3645444 , 0.        ,
        0.        , 0.3645444 , 0.        , 0.3645444 ]])

See how easy it is to apply Tf-idf using `scikit-learn`. In practice, it is generally recommended to use the library implementation because it is more efficient and convenient. The check below confirms that it agrees with our manual result.

In [0]:
# Check that the manual Tf-idf matrix agrees with scikit-learn's.
# assert_allclose reports which entries differ on failure and is not stripped
# under `python -O`; the tolerances are the defaults of np.isclose.
np.testing.assert_allclose(tfidf, sci_tfidf, rtol=1e-5, atol=1e-8, equal_nan=False)

# Apply Tf-idf on Public Dataset

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
%%bash
# Fetch and unpack the Yelp review polarity dataset.
# -e/-u/pipefail: stop at the first failing command instead of continuing
# (e.g. running tar on a failed or partial download).
set -euo pipefail

# Skip the download when both CSV files are already extracted, so re-running
# the notebook does not fetch the archive again.
if [ ! -f yelp_review_polarity_csv/train.csv ] || [ ! -f yelp_review_polarity_csv/test.csv ]; then
  wget -qO yelp_review_polarity_csv.tgz https://s3.amazonaws.com/fast-ai-nlp/yelp_review_polarity_csv.tgz
  tar -xvzf yelp_review_polarity_csv.tgz
fi
ls

yelp_review_polarity_csv/
yelp_review_polarity_csv/train.csv
yelp_review_polarity_csv/readme.txt
yelp_review_polarity_csv/test.csv
sample_data
yelp_review_polarity_csv
yelp_review_polarity_csv.tgz


In [0]:
# Column names are passed explicitly, so pandas reads every line of the files as data.
COLUMNS = ["label", "text"]

# Sample: keep only the first 50,000 training rows and the first 10,000 test rows.
train = pd.read_csv('yelp_review_polarity_csv/train.csv', names=COLUMNS).head(50000)
test = pd.read_csv('yelp_review_polarity_csv/test.csv', names=COLUMNS).head(10000)

In [15]:
# Count features: the vocabulary is fitted on the training text only and then
# reused to encode the test text.
vectorizer = CountVectorizer()
count_X_train = vectorizer.fit_transform(train.text)
count_X_test = vectorizer.transform(test.text)

# Targets are the label column shifted down by one.
y_train = train.label - 1
y_test = test.label - 1

print('train size:', count_X_train.shape)
print('test size:', count_X_test.shape)

model = LogisticRegression().fit(count_X_train, y_train)

# Accuracy = fraction of predictions that equal the true label.
train_acc = (model.predict(count_X_train) == y_train).mean()
test_acc = (model.predict(count_X_test) == y_test).mean()

print('\nCount Representation performance')
print('Train acc:', train_acc)
print('Test acc:', test_acc)

train size: (50000, 60652)
test size: (10000, 60652)





Count Representation performance
Train acc: 0.98464
Test acc: 0.9205


In [0]:
# Drop the count feature matrices (presumably to free memory before the Tf-idf features are built).
del count_X_train, count_X_test

In [17]:
# Tf-idf features: vocabulary and idf weights are fitted on the training text
# only and then reused to encode the test text.
vectorizer = TfidfVectorizer()
tfidf_X_train = vectorizer.fit_transform(train.text)
tfidf_X_test = vectorizer.transform(test.text)

# Targets are the label column shifted down by one.
y_train = train.label - 1
y_test = test.label - 1

print('train size:', tfidf_X_train.shape)
print('test size:', tfidf_X_test.shape)

model = LogisticRegression().fit(tfidf_X_train, y_train)

# Accuracy = fraction of predictions that equal the true label.
train_acc = (model.predict(tfidf_X_train) == y_train).mean()
test_acc = (model.predict(tfidf_X_test) == y_test).mean()

print('\nTF Idf Representation performance')
print('Train acc:', train_acc)
print('Test acc:', test_acc)

train size: (50000, 60652)
test size: (10000, 60652)





TF Idf Representation performance
Train acc: 0.94094
Test acc: 0.9202


# Bonus

In [18]:
# Invert the vectorizer's token -> column-index mapping so columns can be named.
id2token = {idx: token for token, idx in vectorizer.vocabulary_.items()}

# argsort is ascending, so the last ten positions hold the largest coefficients
# (printed from smallest to largest of those ten).
top_feature_ids = np.argsort(model.coef_[0])[-10:]
for feature_id in top_feature_ids:
  print(id2token[feature_id])

good
perfect
fantastic
excellent
awesome
best
love
amazing
delicious
great


As explained, the model here performs better than word embeddings because the feature representation is well suited to the task (sentiment prediction). More concretely, positive reviews usually contain positive words such as those listed above. Since TF and TF-idf both use each unique token as a feature, our model can easily learn this relationship and assign high positive weights to the positive words.