The **TF-IDF** scheme is a type of bag words approach where instead of adding zeros and ones in the embedding vector, you add floating numbers that contain more useful information compared to zeros and ones. 

The idea behind TF-IDF scheme is the fact that words having a high frequency of occurrence in one document, and less frequency of occurrence in all the other documents, are more crucial for classification.

TF-IDF is a product of two values: Term Frequency (TF) and Inverse Document Frequency (IDF).

**Term frequency** refers to the number of times a word appears in the document and can be calculated as:

##### Term frequence = (Number of Occurences of a word)/(Total words in the document)</i>

**IDF** refers to the log of the total number of documents divided by the number of documents in which the word exists, and can be calculated as:

##### IDF(word) = Log((Total number of documents)/(Number of documents containing the word))

**Disadvantages:**
Though TF-IDF is an improvement over the simple bag of words approach and yields better results for common NLP tasks, the overall pros and cons remain the same. We still need to create a huge sparse matrix, which also takes a lot more computation than the simple bag of words approach.

In [1]:
import string
import random
import numpy as np
import scipy as sp
import urllib.request
import nltk
import re
import heapq
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore")

def get_article_text(url):
    # Scrape article using bs4 to extract all paragraphs from the online article.
    raw_html = urllib.request.urlopen(url)
    raw_html = raw_html.read()

    article_html = BeautifulSoup(raw_html, 'lxml')
    article_paragraphs = article_html.find_all('p')

    # Creating a document 'article_text' containing all the sentences in the article.
    article_text = ''
    for para in article_paragraphs:
        article_text += para.text
    return article_text

def remove_stopwords(sentence):
    filtered_sentence = []
    stop_words = nltk.corpus.stopwords.words('english')
    word_tokens = nltk.word_tokenize(sentence)
    for token in word_tokens:
        if token not in stop_words:
            filtered_sentence.append(token)
    filtered_sentence = ' '.join(filtered_sentence)
    return filtered_sentence

def lemmatize(sentence):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmatized_sentence = []
    word_tokens = nltk.word_tokenize(sentence)
    for token in word_tokens:
        lemm_token = lemmatizer.lemmatize(token)
        lemmatized_sentence.append(lemm_token)
    lemmatized_sentence = ' '.join(lemmatized_sentence)
    return lemmatized_sentence

def clean_sentence(sentence):
    sentence = sentence.lower()
    sentence = remove_stopwords(sentence)
    sentence = re.sub(r'\W', ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    sentence = lemmatize(sentence)
    return sentence

def clean_article_text(article_text):
    # Creating a corpus containing all the sentence tokens in the document.
    corpus = nltk.sent_tokenize(article_text)
    # Convert to lowercase, remove non-word characters (punctuations, etc.) and strip whitespaces
    for i in range(len(corpus)):
        corpus[i] = clean_sentence(corpus[i])
    return corpus

def get_most_freq_tokens(corpus):
    # Create dictionary with word frequency
    word_freq = defaultdict(int)
    for sentence in corpus:
        word_tokens = nltk.word_tokenize(sentence)
        for token in word_tokens:
            word_freq[token] += 1
    most_freq_tokens = heapq.nlargest(200, word_freq, key=word_freq.get)
    return most_freq_tokens
  

# IDF = log((Total number of sentences (documents)) 
# divided by
# (Number of sentences (documents) containing the word))
def compute_idf_values(corpus, most_freq_tokens):
    word_idf_values = {}
    for token in most_freq_tokens:
        sentences_with_word = 0
        for sentence in corpus:
            if token in nltk.word_tokenize(sentence):
                sentences_with_word += 1
        word_idf_values[token] = np.log(len(corpus)/(sentences_with_word + 1))
    return word_idf_values

# TF = (Frequency of the word in the sentence) / (Total number of words in the sentence)
# Unlike IDF values, TF values of a word are different for each sentence in the corpus.
# IDF values of a word are the same for each sentence.
def compute_tf_values(corpus, most_freq_tokens):
    word_tf_values = {}
    for token in most_freq_tokens:
        sent_tf_vector = []
        for sentence in corpus:
            wordfreq_in_sent = 0
            for word_token in nltk.word_tokenize(sentence):
                if word_token == token:
                    wordfreq_in_sent += 1
            word_tf = wordfreq_in_sent/len(nltk.word_tokenize(sentence))
            sent_tf_vector.append(word_tf)

        # Storing the tf values of a word for each sentence in the corpus.
        word_tf_values[token] = sent_tf_vector
    return word_tf_values

def compute_tfidf_model(corpus, most_freq_tokens):
    word_tf_values = compute_tf_values(corpus, most_freq_tokens)
    word_idf_values = compute_idf_values(corpus, most_freq_tokens)
    tfidf_values = []
    
    for token in word_tf_values.keys():
        tfidf_for_word_sentscores = []
        for sent_tf_vector in word_tf_values[token]:
            tfidf_scores = sent_tf_vector * word_idf_values[token]
            tfidf_for_word_sentscores.append(tfidf_scores)
        tfidf_values.append(tfidf_for_word_sentscores)

    tfidf_model = np.asarray(tfidf_values)
    tfidf_model = np.transpose(tfidf_model)
    return tfidf_model

In [2]:
def get_tuned_model(url):
    article_text = get_article_text(url)
    initial_corpus = nltk.sent_tokenize(article_text)
    corpus = clean_article_text(article_text)
    most_freq_tokens = get_most_freq_tokens(corpus)

    tuned_tfidf_model = compute_tfidf_model(corpus, most_freq_tokens)
    return tuned_tfidf_model

In [3]:
def get_answer(question, url, tuned_tfidf_model):
    article_text = get_article_text(url)
    initial_corpus = nltk.sent_tokenize(article_text)
    corpus = clean_article_text(article_text)
    most_freq_tokens = get_most_freq_tokens(corpus)
    
    cleaned_question = clean_sentence(question)
    question_vector = compute_tfidf_model([cleaned_question], most_freq_tokens)
    
    similarity_scores = []
    sent_vec_index = 0
    for sent_vec in tuned_tfidf_model:
        similarity = 1 - sp.spatial.distance.cosine(question_vector, sent_vec)
        similarity_scores.append((sent_vec_index, similarity))
        sent_vec_index += 1
    similarity_scores.sort(key = lambda x: x[1], reverse=True)
    answer_index = similarity_scores[0][0]
    answer = initial_corpus[answer_index]
    return answer

In [4]:
url = 'https://en.wikipedia.org/wiki/India'
tuned_tfidf_model = get_tuned_model(url)

In [5]:
question = 'What did ancient greeks refer to Indians as?'
answer = get_answer(question, url, tuned_tfidf_model)
print(answer)



India (Hindi: Bhārat), officially the Republic of India (Hindi: Bhārat Gaṇarājya),[20] is a country in South Asia.
