# Find Keywords and Classify Website with TF-IDF and GENSIM
Input:
```
    URL
```
Output:
```
    Main output: Sport-site or Non-sport site. 
    Sub-output: vocabulary list.
```
File used:
```
link.txt
```

In [1]:
from bs4 import BeautifulSoup # work with html
import requests 

import re #regular expression

import string #to remove punctuation
from nltk.corpus import stopwords #get stopwords
from nltk.stem import WordNetLemmatizer #lemmatise words

#tf-idf libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

The 3 following functions are to trim the scraped texts. 

In [2]:
def tag_to_string(url):
    '''
        Return the Tag-type data found to String-type. 
    '''
    page = requests.get(url)
    parsing = BeautifulSoup(page.text, "html.parser")
    texts_tag = parsing.findAll('title') + parsing.findAll('p')
    texts = str()
    for text in texts_tag:
        texts += str(text) + ' '
    return texts

def tag_removal(texts):
    '''
        Find all the tag of <.*>
    '''
    pattern = r"<.*?>"
    findings = re.findall(pattern, texts)
    return findings

def get_texts(texts):
    '''
        Replace all the tag of <.*> to get texts
    '''
    findings = tag_removal(texts)
    for finding in findings:
        texts = texts.replace(finding, "")
    return texts       

This function is to preprocess the data

In [3]:
import string #to remove punctuation
from nltk.corpus import stopwords #get stopwords
from nltk.stem import WordNetLemmatizer #lemmatise words

def preprocess(text_data):
    '''
        Remove unwanted and preprocess data
    '''
    #lower text
    text_data = text_data.lower()
    
    #remove punctuation
    text_data = text_data.translate(str.maketrans('', '', string.punctuation))
    
    #remove stopwords
    stop_words = stopwords.words('english')
    for stopword in stop_words:
        pattern = ' ' + stopword + ' '
        try:
            text_data = text_data.replace(pattern, " ")
        except:
            None
    
    
    #remove numbers
    text_data = re.sub(r"[0-9]", "", text_data)
    
    #remove big whitespace
    text_data = re.sub(r"\s{2,}", " ", text_data)
    
    #remove non-latin character
    text_data = re.sub(r"[^a-zA-Z\s]", "", text_data)
    
    #lemmatize
    text_data = text_data.split(' ')
    lmt = WordNetLemmatizer() 
    text_data = list(map(lmt.lemmatize, text_data))
    
    
    return ' '.join(text_data)

In [4]:
def corpus(urls):
    '''
        Create a corpus for TF-IDF
    '''
    
    docs = list()
    
    for url in urls:
        texts = tag_to_string(url)
        findings = tag_removal(texts)
        texts = get_texts(texts)
        docs.append(preprocess(texts))
    
    return docs

Open links.txt file that contains links from BBC Sports

In [5]:
with open("links.txt", "r") as f:
    links = f.readlines()

docs = corpus(links)

Create a vocabulary list by ignoring the one that appears in 80% of the text (`max_df=0.8`)

In [6]:
#remove common words
cv = CountVectorizer(max_df=0.8)

#generate a td-matrix
word_count_vector = cv.fit_transform(docs)

#get feature names
feature_names = cv.get_feature_names()

Compute the TF-IDF values

In [7]:
tfidf_trans = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_trans.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

Function to sort the the matrix and extract the number of keywords

In [8]:
def sort_coo(coo_matrix):
    '''
        Sorting the data from coo matrix by descending order
    '''
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def topn_from_vector(feature_names, sorted_items, topn=15):
    '''
        Extract the top n keywords
    '''
    sorted_items = sorted_items[:topn]
    
    scores, features = list(), list()
    
    for i, score in sorted_items:
        
        scores.append(round(score, 3))
        features.append(feature_names[i])
        
    results = dict()
    for i in range(len(features)):
        results[features[i]] = scores[i]
        
    return features, scores

Due to small sample size, let us compute the TF-IDF values and extract the keywords from each of the document from the corpus.

This will be our set of keywords.

In [9]:
terms = set()
for doc in docs:
    #tf-idf for the given document
    tfidf_vector = tfidf_trans.transform(cv.transform([doc]))
    
    #sorting by descending order
    sorted_items = sort_coo(tfidf_vector.tocoo())
    
    #extract the keyword and its score
    kw, scores = topn_from_vector(feature_names, sorted_items, 15)
    
    #generate a set of vocabularies
    for k in kw:
        terms.add(k)

Testing with another website that is not in the links.txt

In [10]:
url2 = "https://www.bbc.com/sport/football/53788177"
texts = tag_to_string(url2)
findings = tag_removal(texts)
texts = get_texts(texts)
texts = preprocess(texts)

Generate keywords usign TF-IDF for the testing input

In [11]:
tfidf_vector = tfidf_trans.transform(cv.transform([texts]))
sorted_items = sort_coo(tfidf_vector.tocoo())
kw, scores = topn_from_vector(feature_names, sorted_items, 15)

# see the matching keywords
for k in kw:
    if k in terms:
        print(k)

barcelona
bayern
world
football


Generate keywords using Gensim

In [12]:
# gensim library to find out the keywords of a respective doc
from gensim.summarization import keywords    



In [13]:
rr = keywords(texts, words = 15, scores = True, lemmatize = True)

# check the matching keywords
for r in rr:
    if r[0] in terms:
        print(r)

('barcelona', 0.17417648493105373)
('world', 0.1564135563343885)
('game', 0.1542339617996264)
('football', 0.13645411832181842)


## Classification based on the dictionary

Due to small corpus, as long as >= 3 keywords from an article of choice match with the set of vocabularies, it can be induced that the website belongs to BBC Sport.

Using TF-IDF technique:

In [14]:
count = int()
for k in kw:
    if k in terms:
        count += 1
if count >= 3:
    print('This is a Sport page.')

This is a Sport page.
