Importing Libraries

In [1]:
import pandas as pd
import spacy
import nltk
import math
import numpy as np

nlp = spacy.load("en_core_web_sm")


In [2]:
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KawaiiAtomic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from collections import defaultdict


Importing stemmer for easy implementation of stemmation

In [4]:
from nltk.stem import PorterStemmer

nlp = spacy.load("en_core_web_sm")
stemmer = PorterStemmer()

Importing Dataset stored in local directory

In [5]:
df = pd.read_csv("Articles.csv")
df

Unnamed: 0,Article,Date,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,business
...,...,...,...,...
2687,strong>DUBAI: Dubai International Airport and ...,3/25/2017,Laptop ban hits Dubai for 11m weekend traveller,business
2688,"strong>BEIJING: Former Prime Minister, Shaukat...",3/26/2017,Pak China relations not against any third coun...,business
2689,strong>WASHINGTON: Uber has grounded its fleet...,3/26/2017,Uber grounds self driving cars after accid,business
2690,strong>BEIJING: The New Development Bank plans...,3/27/2017,New Development Bank plans joint investments i...,business


In [6]:
df.tail(10)

Unnamed: 0,Article,Date,Heading,NewsType
2682,strong>BEIJING: Beijing's last large coal-fire...,3/19/2017,Beijing becomes Chinas first city with all cle...,business
2683,strong>WASHINGTON: The World Bank on Sunday an...,3/20/2017,World Bank announces 57 bln in financing for Afri,business
2684,"strong>LONDON: Arcelik, the home appliances ar...",3/21/2017,For Turkish giant Arcelik Pakistan is among fo...,business
2685,strong>RIYADH/DUBAI: Saudi Arabia plans to tig...,3/21/2017,Saudis to tighten curbs on foreign workers in ...,business
2686,strong>BEIJING: A Chinese rail company has won...,3/25/2017,Chinese train manufacturer wins railcar bid in US,business
2687,strong>DUBAI: Dubai International Airport and ...,3/25/2017,Laptop ban hits Dubai for 11m weekend traveller,business
2688,"strong>BEIJING: Former Prime Minister, Shaukat...",3/26/2017,Pak China relations not against any third coun...,business
2689,strong>WASHINGTON: Uber has grounded its fleet...,3/26/2017,Uber grounds self driving cars after accid,business
2690,strong>BEIJING: The New Development Bank plans...,3/27/2017,New Development Bank plans joint investments i...,business
2691,strong>KARACHI: Karachi-based technology incub...,3/27/2017,Google powered Startup Weekend energizing prou...,business


Removing 'strong>' from the dataset

In [7]:
df['Article'] = df['Article'].str.replace(r'strong>', '', regex=True)


In [8]:
df

Unnamed: 0,Article,Date,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,business
...,...,...,...,...
2687,DUBAI: Dubai International Airport and its fla...,3/25/2017,Laptop ban hits Dubai for 11m weekend traveller,business
2688,"BEIJING: Former Prime Minister, Shaukat Aziz s...",3/26/2017,Pak China relations not against any third coun...,business
2689,WASHINGTON: Uber has grounded its fleet of sel...,3/26/2017,Uber grounds self driving cars after accid,business
2690,BEIJING: The New Development Bank plans to co-...,3/27/2017,New Development Bank plans joint investments i...,business


Using numpy for tokenization, nlp for text tokenization

In [None]:
def tokenize(text):
    if pd.isna(text):
        return []
    doc = nlp(text)      
    return [token.text for token in doc if not token.is_punct]

In [10]:
df['Heading_Tokens'] = df['Heading'].apply(tokenize)

In [11]:
df['Article_Tokens'] = df['Article'].apply(tokenize)


In [12]:
df

Unnamed: 0,Article,Date,Heading,NewsType,Heading_Tokens,Article_Tokens
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business,"[sindh, govt, decides, to, cut, public, transp...","[KARACHI, The, Sindh, government, has, decided..."
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business,"[asia, stocks, up, in, new, year, trad]","[HONG, KONG, Asian, markets, started, 2015, on..."
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business,"[hong, kong, stocks, open, 0.66, percent, lower]","[HONG, KONG, , Hong, Kong, shares, opened, 0...."
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business,"[asian, stocks, sink, euro, near, nine, year]","[HONG, KONG, Asian, markets, tumbled, Tuesday,..."
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,business,"[us, oil, prices, slip, below, 50, a, barr]","[NEW, YORK, US, oil, prices, Monday, slipped, ..."
...,...,...,...,...,...,...
2687,DUBAI: Dubai International Airport and its fla...,3/25/2017,Laptop ban hits Dubai for 11m weekend traveller,business,"[Laptop, ban, hits, Dubai, for, 11, m, weekend...","[DUBAI, Dubai, International, Airport, and, it..."
2688,"BEIJING: Former Prime Minister, Shaukat Aziz s...",3/26/2017,Pak China relations not against any third coun...,business,"[Pak, China, relations, not, against, any, thi...","[BEIJING, Former, Prime, Minister, Shaukat, Az..."
2689,WASHINGTON: Uber has grounded its fleet of sel...,3/26/2017,Uber grounds self driving cars after accid,business,"[Uber, grounds, self, driving, cars, after, ac...","[WASHINGTON, Uber, has, grounded, its, fleet, ..."
2690,BEIJING: The New Development Bank plans to co-...,3/27/2017,New Development Bank plans joint investments i...,business,"[New, Development, Bank, plans, joint, investm...","[BEIJING, The, New, Development, Bank, plans, ..."


Normalization (Lowering the letters of tokens, replacing capital letters)

In [13]:
def lowercase_tokens(tokens):
    return [tok.lower() for tok in tokens]


In [14]:
df['Article_Tokens'] = df['Article_Tokens'].apply(lowercase_tokens)

df['Heading_Tokens'] = df['Heading_Tokens'].apply(lowercase_tokens)

In [15]:
df

Unnamed: 0,Article,Date,Heading,NewsType,Heading_Tokens,Article_Tokens
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business,"[sindh, govt, decides, to, cut, public, transp...","[karachi, the, sindh, government, has, decided..."
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business,"[asia, stocks, up, in, new, year, trad]","[hong, kong, asian, markets, started, 2015, on..."
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business,"[hong, kong, stocks, open, 0.66, percent, lower]","[hong, kong, , hong, kong, shares, opened, 0...."
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business,"[asian, stocks, sink, euro, near, nine, year]","[hong, kong, asian, markets, tumbled, tuesday,..."
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,business,"[us, oil, prices, slip, below, 50, a, barr]","[new, york, us, oil, prices, monday, slipped, ..."
...,...,...,...,...,...,...
2687,DUBAI: Dubai International Airport and its fla...,3/25/2017,Laptop ban hits Dubai for 11m weekend traveller,business,"[laptop, ban, hits, dubai, for, 11, m, weekend...","[dubai, dubai, international, airport, and, it..."
2688,"BEIJING: Former Prime Minister, Shaukat Aziz s...",3/26/2017,Pak China relations not against any third coun...,business,"[pak, china, relations, not, against, any, thi...","[beijing, former, prime, minister, shaukat, az..."
2689,WASHINGTON: Uber has grounded its fleet of sel...,3/26/2017,Uber grounds self driving cars after accid,business,"[uber, grounds, self, driving, cars, after, ac...","[washington, uber, has, grounded, its, fleet, ..."
2690,BEIJING: The New Development Bank plans to co-...,3/27/2017,New Development Bank plans joint investments i...,business,"[new, development, bank, plans, joint, investm...","[beijing, the, new, development, bank, plans, ..."


Removing stop words such as of,to... etc.

In [16]:
def remove_stopwords(tokens):
    return [tok for tok in tokens if tok.lower() not in stop_words]

In [17]:
df['Article_Tokens'] = df['Article_Tokens'].apply(remove_stopwords)
df['Heading_Tokens'] = df['Heading_Tokens'].apply(remove_stopwords)


In [18]:
df

Unnamed: 0,Article,Date,Heading,NewsType,Heading_Tokens,Article_Tokens
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business,"[sindh, govt, decides, cut, public, transport,...","[karachi, sindh, government, decided, bring, p..."
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business,"[asia, stocks, new, year, trad]","[hong, kong, asian, markets, started, 2015, up..."
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business,"[hong, kong, stocks, open, 0.66, percent, lower]","[hong, kong, , hong, kong, shares, opened, 0...."
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business,"[asian, stocks, sink, euro, near, nine, year]","[hong, kong, asian, markets, tumbled, tuesday,..."
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,business,"[us, oil, prices, slip, 50, barr]","[new, york, us, oil, prices, monday, slipped, ..."
...,...,...,...,...,...,...
2687,DUBAI: Dubai International Airport and its fla...,3/25/2017,Laptop ban hits Dubai for 11m weekend traveller,business,"[laptop, ban, hits, dubai, 11, weekend, travel...","[dubai, dubai, international, airport, flag, c..."
2688,"BEIJING: Former Prime Minister, Shaukat Aziz s...",3/26/2017,Pak China relations not against any third coun...,business,"[pak, china, relations, third, country, shauka...","[beijing, former, prime, minister, shaukat, az..."
2689,WASHINGTON: Uber has grounded its fleet of sel...,3/26/2017,Uber grounds self driving cars after accid,business,"[uber, grounds, self, driving, cars, accid]","[washington, uber, grounded, fleet, self, driv..."
2690,BEIJING: The New Development Bank plans to co-...,3/27/2017,New Development Bank plans joint investments i...,business,"[new, development, bank, plans, joint, investm...","[beijing, new, development, bank, plans, co, f..."


Lemmatization/Stemmation

In [19]:
def lemmatize_and_stem(tokens):
    doc = nlp(" ".join(tokens))  
    processed_tokens = []

    for token in doc:
        lemma = token.lemma_.lower()  
        stemmed = stemmer.stem(lemma) 
        processed_tokens.append(stemmed)

    return processed_tokens

In [20]:
df['Article_Tokens'] = df['Article_Tokens'].apply(lemmatize_and_stem)
df['Heading_Tokens'] = df['Heading_Tokens'].apply(lemmatize_and_stem)


In [21]:
df

Unnamed: 0,Article,Date,Heading,NewsType,Heading_Tokens,Article_Tokens
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business,"[sindh, govt, decid, cut, public, transport, f...","[karachi, sindh, govern, decid, bring, public,..."
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business,"[asia, stock, new, year, trad]","[hong, kong, asian, market, start, 2015, upsw,..."
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business,"[hong, kong, stock, open, 0.66, percent, lower]","[hong, kong, , hong, kong, share, open, 0.66..."
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business,"[asian, stock, sink, euro, near, nine, year]","[hong, kong, asian, market, tumbl, tuesday, fo..."
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,business,"[we, oil, price, slip, 50, barr]","[new, york, us, oil, price, monday, slip, $, 5..."
...,...,...,...,...,...,...
2687,DUBAI: Dubai International Airport and its fla...,3/25/2017,Laptop ban hits Dubai for 11m weekend traveller,business,"[laptop, ban, hit, dubai, 11, weekend, travel]","[dubai, dubai, intern, airport, flag, carrier,..."
2688,"BEIJING: Former Prime Minister, Shaukat Aziz s...",3/26/2017,Pak China relations not against any third coun...,business,"[pak, china, relat, third, countri, shaukat, a...","[beij, former, prime, minist, shaukat, aziz, s..."
2689,WASHINGTON: Uber has grounded its fleet of sel...,3/26/2017,Uber grounds self driving cars after accid,business,"[uber, ground, self, drive, car, accid]","[washington, uber, ground, fleet, self, drive,..."
2690,BEIJING: The New Development Bank plans to co-...,3/27/2017,New Development Bank plans joint investments i...,business,"[new, develop, bank, plan, joint, invest, econ...","[beij, new, develop, bank, plan, co, financ, i..."


Indicing -> Inverted Index

In [22]:
def build_inverted_index(df, column="Article_Tokens"):
    index = defaultdict(dict)

    for doc_id, tokens in enumerate(df[column]):
        for term in tokens:
            if term not in index or doc_id not in index[term]:
                index[term][doc_id] = 0
            index[term][doc_id] += 1  
    
    return index

In [23]:
inverted_index = build_inverted_index(df)
print("Total indexed terms:", len(inverted_index))

Total indexed terms: 26421


In [24]:
#inverted_index


Computing IDF scores

In [25]:

def compute_idf(index, total_docs):
    idf = {}
    for term, doc_dict in index.items():
        df = len(doc_dict) 
        idf[term] = math.log((total_docs / df), 10) 
    return idf

In [26]:
total_docs = len(df)   

In [27]:
idf_scores = compute_idf(inverted_index, total_docs)
#idf_scores

Computing tf scores

In [28]:
def compute_tf(index):
    tf = {} 
    for term, doc_dict in index.items():
        tf[term] = {}
        for doc, freq in doc_dict.items():
            tf[term][doc] = freq  
    return tf


In [29]:
tf_scores = compute_tf(inverted_index)
#tf_scores

Computing tf-idf scores

In [None]:
def compute_tf_idf(tf_scores, idf_scores):
    
    tf_idf = {}

    for term in tf_scores:
        tf_idf[term] = {}

        for doc, tf_val in tf_scores[term].items():
            tf_idf[term][doc] = tf_val * idf_scores[term]  # TF-IDF score

    return tf_idf




In [31]:
tf_idf_matrix = compute_tf_idf(tf_scores, idf_scores)
#tf_idf_matrix

Building Document Vectors

In [None]:

def build_document_vectors(tf_idf_matrix, total_docs):
    vocab = list(tf_idf_matrix.keys())                 
    vectors = {doc: np.zeros(len(vocab)) for doc in range(total_docs)}
    for term_index, term in enumerate(vocab):
        for doc, score in tf_idf_matrix[term].items():

            vectors[doc][term_index] = score          

    return vectors, vocab

document_vectors, vocabulary = build_document_vectors(tf_idf_matrix, total_docs)
#document_vectors


Norm imported from linalg(linear algebra) package of numpy for cosine similarity

In [None]:
from numpy.linalg import norm


In [34]:

def cosine_similarity(vec_a, vec_b):
    if norm(vec_a) == 0 or norm(vec_b) == 0:
        return 0
    return np.dot(vec_a, vec_b) / (norm(vec_a) * norm(vec_b))


Rankinf Documents based on relevance

In [42]:
def rank_documents(query_tokens, document_vectors, tf_idf_matrix, vocabulary):
    query_vec = np.zeros(len(vocabulary))

    for token in query_tokens:
        if token in tf_idf_matrix:
            term_index = vocabulary.index(token)
            query_vec[term_index] = sum(tf_idf_matrix[token].values())

    #cosine similarity function called below
    similarities = []
    for doc_id, doc_vec in document_vectors.items():
        score = cosine_similarity(query_vec, doc_vec)
        similarities.append((doc_id, score))

    ranked = sorted(similarities, key=lambda x: x[1], reverse=True)
    return ranked


User inputs query here

In [None]:
query = input("Enter search query: ").lower().split()

In [68]:
query

['islamabad']

Ranks documents and returns result based on the user input query

In [69]:
# query = lemmatize_and_stem(remove_stopwords(lowercase_tokens(query)))

results = rank_documents(query, document_vectors, tf_idf_matrix, vocabulary)

print("Top Ranked Documents:")
for rank, (doc, score) in enumerate(results[:5], start=1):  
    print(rank, "Document ID :", doc, "having score = ", score)

    print("→ Heading:", df.iloc[doc]['Heading'])
    print("→ Article:", df.iloc[doc]['Article'][:200], "...")
    print("--------------------------------------------------")


Top Ranked Documents:
1 Document ID : 1164 having score =  0.2023861045326962
→ Heading: Wright steers Quetta to easy win against Islamabad in PSL opener
→ Article: DUBAI: England�s T20 expert Luke Wright scored impressive 86 to steer Quetta Gladiators to their first triumph against Islamabad United at the opening match of Pakistan Super League (PSL) by 8 wickets ...
--------------------------------------------------
2 Document ID : 428 having score =  0.18944555521336057
→ Heading: Ishaq Dar formally launches Pakistan Stock Exchang
→ Article: ISLAMABAD: Pakistan stock Exchange (PSX) has been formally launched after integration of all the three bourses of the country.</strongMinister for Finance Ishaq Dar on Monday inaugurated the PSX, form ...
--------------------------------------------------
3 Document ID : 1277 having score =  0.18435470956574163
→ Heading: Karachi eliminated Islamabad to play Peshawar
→ Article: DUBAI: Islamabad United defeated Karachi Kings by nine wickets in the