In [None]:
import streamlit as st
import re
import string
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from PyPDF2 import PdfReader
import requests
from bs4 import BeautifulSoup

Download the NLTK resources used in this notebook: the stop-word list, the Punkt tokenizer data, and the WordNet corpus.

In [None]:
# Fetch the NLTK data packages this notebook relies on.
# "stopwords" backs the stopwords.words("english") calls further down.
nltk.download("stopwords")
# Tokenizer data; presumably what nltk.word_tokenize needs at query time - TODO confirm.
nltk.download('punkt_tab')
# WordNet corpus; presumably what WordNetLemmatizer looks words up in - TODO confirm.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# NLTK's English stop words plus a hand-picked set of extra filler words.
# The result is a set, so the order of the literal below carries no meaning;
# the extra words are kept alphabetical to make additions easy to review.
stopwords_list = set(stopwords.words("english")) | {
    "actually", "back", "better", "don't", "example", "find",
    "going", "good", "including", "know", "look", "lot",
    "may", "might", "much", "often", "right", "said",
    "says", "set", "something", "step", "take", "that's",
    "thing", "things", "think", "want", "you're",
}

### Preprocessing text

In [None]:

def preprocess_text(text):
    """Normalise raw text before it is vectorized.

    The steps run in this order:
      1. every run of non-ASCII characters becomes a single space;
      2. the text is lower-cased;
      3. each ``string.punctuation`` character becomes a space;
      4. ASCII digits are deleted;
      5. every run of two or more whitespace characters becomes one space.

    A single leading/trailing whitespace character, or a lone tab/newline
    between words, is left as it is.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    punctuation_class = r"[%s]" % re.escape(string.punctuation)

    # Non-ASCII is stripped before lower-casing so the remaining steps only
    # ever see ASCII text.
    cleaned = re.sub(r"[^\x00-\x7F]+", " ", text).lower()

    for pattern, replacement in (
        (punctuation_class, " "),  # punctuation -> space
        (r"[0-9]", ""),            # drop digits
        (r"\s{2,}", " "),          # squeeze whitespace runs
    ):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

### Extract text from PDF

In [None]:
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF and return it cleaned.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts as its source (presumably a
            path or a binary file-like object - confirm against the caller).

    Returns:
        The pages' text, joined with newlines and passed through
        ``preprocess_text``.
    """
    pdf_reader = PdfReader(pdf_file)
    # `or ""` keeps a page that yields no text (None or empty) from breaking
    # the join.
    page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    # Pages are joined with a newline so the last word of one page does not
    # fuse with the first word of the next into a single bogus token.
    return preprocess_text("\n".join(page_texts))

### Vectorize text for similarity search

In [None]:
def vectorize_text(documents):
    """Fit a TF-IDF model on ``documents`` and return the matrix and the model.

    Args:
        documents: Iterable of text strings, one per document.

    Returns:
        A tuple ``(X, vectorizer)``: the value returned by
        ``fit_transform(documents)`` and the fitted ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(
        analyzer="word",
        ngram_range=(1, 2),
        # `stopwords_list` is a set, but scikit-learn's parameter validation
        # only accepts 'english', a list, or None for `stop_words`; a set is
        # rejected when the vectorizer is fitted. Sorting makes the list
        # order deterministic.
        stop_words=sorted(stopwords_list),
        max_features=10000,
    )
    X = vectorizer.fit_transform(documents)
    return X, vectorizer

In [None]:
# NOTE(review): `docs` and `title` are defined outside this excerpt -
# presumably parallel lists of description and title strings; confirm.
lemmer = WordNetLemmatizer()

# Lemmatize every description word by word. Commas are turned into spaces (the
# previous code also dropped them), then each whitespace-separated token is
# lemmatized on its own. Handing the lemmatizer a whole comma-separated phrase
# returned the phrase unchanged, which is why plural forms such as "words"
# survived in the printed `new_docs`. Per-word lemmatization also matches how
# the search query is prepared (word_tokenize + lemmatize per token).
new_docs = [
    ' '.join(lemmer.lemmatize(word) for word in text.replace(',', ' ').split())
    for text in docs
]

# Lemmatize every title word by word (split on single spaces, as before).
# The loop variable no longer reuses the name of the list being iterated.
titles = [
    ' '.join(lemmer.lemmatize(word).strip() for word in text.split(' '))
    for text in title
]

In [None]:
# Sanity check: dump the lemmatized descriptions that will be vectorized.
print(new_docs)

['i loved you ethiopian  stored elements in Compress find Sparse Ethiopia is the greatest country in the world of nation at universe', 'also  sometimes  the same words can have multiple different ‘lemma’s. So  based on the context it’s used  you should identify the         part-of-speech (POS) tag for the word in that specific context and extract the appropriate lemma. Examples of implementing this comes         in the following sections countries.ethiopia With a planned.The name that the Blue Nile river loved took in Ethiopia is derived from the         Geez word for great to imply its being the river of rivers The word Abay still exists in ethiopia major languages', 'With more than  million people  ethiopia is the second most populous nation in Africa after Nigeria  and the fastest growing          economy in the region. However  it is also one of the poorest  with a per capita income', 'The primary purpose of the dam ethiopia is electricity production to relieve Ethiopia’s acute ene

In [None]:
# Plain NLTK English stop words as a list (without the custom additions in
# `stopwords_list`); this is what the TfidfVectorizer cell below passes as
# `stop_words`.
english_stopset = list(stopwords.words('english'))

In [None]:
# Module-level TF-IDF model used by the search below.
vectorizer = TfidfVectorizer(
    # Tokenisation: lower-cased word tokens with NLTK English stop words removed.
    analyzer='word',
    lowercase=True,
    stop_words=english_stopset,
    # Features: unigrams and bigrams.
    ngram_range=(1, 2),
    # Vocabulary pruning by document frequency, capped at 10000 features.
    min_df=0.002,
    max_df=0.99,
    max_features=10000,
)

In [None]:
# Learn the vocabulary from the lemmatized descriptions and build their
# TF-IDF matrix (transposed in the next cell so documents become columns).
X = vectorizer.fit_transform(new_docs)

In [None]:
# Dense copy of the transposed TF-IDF matrix. Per the printed shape there is
# one column per document; the search function reads column i as the vector
# of document i.
df = pd.DataFrame(X.T.toarray())
# First ten rows, then the overall (rows, columns) shape.
print(df.head(10), df.shape, sep="\n")

     0         1         2         3         4    5
0  0.0  0.085345  0.000000  0.000000  0.233406  0.0
1  0.0  0.000000  0.000000  0.000000  0.142318  0.0
2  0.0  0.085345  0.000000  0.000000  0.116703  0.0
3  0.0  0.000000  0.000000  0.173941  0.000000  0.0
4  0.0  0.000000  0.000000  0.173941  0.000000  0.0
5  0.0  0.000000  0.167583  0.000000  0.000000  0.0
6  0.0  0.000000  0.167583  0.000000  0.000000  0.0
7  0.0  0.085345  0.137421  0.000000  0.000000  0.0
8  0.0  0.000000  0.167583  0.000000  0.000000  0.0
9  0.0  0.104077  0.000000  0.000000  0.000000  0.0
(231, 6)


In [None]:
def _cosine_scores(doc_matrix, query_vec):
    """Cosine similarity between every column of ``doc_matrix`` and ``query_vec``.

    Columns (or a query) with zero norm get a score of 0.0 instead of NaN.
    """
    denominators = np.linalg.norm(doc_matrix, axis=0) * np.linalg.norm(query_vec)
    dot_products = query_vec @ doc_matrix
    scores = np.zeros(doc_matrix.shape[1], dtype=float)
    # Divide only where the denominator is non-zero; other entries stay 0.0.
    np.divide(dot_products, denominators, out=scores, where=denominators > 0)
    return scores


def get_similar_articles(q, t, df, top_results=5):
    """Print the articles most similar to the search text.

    Both ``q`` and ``t`` are vectorized with the module-level ``vectorizer``
    and compared against every document column of ``df``. Each article is
    ranked by the better of its two cosine scores and listed at most once.
    Titles and descriptions are read from the module-level ``titles`` and
    ``new_docs`` lists.

    Fixes relative to the previous version:
      * the cosine denominator is ``|d| * |q|`` (it used to be ``/ |d| * |q|``);
      * zero vectors score 0.0 instead of NaN, so they are no longer printed;
      * the ranking for ``q`` is actually used (``a and b`` silently picked
        only the ranking for ``t``).

    Args:
        q: Search text matched against the documents.
        t: Second search text (the caller in this notebook passes the same
            string as ``q``).
        df: Term-by-document frame; column position ``i`` holds the TF-IDF
            vector of document ``i`` and the row count equals the
            vectorizer's vocabulary size.
        top_results: Maximum number of articles to print (default 5).

    Returns:
        None. The results are printed.
    """
    query_vec = vectorizer.transform([q]).toarray().ravel()
    title_query_vec = vectorizer.transform([t]).toarray().ravel()

    doc_matrix = df.to_numpy(dtype=float)
    # Only rank documents that have a vector, a title and a description.
    n_docs = min(doc_matrix.shape[1], len(new_docs), len(titles))

    best_scores = np.maximum(
        _cosine_scores(doc_matrix, query_vec),
        _cosine_scores(doc_matrix, title_query_vec),
    )[:n_docs]

    # sorted() is stable, so equal scores keep document order.
    ranked = sorted(
        enumerate(best_scores), key=lambda item: item[1], reverse=True
    )[:top_results]

    print("Done Searching. Full Result: \n")
    print("searched items : ", q)
    print("Article with the Highest Cosine Similarity Values: ")
    for doc_index, score in ranked:
        if score > 0:  # skip articles that share no term with the query
            print("Similaritas score: ", float(score))
            print(titles[doc_index])
            print(new_docs[doc_index])
            print('\n')

# Demo search. Another query that was tried here: 'electrical productions'.
lemma_ops = 'ethiopia'

# Tokenize the query, lemmatize each token, and glue the tokens back together
# with single spaces.
list1 = nltk.word_tokenize(lemma_ops)
q1 = ' '.join(lemmer.lemmatize(token) for token in list1)

# The same text is passed for both query arguments.
get_similar_articles(q1, q1, df)
print('-' * 100)

Done Searching. Full Result: 

searched items :  ethiopia
Article with the Highest Cosine Similarity Values: 
Similaritas score:  0.2673433484640173
National
The primary purpose of the dam ethiopia is electricity production to relieve Ethiopia’s acute energy shortage and for electricity export to neighboring         countries.ethiopia With a planned.


Similaritas score:  0.15996489348662396
Loved Turbine-Generators
also  sometimes  the same words can have multiple different ‘lemma’s. So  based on the context it’s used  you should identify the         part-of-speech (POS) tag for the word in that specific context and extract the appropriate lemma. Examples of implementing this comes         in the following sections countries.ethiopia With a planned.The name that the Blue Nile river loved took in Ethiopia is derived from the         Geez word for great to imply its being the river of rivers The word Abay still exists in ethiopia major languages


Similaritas score:  0.14582664099950898