In [2]:
import nltk
import re
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import string
import random
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
nltk.download('punkt')

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mobilehouse/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mobilehouse/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def scrape_wiki(keyword, n=20):
  url = f'https://en.wikipedia.org/w/index.php?search={keyword}&title=Special%3ASearch&profile=advanced&fulltext=1&ns0=1'
  articales = []
  response = requests.get(url)
  soup = BeautifulSoup(response.text, 'html.parser')

  articles = soup.find_all('div', class_='mw-search-result-heading')

  for i, article in enumerate(articles[:n]):
      title = article.find('a').text
      link = 'https://en.wikipedia.org' + article.find('a')['href']

      article_response = requests.get(link)
      article_soup = BeautifulSoup(article_response.text, 'html.parser')
      pars = article_soup.find_all('p')

      content = ''
      for i in pars:
        content = content +" "+ i.text

      articale = {'title':title , 'link':link , 'content':content}
      articales.append(articale)

  return articales

def get_content(articales):
  return ' '.join([x['content'] for x in articales])

def clean_text(text):
    cleaned_text = text.lower()
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    cleaned_text = re.sub(r'[^a-z0-9\s]', '', cleaned_text)
    cleaned_text = re.sub(r'[()\[\]{}]', '', cleaned_text)
    exclude = set(string.punctuation)
    cleaned_text = ''.join(ch for ch in cleaned_text if ch not in exclude)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

def get_vocabs(text):
    text = clean_text(text)
    words =  word_tokenize(text.lower())
    unique_words = set(words)
    stop_words = set(stopwords.words('english'))
    stop_words_text = [x for x in words if x in stop_words ]
    return unique_words , stop_words_text , words

def show_info(topic , articales):
  content = get_content(articales)
  vocabs , stopwords , cleaned_content = get_vocabs(content)

  unique_vocabs_per = (len(vocabs)/len(cleaned_content))*100
  stop_words_per = (len(stopwords)/len(cleaned_content))*100

  print(topic)
  print()
  print('total number of articles : ', len(articales))
  print('percentage of unique words:',round(unique_vocabs_per , 2) , "%")
  print('percentage of stop_words:',round(stop_words_per ,2) , "%")
  print('{0}'.format('='*50))


In [4]:
artificial_Intelligence_articales  = scrape_wiki('Artificial_Intelligence', n=20)
machine_Learning_articales  = scrape_wiki('Machine_Learning', n=20)
data_Science_articales  = scrape_wiki('Data_Science', n=20)

In [5]:
print('overall articals {0}'.format(60))
print('{0}'.format('='*50))

show_info('Artificial Intelligence' , artificial_Intelligence_articales )
show_info('Machine Learning' , machine_Learning_articales )
show_info('Data Science' , data_Science_articales )

overall articals 60
Artificial Intelligence

total number of articles :  20
percentage of unique words: 11.75 %
percentage of stop_words: 39.64 %
Machine Learning

total number of articles :  20
percentage of unique words: 11.7 %
percentage of stop_words: 38.49 %
Data Science

total number of articles :  20
percentage of unique words: 15.18 %
percentage of stop_words: 38.86 %


In [6]:
all_articales = artificial_Intelligence_articales +  machine_Learning_articales + data_Science_articales

In [7]:
glove_model = api.load('glove-wiki-gigaword-300')

In [8]:
all_doc = []
documents = {}
for i in all_articales:
    documents[i['title']] = {'content': clean_text(i['content']).split(" "), 'link': i['link']}
    all_doc.append(clean_text(i['content']).split(" "))


all_doc = [' '.join(i) for i in all_doc]

In [9]:
vectorizer = TfidfVectorizer()
vectorizer.fit(all_doc)

In [10]:
import numpy as np
def glove_embbed(text):
    embedd = []
    unk_tokens_count = []
    for token in text:
        try:
            embedd.append(glove_model[token])
        except:
            #will go for except if token is not in Glove corpus
            unk_tokens_count.append(token)
            embedd.append(glove_model['unk'])

    return np.array(embedd),unk_tokens_count

embedding_index = {}
total_unk_tokens = []
tf_idf_embed = {}

for document in tqdm(list(documents.keys())):


    doc_embed = vectorizer.transform(documents[document]['content'])
    tf_idf_embed[document] = doc_embed
    doc_embed , unk_tokens_doc = glove_embbed(documents[document]['content'])
    total_unk_tokens.extend(unk_tokens_doc)
    embedding_index[document] = doc_embed



100%|██████████| 59/59 [00:00<00:00, 173.65it/s]


In [11]:
#100*(len(total_unk_tokens) / len([item for row in list(documents.values()) for item in row]))

In [12]:
'''
results_glove = {}
resluts_tfidf = {}

query = 'generative ai'

querey_cleaned = clean_text(query).split(" ")
query_embeds_glove = glove_embbed(querey_cleaned)[0]
query_embeds_tfidf = vectorizer.transform(querey_cleaned)

for articale in tqdm(list(embedding_index.keys())):
  similarity_score_glove = cosine_similarity(embedding_index[articale], query_embeds_glove)
  similarity_score_tfidf = cosine_similarity(tf_idf_embed[articale], query_embeds_tfidf)
    
  results_glove[articale] = similarity_score_glove.flatten().mean()
  resluts_tfidf[articale] = similarity_score_tfidf.flatten().mean()
  
    
results_glove = dict(sorted(results_glove.items(), key=lambda item: item[1] , reverse=True))
resluts_tfidf = dict(sorted(resluts_tfidf.items(), key=lambda item: item[1] , reverse=True))

results_weghiteds_sum = {}
for i in list(documents.keys()):
    result = results_glove[i] * 0.7 + resluts_tfidf[i] * 0.3
    results_weghiteds_sum[i] = result
    
results_weghiteds_sum = dict(sorted(results_weghiteds_sum.items(), key=lambda item: item[1] , reverse=True))
results_weghiteds_sum
'''

'\nresults_glove = {}\nresluts_tfidf = {}\n\nquery = \'generative ai\'\n\nquerey_cleaned = clean_text(query).split(" ")\nquery_embeds_glove = glove_embbed(querey_cleaned)[0]\nquery_embeds_tfidf = vectorizer.transform(querey_cleaned)\n\nfor articale in tqdm(list(embedding_index.keys())):\n  similarity_score_glove = cosine_similarity(embedding_index[articale], query_embeds_glove)\n  similarity_score_tfidf = cosine_similarity(tf_idf_embed[articale], query_embeds_tfidf)\n    \n  results_glove[articale] = similarity_score_glove.flatten().mean()\n  resluts_tfidf[articale] = similarity_score_tfidf.flatten().mean()\n  \n    \nresults_glove = dict(sorted(results_glove.items(), key=lambda item: item[1] , reverse=True))\nresluts_tfidf = dict(sorted(resluts_tfidf.items(), key=lambda item: item[1] , reverse=True))\n\nresults_weghiteds_sum = {}\nfor i in list(documents.keys()):\n    result = results_glove[i] * 0.7 + resluts_tfidf[i] * 0.3\n    results_weghiteds_sum[i] = result\n    \nresults_weghite

In [31]:
import tkinter as tk
import webbrowser

def clean_text(text):
    cleaned_text = text.lower()
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    cleaned_text = re.sub(r'[^a-z0-9\s]', '', cleaned_text)
    cleaned_text = re.sub(r'[()\[\]{}]', '', cleaned_text)
    exclude = set(string.punctuation)
    cleaned_text = ''.join(ch for ch in cleaned_text if ch not in exclude)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

def open_link(event):
    widget = event.widget
    index = widget.index(tk.CURRENT)
    hyperlink = widget.get(f"{index} linestart", f"{index} lineend")
    webbrowser.open(hyperlink)

def search():
    results_glove = {}
    resluts_tfidf = {}
    query = entry.get()
    querey_cleaned = clean_text(query).split(" ")
    query_embeds_glove = glove_embbed(querey_cleaned)[0]
    query_embeds_tfidf = vectorizer.transform(querey_cleaned)
    
    for articale in tqdm(list(embedding_index.keys())):
      similarity_score_glove = cosine_similarity(embedding_index[articale], query_embeds_glove)
      similarity_score_tfidf = cosine_similarity(tf_idf_embed[articale], query_embeds_tfidf)
        
      results_glove[articale] = similarity_score_glove.flatten().mean()
      resluts_tfidf[articale] = similarity_score_tfidf.flatten().mean()
      
        
    results_glove = dict(sorted(results_glove.items(), key=lambda item: item[1] , reverse=True))
    resluts_tfidf = dict(sorted(resluts_tfidf.items(), key=lambda item: item[1] , reverse=True))
    
    results_weghiteds_sum = {}
    for i in list(documents.keys()):
        result = (results_glove[i] + resluts_tfidf[i] ) /2 
        results_weghiteds_sum[i] = result
        
    top_10_results = sorted(results_weghiteds_sum.items(), key=lambda item: item[1], reverse=True)[:10]   
 
    result_text.delete(1.0, tk.END)
    
    if top_10_results[0][1] < 0.1:
        result_text.insert(tk.END , "No Results")
        print(top_10_results[0][1])
    else:
        for result in top_10_results:
            title = result[0]
            similarity_score = result[1]
            hyperlink = documents[title]['link']
            result_text.insert(tk.END, f"Title: {title}\nSimilarity Score: {similarity_score}\n")
            # Create a clickable hyperlink
            result_text.insert(tk.END, f"{hyperlink}\n", "hyperlink")
            result_text.tag_bind("hyperlink", "<Button-1>", open_link)
            result_text.insert(tk.END, "\n\n")

# Create the main window
root = tk.Tk()
root.title("Article Search")

# Styling
root.configure(bg='#f0f0f0')
root.geometry('600x400')

# Create a label and entry for the search query
label = tk.Label(root, text=" Enter your query:    ", font=('Arial', 12), bg='#f0f0f0')
label.pack(pady=10)
entry = tk.Entry(root, width=50, font=('Arial', 12))
entry.pack(pady=5)

# Create a search button
search_button = tk.Button(root, text="Search", command=search, font=('Arial', 12), bg='#4caf50', fg='white')
search_button.pack(pady=10)

# Create a text widget to display search results
result_text = tk.Text(root, wrap=tk.WORD, width=80, height=20, font=('Arial', 12))
result_text.pack(pady=10)

# Start the GUI event loop
root.mainloop()


100%|██████████| 59/59 [00:00<00:00, 500.26it/s]
100%|██████████| 59/59 [00:00<00:00, 534.57it/s]


-0.007452624849975109


100%|██████████| 59/59 [00:00<00:00, 484.48it/s]


0.08871650067182629


100%|██████████| 59/59 [00:00<00:00, 568.55it/s]


0.08104007675995889


100%|██████████| 59/59 [00:00<00:00, 556.22it/s]


0.025658514350652695


100%|██████████| 59/59 [00:00<00:00, 521.15it/s]


0.025658514350652695


100%|██████████| 59/59 [00:00<00:00, 542.30it/s]
100%|██████████| 59/59 [00:00<00:00, 568.89it/s]


0.025658514350652695


100%|██████████| 59/59 [00:00<00:00, 752.71it/s]


0.025658514350652695


100%|██████████| 59/59 [00:00<00:00, 717.54it/s]


0.025658514350652695
