# Libraries

In [None]:
!pip install wikipedia
!pip install wikipedia-api
!pip install ipywidgets

import pandas as pd
import numpy as np

# web scraping
import wikipedia
import wikipediaapi

# preprocessing
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

# IRsystem
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

# cosine similarity
from math import sqrt
from sklearn.metrics.pairwise import cosine_similarity

# evaluation
from sklearn.metrics import ndcg_score

# interface
import ipywidgets as widgets
from IPython.display import display, clear_output

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11679 sha256=e433697e39e925589b8968aed9b36914746f8b8adcb10745b960f30f19e93ecd
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0
Collecting wikipedia-api
  Downloading Wikipedia_API-0.6.0-py3-none-any.whl.metadata (22 kB)
Downloading Wikipedia_API-0.6.0-py3-none-any.whl (14 kB)
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.6.0
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Using cached jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Using cached jedi-0.19.1-py2.py3-none

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# Functions

In [None]:
# Maps single Greek / accented Latin / look-alike characters to a plain ASCII
# spelling. Used by clean_text, which replaces every key with its value.
greek_latin_letters = {
  # lowercase Greek letters spelled out by name
  r'α': 'alpha', r'β': 'beta', r'γ': 'gamma', r'δ': 'delta',
  r'ε': 'epsilon', r'θ': 'theta', r'λ': 'lambda', r'μ': 'mu',
  r'π': 'pi', r'σ': 'sigma', r'τ': 'tau', r'υ': 'upsilon',
  r'ω': 'omega',
  # uppercase Greek letters spelled out by name
  r'Α': 'alpha', r'Β': 'beta', r'Γ': 'gamma', r'Δ': 'delta',
  r'Ε': 'epsilon', r'Θ': 'theta', r'Λ': 'lambda', r'Μ': 'mu',
  r'Π': 'pi', r'Σ': 'sigma', r'Τ': 'tau', r'Υ': 'upsilon',
  r'Ω': 'omega',
  # accented Latin letters reduced to their base letter
  r'č': 'c', r'š': 's', r'ž': 'z', r'é': 'e', r'è': 'e', r'á': 'a',
  r'à': 'a', r'ç': 'c', r'ú': 'u', r'ó': 'o', r'í': 'i', r'ñ': 'n',
  r'ü': 'u', r'ℓ': 'l',
  r'Č': 'c', r'Š': 's', r'Ž': 'z', r'É': 'e', r'È': 'e', r'Á': 'a',
  r'À': 'a', r'Ç': 'c', r'Ú': 'u', r'Ó': 'o', r'Í': 'i', r'Ñ': 'n',
  r'Ü': 'u', r'ℒ': 'l',
  r'â': 'a', r'ã': 'a', r'ä': 'a', r'Ä': 'a', r'æ': 'ae', r'Æ': 'ae',
  r'ê': 'e', r'ï': 'i', r'ò': 'o', r'ö': 'o', r"Ö": 'o',
  r'ā': 'a', r'ē': 'e', r'ī': 'i', r'ō': 'o',
  # remaining Greek letters (and two Cyrillic look-alikes) transliterated to one Latin letter
  r'œ': "oe", r'Œ': 'oe', r'ή': 'e', r'ί': 'i', r'ζ': 'z',
  r'η': 'e', r'ι': 'i', r'κ': 'k', r'ν': 'n',
  r'ο': 'o', r'ρ': 'r', r'ς': 's', r'φ': 'f',
  r'ό': 'o', r'ύ': 'u', r'ḗ': 'e', r'ἶ': 'i',
  r'ῆ': 'e', r'ῷ': 'o', r'с': 's', r'С': 's'
}

In [None]:
def clean_text(text):
  """Normalize raw text before tokenization.

  Applies, in order: collapsing "artificial intelligence"/"AIs" to "ai",
  turning underscores/hyphens/em-dashes into spaces, dropping possessive 's,
  dropping plural/ordinal suffixes after digits, splitting camelCase words,
  spelling out "squared", and replacing every key of `greek_latin_letters`
  with its ASCII value.

  Args:
    text: the string to clean.

  Returns:
    The cleaned string (case is preserved; lower-casing happens later).
  """
  # replace artificial intelligence and AIs with ai
  text = re.sub(r"[Aa]rtificial [Ii]ntelligence|\bAIs\b|\bais\b", "ai", text)
  # replace underscore/hyphen with space
  text = re.sub(r"[_\-—]+", " ", text)
  # remove 's if at end of word
  text = re.sub(r"('s)\b", "", text)
  # remove s/st/nd/rd/th if it's after a number; "st" must be tried before "s"
  # (otherwise "1st" becomes "1t"), and \b keeps words like "3sigma" intact
  text = re.sub(r"(?<=\d)(?:st|nd|rd|th|s)\b", r"", text)
  # add a space before a capital letter if it's in the middle of a word
  # (the bracket must not be escaped, or the lookbehind only matches a literal "[a-z]")
  text = re.sub(r"(?<=[a-z])([A-Z])", r" \1", text)
  # replace '²' and ^2 with squared
  text = re.sub(r"²|\^2| to the power of 2", " squared", text)
  # replace greek/latin letters to become their meaning/how a typical person writes them
  # (plain str.replace: the keys are literal characters, not regex patterns)
  for character, replacement in greek_latin_letters.items():
      text = text.replace(character, replacement)
  return text

In [None]:
def to_sentence(text):
  """Split text into sentence strings.

  Sentences end at a '.' that is followed by whitespace or end-of-text,
  or at a newline, '!' or '?'. The returned pieces are not stripped and
  may include empty strings.
  """
  removal_patterns = (
      # symbols: everything other than letters/numbers/whitespace/%/./!/?
      r"[^\w\s%\.!?]",
      # the trailing '.' of an abbreviation such as "e.g."
      r"(?<=\.[A-Za-z])\.",
      # any '.' that is not followed by a digit or whitespace
      r"\.(?![\d\s])",
  )
  for pattern in removal_patterns:
    text = re.sub(pattern, "", text)
  return re.split(r"\.(?!\S)|[\n!?]", text)

In [None]:
def word_tokenize(sent):
  """Split a sentence into word tokens.

  Symbols are dropped, and so is every '.' that is not directly followed by a
  digit. Decimal numbers such as "3.14" are returned as a single token, which
  is what keeping those dots is for; every other token is a run of word
  characters.

  Args:
    sent: the sentence string.

  Returns:
    A list of token strings in order of appearance (case is preserved).
  """
  # remove symbols (all characters other than letters/numbers/whitespaces/%/.)
  sent = re.sub(r"[^\w\s%\.]", "", sent)
  # remove . unless a number was right after it
  sent = re.sub(r"\.(?!\d)", "", sent)
  # try the decimal-number alternative first so "3.14" is not split into "3" and "14"
  return re.findall(r"\d+(?:\.\d+)+|\w+", sent)

In [None]:
# NLTK's English stop words, passed through the same clean_text + word_tokenize
# steps as document text so that membership tests in preprocess compare like with like.
stop_words = set(word_tokenize(clean_text(" ".join(stopwords.words('english')))))

In [None]:
def is_eng(word):
  """Return False when `word` consists only of Arabic or only of Chinese
  characters (whitespace allowed); return True for anything else."""
  non_english_patterns = (
      r'^[\u0600-\u06FF\s]+$',  # arabic
      r'^[\u4E00-\u9FFF\s]+$',  # chinese
  )
  return not any(re.match(pattern, word) for pattern in non_english_patterns)

In [None]:
def make_lemmatized(sent):
  """Lemmatize every token in `sent` (an iterable of strings).

  Each token goes through the WordNet lemmatizer three times in a row:
  as a verb, then as a noun, then as an adjective. Returns a new list.
  """
  lemmatizer = WordNetLemmatizer()

  def lemmatize_all_pos(token):
    # the order matters: the output of one pass is the input of the next
    for pos in ('v', 'n', 'a'):
      token = lemmatizer.lemmatize(token, pos=pos)
    return token

  return [lemmatize_all_pos(word) for word in sent]

In [None]:
def preprocess(clean):
  """Turn cleaned text into one space-joined string of lemmatized tokens.

  Tokens are lower-cased; stop words and tokens rejected by is_eng are dropped.
  """
  kept = []
  for token in word_tokenize(clean):
    lowered = token.lower()
    if lowered in stop_words or not is_eng(lowered):
      continue
    kept.append(lowered)

  return " ".join(make_lemmatized(kept))

In [None]:
def preprocess_to_sent(clean):
  """Turn cleaned text into a list of lemmatized token lists, one per sentence.

  Returns:
    (sents_lem, longest): the per-sentence token lists (sentences left with no
    tokens after filtering are skipped) and the token count of the longest
    kept sentence (0 when there is none).
  """
  sents_lower = []
  for sent in to_sentence(clean):
    lowered = [token.lower() for token in word_tokenize(sent)]
    kept = [token for token in lowered if token not in stop_words and is_eng(token)]
    # skip sentences that are empty once stop words are removed
    if kept:
      sents_lower.append(kept)

  longest = max((len(sent) for sent in sents_lower), default=0)
  sents_lem = [make_lemmatized(sent) for sent in sents_lower]

  return sents_lem, longest

In [None]:
# def analyze_text(text):
#   total_words = len(set(words))
#   unique_words = len(set(lemmatized_tokens))
#   stop_words_count = len(set(stop_words))
#   unique_words_percentage = f"{(unique_words / total_words) * 100:.2f}%"
#   stop_words_percentage = f"{(stop_words_count / total_words) * 100:.2f}%"

#   print("unique words pct:", unique_words_percentage)
#   print("stop words pct:", stop_words_percentage)

#   return total_words

In [None]:
def get_articles(titles):
  """Fetch and preprocess one Wikipedia page per title.

  Args:
    titles: iterable of page titles.

  Returns:
    (articles, longest_all): a DataFrame with columns URL, Title, Text,
    Tokens (space-joined lemmas) and Tokens_list (lemmas grouped by sentence),
    one row per title in input order; and the token count of the longest
    sentence across all pages.
  """
  wiki_wiki = wikipediaapi.Wikipedia('Wikipedia_IRS', 'en')
  results = []
  longest_all = 0
  for title in titles:
    page = wiki_wiki.page(title)
    url = page.fullurl
    page_text = page.text
    # clean once and reuse for both the flat and the per-sentence representation
    cleaned = clean_text(page_text)
    tokens_list, longest = preprocess_to_sent(cleaned)

    results.append({
        "URL": url,
        "Title": title,
        "Text": page_text,
        "Tokens": preprocess(cleaned),
        "Tokens_list": tokens_list,
    })
    longest_all = max(longest_all, longest)

  return pd.DataFrame(results), longest_all

In [None]:
def pad_sents(sents, longest):
  """Return copies of the token lists in `sents`, each right-padded with
  "<oov>" up to `longest` tokens. Lists already that long are copied unchanged."""
  return [sent.copy() + ["<oov>"] * (longest - len(sent)) for sent in sents]

In [None]:
def expand_query_with_synonyms(query):
    """Append WordNet synonyms of every whitespace-separated query word.

    Args:
        query: the raw query string.

    Returns:
        The original query, a space, then all distinct lemma names of all
        synsets of each word, space-joined in sorted order. Sorting makes the
        result reproducible; plain set iteration order can change between runs.
    """
    synonyms = set()
    for word in query.split():
        for synset in wordnet.synsets(word):
            synonyms.update(synset.lemma_names())
    return query + ' ' + ' '.join(sorted(synonyms))

In [None]:
def get_gains(scores):
  """Bucket similarity scores into integer gains 0-3.

  The range [lowest score, highest score] is cut into four equal-width bands;
  a score in the lowest band gets gain 0 and one in the highest gets gain 3.
  When all scores are equal every row gets gain 3.

  Args:
    scores: DataFrame with a numeric "score" column, in any row order.

  Returns:
    The same DataFrame object, with an integer "gain" column added or
    overwritten.
  """
  if scores.empty:
    # nothing to rank; still provide the column callers expect
    scores["gain"] = np.zeros(0, dtype=int)
    return scores

  values = scores["score"]
  # take the extremes from the data instead of assuming row 0 / row -1 hold them
  thresholds = np.linspace(values.min(), values.max(), num=5)

  # gain = number of inner band boundaries the score reaches; NaN reaches none
  gain = np.zeros(len(scores), dtype=int)
  for boundary in thresholds[1:4]:
    gain += (values >= boundary).to_numpy()

  scores["gain"] = gain
  return scores

In [None]:
def finalize_df(df, n_docs=60):
  """Extend a partial relevance table to cover every document.

  Args:
    df: DataFrame with 'docID' and 'ideal_gain' columns for a subset of
      the documents.
    n_docs: total number of documents; docIDs are 0 .. n_docs-1.

  Returns:
    A DataFrame sorted by docID with a fresh 0-based index, where every docID
    in range(n_docs) that was missing from `df` appears with ideal_gain 0.
    Rows already present are never duplicated.
  """
  present = set(df['docID'].tolist())
  missing = [doc_id for doc_id in range(n_docs) if doc_id not in present]

  if missing:
    filler = pd.DataFrame({'docID': missing,
                           'ideal_gain': [0] * len(missing)})
    df = pd.concat([df, filler], ignore_index=True)

  return df.sort_values(by='docID', ascending=True).reset_index(drop=True)

In [None]:
def combine_df(expert_df, model_df):
    """Inner-join the expert table and the model table on their row index.

    Both inputs are sorted by index first; only index labels present in both
    appear in the result.
    """
    return expert_df.sort_index().merge(
        model_df.sort_index(),
        left_index=True,
        right_index=True,
        how='inner',
    )

# Creating DataFrame

In [None]:
# Search Wikipedia for page titles on each of the three topics (20 requested per topic).
# NOTE(review): the evaluation cells hardcode docIDs 0-19 / 20-39 / 40-59 per topic,
# which presumes each search returns exactly 20 titles — confirm before relying on the scores.
titles_AI = wikipedia.search("Artificial intelligence", results = 20)
titles_DS = wikipedia.search("Data science", results = 20)
titles_DB = wikipedia.search("Database", results = 20)
# Concatenation order fixes the docID of every article (its row position).
titles = titles_AI + titles_DS + titles_DB

# Download and preprocess every page; longest_sent is the longest sentence length in tokens.
articles, longest_sent = get_articles(titles)

In [None]:
articles.head()

Unnamed: 0,URL,Title,Text,Tokens,Tokens_list
0,https://en.wikipedia.org/wiki/Artificial_intel...,Artificial intelligence,"Artificial intelligence (AI), in its broadest ...",ai ai broad sense intelligence exhibit machine...,"[[ai, ai, broad, sense, intelligence, exhibit,..."
1,https://en.wikipedia.org/wiki/Generative_artif...,Generative artificial intelligence,Generative artificial intelligence (generative...,generative ai generative ai genai gai ai capab...,"[[generative, ai, generative, ai, genai, gai, ..."
2,https://en.wikipedia.org/wiki/A.I._Artificial_...,A.I. Artificial Intelligence,A.I. Artificial Intelligence (or simply A.I.) ...,ai ai simply ai 2001 american science fiction ...,"[[ai, ai, simply, ai, 2001, american, science,..."
3,https://en.wikipedia.org/wiki/Artificial_gener...,Artificial general intelligence,Artificial general intelligence (AGI) is a typ...,artificial general intelligence agi type ai ai...,"[[artificial, general, intelligence, agi, type..."
4,https://en.wikipedia.org/wiki/Applications_of_...,Applications of artificial intelligence,Artificial intelligence (AI) has been used in ...,ai ai use application throughout industry acad...,"[[ai, ai, use, application, throughout, indust..."


# Text Summary

In [None]:
# AI_text = ""
# DS_text = ""
# DB_text = ""

# for t in df[:20]["Text"]:
#   AI_text += t
# for t in df[20:40]["Text"]:
#   DS_text += t
# for t in df[40:]["Text"]:
#   DB_text += t

In [None]:
# print("---------------- Artificial Intelligence ----------------")
# total = analyze_text_full(AI_text)
# print(" ")

# print("---------------- Data Science ----------------")
# total = analyze_text_full(DS_text)
# print(" ")

# print("---------------- Database ----------------")
# total = analyze_text_full(DB_text)
# print(" ")

# Index

In [None]:
# Figure out characters
char_unique = set()

for id, article in articles.iterrows():
  text = article["Tokens"]

  for char in text:
    char_unique.add(char)

# sorted(char_unique)

In [None]:
# Build the inverted index: for every term, its total frequency across the
# corpus and the set of article row labels it appears in.
word_freq = {}
word_appearances = {}

# iterate (row label, token string) pairs; `doc_id` avoids shadowing the builtin `id`
for doc_id, text in articles["Tokens"].items():
  for word in word_tokenize(text):
    word_freq[word] = word_freq.get(word, 0) + 1
    word_appearances.setdefault(word, set()).add(doc_id)

# both dicts were filled in the same order, so their entries line up one-to-one;
# word_freq is deliberately rebound to a list of (term, frequency) pairs
word_freq = list(word_freq.items())
multi_index = pd.MultiIndex.from_tuples(word_freq, names=['Term', 'Frequency'])

wiki_index = pd.Series(list(word_appearances.values()), index=multi_index, name='Dictionary')
wiki_index = wiki_index.sort_index()

# sorted list of terms, later used as the fixed TF-IDF vocabulary
vocab = wiki_index.index.get_level_values(0).tolist()

wiki_index

Unnamed: 0_level_0,Unnamed: 1_level_0,Dictionary
Term,Frequency,Unnamed: 2_level_1
0,95,"{0, 2, 4, 5, 9, 11, 12, 15, 17, 18, 21, 22, 23..."
00,1,{12}
00023,1,{30}
0004,1,{12}
001,1,{25}
...,...,...
zoom,1,{35}
zoon,1,{32}
zurich,1,{18}
zuse,1,{5}


In [None]:
# # handle alphanumerics?
# for word in wiki_index.index:
#   if bool(re.match(r'\d+', word)):
#     print(word, wiki_index.loc[word]['Postings'])

# TFIDF

In [None]:
# Fit TF-IDF over the preprocessed article strings, with the columns fixed to
# the index terms in `vocab` (one column per term, in that order).
# NOTE(review): the vectorizer is created with its default tokenizer settings, which
# presumably ignore 1-character tokens, so such vocab terms would stay all-zero — confirm.
tfidf = TfidfVectorizer(vocabulary=vocab)
tfidf_matrix = tfidf.fit_transform(articles["Tokens"])

In [None]:
def tfidf_IRS(query, tfidf):
  """Rank all articles against a query using TF-IDF cosine similarity.

  Args:
    query: raw query string; it is expanded with synonyms and preprocessed
      the same way as the articles.
    tfidf: the fitted vectorizer used to build the global `tfidf_matrix`.

  Returns:
    DataFrame indexed by article row position, sorted by descending 'score',
    with the 'gain' column added by get_gains.
  """
  query_expanded = expand_query_with_synonyms(query)
  query_clean = preprocess(clean_text(query_expanded)).split()

  # combine tfidf of query to make size (1, vocab)
  # NOTE(review): each token is transformed as its own one-word document and the
  # rows are summed; transforming the joined query once may be what was intended — confirm.
  matrix = tfidf.transform(query_clean)
  combined = np.array(matrix.sum(axis=0))
  query_matrix = csr_matrix(combined)

  # one call scores every article row against the query vector
  scores = cosine_similarity(tfidf_matrix, query_matrix).ravel()

  scores_tfidf = pd.DataFrame({'score': scores})
  sorted_tfidf = scores_tfidf.sort_values(by='score', ascending = False)

  return get_gains(sorted_tfidf)

# Word2Vec

In [None]:
# Dimensionality of every word embedding.
emb_size = 3

# Word2Vec trains on sentences, so flatten the per-article sentence lists into one list.
flattened = [sentence for article in articles["Tokens_list"] for sentence in article]
# Same corpus and hyper-parameters for both models; only `sg` differs (0 vs 1),
# matching the cbow / skip names.
w2v_cbow = Word2Vec(sentences=flattened, vector_size=emb_size, window=5, min_count=1, sg=0)
w2v_skip = Word2Vec(sentences=flattened, vector_size=emb_size, window=5, min_count=1, sg=1)

# Pad every sentence to longest_sent tokens, then replace each token by its vector.
# Tokens missing from the model (such as the "<oov>" padding, which was added after
# training) become zero vectors.
articles["Embeddings_cbow"] = articles["Tokens_list"].apply(lambda article: pad_sents(article, longest_sent))
articles["Embeddings_cbow"] = articles["Embeddings_cbow"].apply(lambda article: np.array([[w2v_cbow.wv[word] if word in w2v_cbow.wv else np.zeros(emb_size) for word in sent] for sent in article]))

# Same two steps with the second model.
articles["Embeddings_skip"] = articles["Tokens_list"].apply(lambda article: pad_sents(article, longest_sent))
articles["Embeddings_skip"] = articles["Embeddings_skip"].apply(lambda article: np.array([[w2v_skip.wv[word] if word in w2v_skip.wv else np.zeros(emb_size) for word in sent] for sent in article]))

In [None]:
def w2v_IRS(embeddings, query, w2v, longest, emb):
  """Rank all articles against a query using Word2Vec embeddings.

  Args:
    embeddings: Series holding, per article, an array of per-sentence
      word-vector matrices (as built in the Word2Vec cell).
    query: raw query string; it is expanded with synonyms and preprocessed
      the same way as the articles.
    w2v: the trained Word2Vec model that produced `embeddings`.
    longest: minimum length the query is padded to with "<oov>" tokens.
    emb: embedding size, used for the zero vector of unknown tokens.

  Returns:
    DataFrame indexed by article id, sorted by descending 'score' (the mean
    per-sentence similarity), with the 'gain' column added by get_gains.
  """
  query_expanded = expand_query_with_synonyms(query)
  query_clean = preprocess(clean_text(query_expanded)).split()

  # pad query (using the `longest` argument, not the notebook-level global)
  query_padded = query_clean + ["<oov>"] * (longest - len(query_clean))

  # replace with embedding
  query_vector = np.array([w2v.wv[word] if word in w2v.wv else np.zeros(emb) for word in query_padded])

  scores_w2v = {}
  for doc_id in range(embeddings.shape[0]):
    # NOTE(review): cosine_similarity returns a (sentence tokens x query tokens) matrix and
    # [0][0] keeps only first-token vs first-token; confirm whether a whole-sentence
    # comparison was intended.
    similarities = [cosine_similarity(vector, query_vector)[0][0] for vector in embeddings[doc_id]]
    # an article with no sentences scores 0 instead of dividing by zero
    scores_w2v[doc_id] = sum(similarities) / len(similarities) if similarities else 0.0

  scores_w2v = pd.DataFrame({'score': scores_w2v})
  sorted_w2v = scores_w2v.sort_values(by='score', ascending = False)

  return get_gains(sorted_w2v)

# Query

# Cosine Similarity

In [None]:
# scores_tfidf = {i: 0 for i in range(60)}

# scores = [cosine_similarity(vector, query_matrix)[0][0] for vector in tfidf_matrix]

# scores_tfidf = pd.DataFrame({'score': scores})
# sorted_tfidf = scores_tfidf.sort_values(by='score', ascending = False)

# sorted_tfidf = get_gains(sorted_tfidf)
# print(sorted_tfidf[:10])

In [None]:
# scores_w2v = {i: 0 for i in range(60)}
# # import sys

# # np.concatenate(np.vstack(df["Embeddings"][42][1]))

# # Set printing options
# # np.set_printoptions(precision=4, suppress=True, threshold=sys.maxsize)

# for id in range(articles.shape[0]):
# # for id in range(2,3):
#   scores = []
#   for vector in articles["Embeddings"][id]:
#     # vector = np.concatenate(np.vstack(sent))

#     padded_query_vector_tok = query_vector_tok.copy()
#     padded_vector = vector.copy()

#     if len(vector) > len(query_vector_tok):
#         padded_query_vector_tok = np.pad(query_vector_tok, (0, len(vector) - len(query_vector_tok)), mode='constant')
#     elif len(query_vector_tok) > len(vector):
#         padded_vector = np.pad(vector, (0, len(query_vector_tok) - len(vector)), mode='constant')
#     print(len(padded_vector)/100, len(padded_query_vector_tok)/100)
#     if (np.isnan(padded_query_vector_tok).any()):
#       print(True)
#     if(np.isnan(padded_vector).any()):
#       print(True)
#     norm_vector = np.linalg.norm(vector)

#     Handle zero vectors
#     if norm_query == 0 or norm_vector == 0:
#         scores.append(0)
#         continue
#     unit_vector = vector / norm_vector
#     if np.isnan(padded_vector).any():
#       print(True)
#     print(padded_vector)

#     similarity = cosine_similarity(vector, query_vector_tok)[0][0]
#     scores.append(similarity)
#   #   # print(len(vector)/100)
#   #   # print(vector[[-1]])
#   #   # print(similarity)
#   average_similarity_score = sum(scores) / len(scores)
#   scores_w2v[id] = average_similarity_score

#   # # print(len(vector))

# scores_w2v = pd.DataFrame({'score': scores_w2v})
# sorted_w2v = scores_w2v.sort_values(by='score', ascending = False)
# # sorted
# # sorted = scores_df.sort_values(ascending = False)

In [None]:
# scores_w2v = {i: 0 for i in range(60)}

# for id in range(articles.shape[0]):
#   scores = []
#   for vector in articles["Embeddings"][id]:
#     similarity = cosine_similarity(vector, query_vector)[0][0]
#     scores.append(similarity)
#   average_similarity_score = sum(scores) / len(scores)
#   scores_w2v[id] = average_similarity_score

# scores_w2v = pd.DataFrame({'score': scores_w2v})
# sorted_w2v = scores_w2v.sort_values(by='score', ascending = False)

# sorted_w2v = get_gains(sorted_w2v)
# print(sorted_w2v[:10])

# Evaluation

In [None]:
# Per-query NDCG scores, appended to by each evaluation cell below.
# Re-running an evaluation cell appends again, so re-run this cell first to reset.
eval_word2vec = []
eval_tfidf = []

In [None]:
my_query = "what is artificial intelligence?"

# Expert relevance judgments (0-3) for docIDs 0-19; finalize_df fills the rest with 0.
data = {'docID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
      'ideal_gain': [3, 2, 2, 3, 0, 2, 3, 1, 2, 2, 1, 0, 0, 2, 2, 2, 1, 1, 2, 0]}

df1 = pd.DataFrame(data)
df_query1 = finalize_df(df1)

# Score the query with both models; sort_index puts the rows back in docID order.
df_word2vec_query1 = w2v_IRS(articles["Embeddings_cbow"], my_query, w2v_cbow, longest_sent, emb_size).sort_index()
df_tfidf_query1 = tfidf_IRS(my_query, tfidf).sort_index()

# One evaluation pass per model; `ndcg` replaces the old `eval` name, which shadowed the builtin.
runs = (
    ("WORD2VEC", df_word2vec_query1, eval_word2vec),
    ("TFIDF", df_tfidf_query1, eval_tfidf),
)
for position, (label, model_df, history) in enumerate(runs):
  if position:
    print("\n")

  combined = combine_df(df_query1, model_df)
  combined = combined.sort_values(by = 'ideal_gain', ascending = False)

  ideal_gain = combined['ideal_gain'].tolist()
  gain = combined['gain'].tolist()

  # ideal_gain is sorted descending, so the first 0 sits at index = number of relevant docs
  k_val = ideal_gain.index(0)

  ndcg = ndcg_score([ideal_gain], [gain], k = k_val)
  history.append(ndcg)

  print(f"NDCG score ({label}): ", ndcg)

NDCG score (WORD2VEC):  0.2748454742404674


NDCG score (TFIDF):  0.7761080040656038


In [None]:
my_query = "what is data science?"

# Expert relevance judgments (0-3) for docIDs 20-39; finalize_df fills the rest with 0.
data = {'docID': [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
      'ideal_gain': [3, 1, 2, 0, 1, 2, 2, 2, 2, 0, 1, 2, 2, 0, 1, 1, 2, 1, 2, 2]}

df2 = pd.DataFrame(data)
df_query2 = finalize_df(df2)

# Score the query with both models; sort_index puts the rows back in docID order.
df_word2vec_query2 = w2v_IRS(articles["Embeddings_cbow"], my_query, w2v_cbow, longest_sent, emb_size).sort_index()
df_tfidf_query2 = tfidf_IRS(my_query, tfidf).sort_index()

# One evaluation pass per model; `ndcg` replaces the old `eval` name, which shadowed the builtin.
runs = (
    ("WORD2VEC", df_word2vec_query2, eval_word2vec),
    ("TFIDF", df_tfidf_query2, eval_tfidf),
)
for position, (label, model_df, history) in enumerate(runs):
  if position:
    print("\n")

  combined = combine_df(df_query2, model_df)
  combined = combined.sort_values(by = 'ideal_gain', ascending = False)

  ideal_gain = combined['ideal_gain'].tolist()
  gain = combined['gain'].tolist()

  # ideal_gain is sorted descending, so the first 0 sits at index = number of relevant docs
  k_val = ideal_gain.index(0)

  ndcg = ndcg_score([ideal_gain], [gain], k = k_val)
  history.append(ndcg)

  print(f"NDCG score ({label}): ", ndcg)

NDCG score (WORD2VEC):  0.3028125992806813


NDCG score (TFIDF):  0.7523672216429602


In [None]:
my_query = "what is a database?"

# Expert relevance judgments (0-3) for docIDs 40-59; finalize_df fills the rest with 0.
data = {'docID': [40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
        'ideal_gain': [3, 0, 3, 1, 2, 3, 3, 3, 2, 2, 1, 3, 2, 1, 2, 1, 3, 1, 3, 2]}

df3 = pd.DataFrame(data)
df_query3 = finalize_df(df3)

# Score the query with both models; sort_index puts the rows back in docID order.
df_word2vec_query3 = w2v_IRS(articles["Embeddings_cbow"], my_query, w2v_cbow, longest_sent, emb_size).sort_index()
df_tfidf_query3 = tfidf_IRS(my_query, tfidf).sort_index()

# One evaluation pass per model; `ndcg` replaces the old `eval` name, which shadowed the builtin.
runs = (
    ("WORD2VEC", df_word2vec_query3, eval_word2vec),
    ("TFIDF", df_tfidf_query3, eval_tfidf),
)
for position, (label, model_df, history) in enumerate(runs):
  if position:
    print("\n")

  combined = combine_df(df_query3, model_df)
  combined = combined.sort_values(by = 'ideal_gain', ascending = False)

  ideal_gain = combined['ideal_gain'].tolist()
  gain = combined['gain'].tolist()

  # ideal_gain is sorted descending, so the first 0 sits at index = number of relevant docs
  k_val = ideal_gain.index(0)

  ndcg = ndcg_score([ideal_gain], [gain], k = k_val)
  history.append(ndcg)

  print(f"NDCG score ({label}): ", ndcg)

NDCG score (WORD2VEC):  0.4159752893739773


NDCG score (TFIDF):  0.732952443075378


In [None]:
my_query = "what is machine learning?"

# Expert relevance judgments (0-3) for docIDs 0-19; finalize_df fills the rest with 0.
docID = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
Rank = [2, 0, 0, 3, 0, 0, 1, 0, 2, 3, 0, 0, 1, 2, 1, 3, 1, 0, 3, 1]

df4 = pd.DataFrame({'docID': docID, 'ideal_gain': Rank})
df_query4 = finalize_df(df4)

# Score the query with both models; sort_index puts the rows back in docID order.
df_word2vec_query4 = w2v_IRS(articles["Embeddings_cbow"], my_query, w2v_cbow, longest_sent, emb_size).sort_index()
df_tfidf_query4 = tfidf_IRS(my_query, tfidf).sort_index()

# One evaluation pass per model; `ndcg` replaces the old `eval` name, which shadowed the builtin.
runs = (
    ("WORD2VEC", df_word2vec_query4, eval_word2vec),
    ("TFIDF", df_tfidf_query4, eval_tfidf),
)
for position, (label, model_df, history) in enumerate(runs):
  if position:
    print("\n")

  combined = combine_df(df_query4, model_df)
  combined = combined.sort_values(by = 'ideal_gain', ascending = False)

  ideal_gain = combined['ideal_gain'].tolist()
  gain = combined['gain'].tolist()

  # ideal_gain is sorted descending, so the first 0 sits at index = number of relevant docs
  k_val = ideal_gain.index(0)

  ndcg = ndcg_score([ideal_gain], [gain], k = k_val)
  history.append(ndcg)

  print(f"NDCG score ({label}): ", ndcg)

NDCG score (WORD2VEC):  0.2049956614487451


NDCG score (TFIDF):  0.6369318083805976


In [None]:
my_query = "data storage?"

# Expert relevance judgments (0-3) for all 60 docIDs, so finalize_df is not needed here.
data = {'docID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,  21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
        'ideal_gain': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 3, 0, 2, 0, 0, 3, 2, 1, 3, 0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 3, 0, 3, 1, 0, 3, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 1]}

df_query5 = pd.DataFrame(data)

# Score the query with both models; sort_index puts the rows back in docID order.
df_word2vec_query5 = w2v_IRS(articles["Embeddings_cbow"], my_query, w2v_cbow, longest_sent, emb_size).sort_index()
df_tfidf_query5 = tfidf_IRS(my_query, tfidf).sort_index()

# One evaluation pass per model; `ndcg` replaces the old `eval` name, which shadowed the builtin.
runs = (
    ("WORD2VEC", df_word2vec_query5, eval_word2vec),
    ("TFIDF", df_tfidf_query5, eval_tfidf),
)
for position, (label, model_df, history) in enumerate(runs):
  if position:
    print("\n")

  combined = combine_df(df_query5, model_df)
  combined = combined.sort_values(by = 'ideal_gain', ascending = False)

  ideal_gain = combined['ideal_gain'].tolist()
  gain = combined['gain'].tolist()

  # ideal_gain is sorted descending, so the first 0 sits at index = number of relevant docs
  k_val = ideal_gain.index(0)

  ndcg = ndcg_score([ideal_gain], [gain], k = k_val)
  history.append(ndcg)

  print(f"NDCG score ({label}): ", ndcg)

NDCG score (WORD2VEC):  0.33927626990687654


NDCG score (TFIDF):  0.4851628934216609


In [None]:
my_query = "database management"

# Expert relevance judgments (0-3) for docIDs 40-59; finalize_df fills the rest with 0.
data = {'docID': [40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
        'ideal_gain': [1, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 3]}

df6 = pd.DataFrame(data)
df_query6 = finalize_df(df6)

# Score the query with both models; sort_index puts the rows back in docID order.
df_word2vec_query6 = w2v_IRS(articles["Embeddings_cbow"], my_query, w2v_cbow, longest_sent, emb_size).sort_index()
df_tfidf_query6 = tfidf_IRS(my_query, tfidf).sort_index()

# One evaluation pass per model; `ndcg` replaces the old `eval` name, which shadowed the builtin.
runs = (
    ("WORD2VEC", df_word2vec_query6, eval_word2vec),
    ("TFIDF", df_tfidf_query6, eval_tfidf),
)
for position, (label, model_df, history) in enumerate(runs):
  if position:
    print("\n")

  combined = combine_df(df_query6, model_df)
  combined = combined.sort_values(by = 'ideal_gain', ascending = False)

  ideal_gain = combined['ideal_gain'].tolist()
  gain = combined['gain'].tolist()

  # ideal_gain is sorted descending, so the first 0 sits at index = number of relevant docs
  k_val = ideal_gain.index(0)

  ndcg = ndcg_score([ideal_gain], [gain], k = k_val)
  history.append(ndcg)

  print(f"NDCG score ({label}): ", ndcg)

NDCG score (WORD2VEC):  0.15817712631939626


NDCG score (TFIDF):  0.28496255468093407


# Evaluation Values

In [None]:
# Show the per-query Word2Vec NDCG scores, then their arithmetic mean.
# Raises ZeroDivisionError if no evaluation cell has been run yet.
print(eval_word2vec)
word2vec_e = sum(eval_word2vec) / len(eval_word2vec)
print('\n')
print(word2vec_e)

[0.2748454742404674, 0.3028125992806813, 0.4159752893739773, 0.2049956614487451, 0.33927626990687654, 0.15817712631939626]


0.2826804034283573


In [None]:
# Show the per-query TF-IDF NDCG scores, then their arithmetic mean.
# Raises ZeroDivisionError if no evaluation cell has been run yet.
print(eval_tfidf)
tfidf_e = sum(eval_tfidf) / len(eval_tfidf)
print('\n')
print(tfidf_e)

[0.7761080040656038, 0.7523672216429602, 0.732952443075378, 0.6369318083805976, 0.4851628934216609, 0.28496255468093407]


0.6114141542111892


# Interface

In [None]:
text_input = widgets.Text(description="Enter query:", value='')

button = widgets.Button(description='Generate Results')

# number of results to list per query
amount = 10

def on_button_click(b, articles_df, amount):
  """Button callback: rank the articles for the typed query and print the top hits.

  Args:
    b: the clicked button (passed by ipywidgets, unused).
    articles_df: DataFrame with 'Title' and 'URL' columns, indexed by article id.
    amount: maximum number of results to print.
  """
  query = text_input.value

  # a blank query has nothing to rank; redraw the widgets and ask for input
  if not query.strip():
    clear_output()
    display(text_input, button)
    print("Please enter a query.")
    return

  result_tfidf = tfidf_IRS(query, tfidf)
  # result_w2v = w2v_IRS(articles["Embeddings_cbow"], query, w2v_cbow, longest_sent, emb_size)
  # result = w2v_IRS(articles["Embeddings_skip"], query, w2v_skip, longest_sent, emb_size)

  # clearing the cell output also removes the widgets, so show them again
  clear_output()
  display(text_input, button)
  for rank, doc_id in enumerate(result_tfidf.index[:amount], start=1):
    print(str(rank) + '. ' + articles_df.loc[doc_id, 'Title'] + ' \nURL: ' + articles_df.loc[doc_id, 'URL'])

button.on_click(lambda b: on_button_click(b, articles, amount))

display(text_input, button)

Text(value='AI career options', description='Enter query:')

Button(description='Generate Results', style=ButtonStyle())

1. Artificial intelligence 
URL: https://en.wikipedia.org/wiki/Artificial_intelligence
2. Regulation of artificial intelligence 
URL: https://en.wikipedia.org/wiki/Regulation_of_artificial_intelligence
3. Ethics of artificial intelligence 
URL: https://en.wikipedia.org/wiki/Ethics_of_artificial_intelligence
4. Applications of artificial intelligence 
URL: https://en.wikipedia.org/wiki/Applications_of_artificial_intelligence
5. Artificial general intelligence 
URL: https://en.wikipedia.org/wiki/Artificial_general_intelligence
6. Generative artificial intelligence 
URL: https://en.wikipedia.org/wiki/Generative_artificial_intelligence
7. History of artificial intelligence 
URL: https://en.wikipedia.org/wiki/History_of_artificial_intelligence
8. Artificial Intelligence Act 
URL: https://en.wikipedia.org/wiki/Artificial_Intelligence_Act
9. Artificial intelligence in healthcare 
URL: https://en.wikipedia.org/wiki/Artificial_intelligence_in_healthcare
10. Artificial intelligence art 
URL: htt