In [1]:
from preprocess import prepare
import pandas as pd
import os

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\modar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\modar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\modar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def read_file(path):
    with open(path,"r",encoding="utf-8") as f:
        lines=f.readlines()
    doc=[line for line in lines if line!="\n"]
    doc=" ".join(doc)
    return doc

In [3]:
def save_inDic(text,filename,category):
    dic={"address": str(f"{filename}"), "html": text, "hash_code": str(f"{filename}")}
    return dic

In [4]:
#passing the texts to the prepare method to modify them for the search
def prepare_files(path,filename,category,df):
    doc=read_file(path)
    dic=save_inDic(doc,filename,category)
    prepare(dic,df)

In [7]:
df = pd.DataFrame(columns=["address","html","hash_code","embedding_index","mean_vector"])
if not os.path.isfile("back_up/modified_data_redundant.csv"):
    df.to_csv("back_up/modified_data_redundant.csv",index=False)
df=pd.read_csv("back_up/modified_data_redundant.csv")
len(df)

0

In [8]:
#reading the texts from the BBC News dataset
categories=["business","entertainment","politics","sport","tech"]
for category in categories:
    folder_path= f"bbc-fulltext/bbc/{category}"
    for filename in os.listdir(folder_path):
        path = f"bbc-fulltext/bbc/{category}/{filename}"
        prepare_files(path,filename,category,df)
        print(f"Processing file: {category} {filename}    ", end="\r")
    df.to_csv("back_up/modified_data_redundant.csv",index=False)

Processing file: tech 401.txt        t    

In [40]:
df_wiki = pd.DataFrame(columns=["address","html","hash_code","embedding_index","mean_vector"])
if not os.path.isfile("back_up/modified_data_wiki.csv"):
    df_wiki.to_csv("back_up/modified_data_wiki.csv",index=False)
df_wiki=pd.read_csv("back_up/modified_data_wiki.csv")
len(df_wiki)

0

In [41]:
#read the texts from the wikipedia dataset
folder_path= f"back_up/wikis"
for filename in sorted(os.listdir(folder_path),key=lambda x: int(x.split(".")[0])):
    path = f"back_up/wikis/{filename}"
    prepare_files(path,filename,"category",df_wiki)
    print(f"Processing file: {filename} ", end="\r")
df_wiki.to_csv("back_up/modified_data_wiki.csv",index=False)

Processing file: 150.txt 

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
import random
import re
from nltk.corpus import stopwords 
import nltk
import pandas as pd
import os

In [8]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\modar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
def clean_text(sent):
    cleaned_sentence = re.sub(r'\b\d+\b', '', sent)
    tokens = sent.split()
    stop_words = set(stopwords.words("english"))
    no_stop_words = [token for token in tokens
                      if token not in stop_words
                        and token.isalpha()]
    cleaned_sentence = " ".join(no_stop_words)
    english_letters_pattern = re.compile(r'[a-zA-Z]+')
    english_letters = english_letters_pattern.findall(cleaned_sentence)
    cleaned_text = ' '.join(english_letters)
    return cleaned_text

In [10]:
#generates a list of sentences from the passed text
def extract_sents(doc):
    #a=[e.split(".") for e  in  doc]
    a=[nltk.sent_tokenize(e) for e in doc]
    flattened_list = [item for sublist in a for item in sublist if item!="\n"]
    cleaned_doc= [clean_text(sentence) for sentence in flattened_list if sentence!=""]
    return [sent for sent in cleaned_doc if sent!=""]

In [15]:
# v=TfidfVectorizer()
# transformed=v.fit_transform(extract_sents(["US and UK, raising millions of dollars for African famine relief. The re-release also marks the 20th anniversary of the original recording"]))
# vectors= transformed.toarray()
# v.vocabulary_

{'us': 11,
 'raising': 8,
 'millions': 6,
 'dollars': 3,
 'african': 0,
 'famine': 4,
 'the': 10,
 'also': 1,
 'marks': 5,
 'anniversary': 2,
 'original': 7,
 'recording': 9}

In [16]:
#extract_sents(["US and UK, raising  millions of dollars for African famine relief. The re-release also marks the 20th anniversary of the original recording"])

['US raising millions dollars African famine',
 'The also marks anniversary original recording']

In [17]:
#read the text file from a given path and save it to  list of lines
def read_for_query(path):
    with open(path,"r", encoding="utf-8") as f:
        lines=f.readlines()
    doc=[line for line in lines if line!="\n"]
    return doc

In [18]:
#Apply TF-IDF on a given document and extract cndidates for the query
def get_query_samples(doc):
    v=TfidfVectorizer()
    transformed=v.fit_transform(doc)
    vectors= transformed.toarray()
    mean=[]
    keys=list(v.vocabulary_.keys())
    for word in keys:
        values=[vectors[i][v.vocabulary_.get(word)]
         for i in range(len(vectors))]
        mean.append((word,sum(values)/len(values)))

    mean=sorted(mean,key= lambda x:x[1],reverse=True)
    query_samples=[mean[r][0] for r in random.sample(range(101), 100)]
    return query_samples

In [19]:
# def save_to_csv(query,filename,category,df):
#     document=str(f"{category}{filename}")
#     df.loc[len(df)]=[document,query]

In [None]:
#generates query for a given document for bbc News dataset
def generate_queries_for_BBC(category,filename,df):
    path = f"bbc-fulltext/bbc/{category}/{filename}"
    

    file=f"{filename}"
    if file in list(df["document"]):
        return

    doc=read_for_query(path)
    sents=extract_sents(doc)
    samples=[word for word in get_query_samples(sents)]
    #print(f"Processing file: {filename}    ", end="\r")
    df.loc[len(df)]= [f"{filename}"," ".join(samples)]
    df.at[len(df)-1,"document"]=f"{category}{filename}"
    df.at[len(df)-1,"query"]=" ".join(samples)
    # q=" ".join(samples)
    # print(f"query: {q}    ", end="\r")

In [20]:
#generates query for a given document for wikipedia dataset
def generate_queries_for_wiki(filename,df):
    path = f"back_up/wikis/{filename}"
    

    file=f"{filename}"
    if file in list(df["document"]):
        return

    doc=read_for_query(path)
    sents=extract_sents(doc)
    samples=[word for word in get_query_samples(sents)]
    #print(f"Processing file: {filename}    ", end="\r")
    df.loc[len(df)]= [f"{filename}"," ".join(samples)]
    # df.at[len(df)-1,"document"]=f"{category}{filename}"
    # df.at[len(df)-1,"query"]=" ".join(samples)
    #q=" ".join(samples)
    #print(f"query: {q}    ", end="\r")

In [121]:
df=pd.DataFrame(columns=["document","query"])
if not os.path.isfile("back_up/queries_wiki.csv"):
    df.to_csv("back_up/queries_wiki.csv",index=False)
df=pd.read_csv("back_up/queries_wiki.csv")

In [123]:
# generating queries for all documents in the given directory
#category="tech"
folder_path= "back_up/wikis"
for filename in sorted(os.listdir(folder_path),key=lambda x: int(x.split(".")[0])):
    path = f"back_up/wikis/{filename}"
    generate_queries_for_wiki(filename,df)
    print(f"Processing file: {filename}    ", end="\r")

Processing file: 140.txt    

In [124]:
df.to_csv("back_up/queries_wiki.csv",index=False)

Prepare the queries to suite both models fastText and Word2Vec

In [13]:
import numpy as np
import pandas as pd
import gensim.downloader
import fasttext

In [14]:
training_set = gensim.downloader.load('word2vec-google-news-300')

In [15]:
model=fasttext.load_model("cc.en.300.bin")



In [43]:
df_queries=pd.read_csv("back_up/queries_wiki.csv")
fasttext_queries=pd.read_csv("back_up/queries_wiki_fasttext.csv")
gensim_queries=pd.read_csv("back_up/queries_wiki_gensim.csv")
fasttext_queries.head()

Unnamed: 0,document,query,mean_vec
0,0.txt,government total august south robert national ...,"[-0.02220891, -0.008327238, -0.015622603, 0.04..."
1,1.txt,white health retrieved world american election...,"[-0.02980545, 0.011205166, -0.010511813, 0.053..."
2,2.txt,prince since on queen prime also bbc took in r...,"[-0.01771623, 0.006959884, -0.018862734, 0.054..."
3,3.txt,culture census business january many congress ...,"[-0.01835768, -0.0015837983, -0.01926455, 0.04..."
4,4.txt,messi in serie october mark united his decembe...,"[-0.0088726785, -0.0036963962, -0.04611436, 0...."


In [45]:
df_queries["mean_vec"]=[[] for i in range(0,len(df_queries))]

In [43]:
def mean_vector_gensim(input):
    keys=[x for x in input.split() if training_set.__contains__(x)]
    meanVec=np.mean(np.array(list(map(lambda x :
                                       training_set.get_vector(x) ,
                                         keys))),axis=0)
    return meanVec

In [49]:
def mean_vecs_fasttext(df):
    df["mean_vec"] = df["query"].apply(lambda x: list(np.mean([model.get_word_vector(w) for w in x.split()], axis=0)))

In [45]:
mean_vecs_fasttext(fasttext_queries)

In [47]:
bbc_queries_fasttext=pd.read_csv("back_up/labeld_queries_fasttext.csv")

In [50]:
mean_vecs_fasttext(bbc_queries_fasttext)

In [53]:
bbc_queries_fasttext.to_csv("back_up/labeld_queries_fasttext.csv")

In [46]:
#fasttext_queries.to_csv("back_up/queries_wiki_fasttext.csv",index=False)

In [32]:
#df_queries.to_csv("back_up/queries_wiki_fasttext.csv",index=False)
fasttext_queries.head()

Unnamed: 0,document,query,mean_vec
0,0.txt,government total august south robert national ...,"[0.0403925, -0.40450022, -0.15040405, -0.09556..."
1,1.txt,white health retrieved world american election...,"[0.037646394, -0.39482304, -0.14960004, -0.095..."
2,2.txt,prince since on queen prime also bbc took in r...,"[0.04185474, -0.4012402, -0.15507908, -0.10232..."
3,3.txt,culture census business january many congress ...,"[0.038593266, -0.40238896, -0.14885493, -0.098..."
4,4.txt,messi in serie october mark united his decembe...,"[0.045165416, -0.40830454, -0.15094325, -0.100..."


In [46]:
#after computing the mean vectors for the fastext model now we compute tthem for word2vec
for i,query in enumerate(df_queries["query"]):
    df_queries.at[i,"mean_vec"]=list(mean_vector_gensim(query))

In [47]:
#df_queries.to_csv("back_up/queries_wiki_gensim.csv",index=False)
gensim_queries.head()

Unnamed: 0,document,query
0,0.txt,government total august south robert national ...
1,1.txt,white health retrieved world american election...
2,2.txt,prince since on queen prime also bbc took in r...
3,3.txt,culture census business january many congress ...
4,4.txt,messi in serie october mark united his decembe...


In [85]:
df.head()

Unnamed: 0,document,query,mean_vec
0,0.txt,world the united april states archived origina...,[]
1,1.txt,april donald march michael november august oct...,[]
2,2.txt,philip pimlott the september queen in archived...,[]
3,3.txt,university june the february september may dec...,[]
4,4.txt,february may ronaldo madrid march october arch...,[]
