In [29]:
import torch
import contractions
import unidecode
import re
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset
import pandas as pd


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = T5Tokenizer.from_pretrained('castorini/doc2query-t5-base-msmarco')
Queriesmodel = T5ForConditionalGeneration.from_pretrained('castorini/doc2query-t5-base-msmarco')
Queriesmodel.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [80]:
dataset = load_dataset("multi_news")

Found cached dataset multi_news (/home/cse-p07-g07f/.cache/huggingface/datasets/multi_news/default/1.0.0/2f1f69a2bedc8ad1c5d8ae5148e4755ee7095f465c1c01ae8f85454342065a72)
100%|██████████| 3/3 [00:00<00:00, 1995.70it/s]


In [81]:
# create a pandas dataframe from the dataset
df_train = pd.DataFrame(dataset['train'])
df_test  = pd.DataFrame(dataset['test'])
df_val   = pd.DataFrame(dataset['validation'])
#merge all the dataframes into one
df_docs = pd.concat([df_train, df_test, df_val], ignore_index=True)

In [82]:
df_docs

Unnamed: 0,document,summary
0,"National Archives \n \n Yes, it’s that time ag...",– The unemployment rate dropped to 8.2% last m...
1,LOS ANGELES (AP) — In her first interview sinc...,"– Shelly Sterling plans ""eventually"" to divorc..."
2,"GAITHERSBURG, Md. (AP) — A small, private jet ...",– A twin-engine Embraer jet that the FAA descr...
3,Tucker Carlson Exposes His Own Sexism on Twitt...,– Tucker Carlson is in deep doodoo with conser...
4,A man accused of removing another man's testic...,– What are the three most horrifying words in ...
...,...,...
56211,WASHINGTON — The deep recession wiped out prim...,– Employment rates in the US have bounced back...
56212,Photo by Luca Francesco Giovanni Bertolli/iSto...,– Slate columnist Emily Yoffe is taking a lot ...
56213,"“Ah, what a trifle is a heart, \n \n If once i...",– Alain Carpentier thinks he has an answer to ...
56214,These crawls are part of an effort to archive ...,– Everyone in the UK must have seen the Rotten...


In [94]:
def preprocess(doc):
    doc = unidecode.unidecode(doc) # transliterates any unicode string into the closest possible representation in ascii text.
    doc = contractions.fix(doc) # expands contractions                   
    doc = re.sub('[\t\n]', ' ', doc) # remove newlines and tabs
    doc = re.sub(r'@[A-Za-z0-9_]+', '', doc) # remove mentions
    doc = re.sub(r'#[A-Za-z0-9_]+', '', doc) #remove hashtags
    doc = re.sub(r'https?://[^ ]+', '', doc)
    doc = re.sub(r'www.[^ ]+', '', doc)
    doc = re.sub('[^A-Za-z]+', ' ', doc) # remove all characters other than alphabet
    doc = re.sub(' +', ' ', doc) # substitute any number of space with one space only
    doc = doc.strip() # remove spaces from begining and end and lower the text
    return doc

In [84]:
df_docs.shape

(56216, 2)

In [85]:
df_docs= df_docs[df_docs['document']!=""]
df_docs= df_docs[df_docs['summary']!=""]

In [86]:
df_docs['document']= df_docs['document'].apply(lambda x: preprocess(x))
df_docs['summary']= df_docs['summary'].apply(lambda x: preprocess(x))


Unnamed: 0,document,summary
0,National Archives Yes it is that time again fo...,The unemployment rate dropped to last month bu...
1,LOS ANGELES AP In her first interview since th...,Shelly Sterling plans eventually to divorce he...
2,GAITHERSBURG Md AP A small private jet has cra...,A twin engine Embraer jet that the FAA describ...
3,Tucker Carlson Exposes His Own Sexism on Twitt...,Tucker Carlson is in deep doodoo with conserva...
4,A man accused of removing another man s testic...,What are the three most horrifying words in th...


### Document VS. Summary

In [57]:
corpus= df_docs['document'].tolist()
queries= df_docs['summary'].tolist()

In [30]:
from sentence_transformers import  SentenceTransformer
model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L6-cos-v5')

In [17]:
corpusVectors= model.encode(corpus)
queriesVectors= model.encode(queries)

In [18]:
def search(queryVector, corpusVectors):
    from sklearn.neighbors import NearestNeighbors
    nbrs = NearestNeighbors(n_neighbors=5, algorithm='brute').fit(corpusVectors)
    distances, indices = nbrs.kneighbors([queryVector])
    return indices

#function to get the top 5 similar documents for each query
def getSimilarDocs(queryVector, corpusVectors, corpus):
    indices= search(queryVector, corpusVectors)
    return [corpus[i] for i in indices[0]]

In [19]:
right=0
for idx, query in enumerate(queries):
    outCorups= getSimilarDocs(queriesVectors[idx], corpusVectors, corpus)
    right+= corpus[idx] in outCorups

In [21]:
(right/len(corpus))*100

85.59940219905349

### Summary vs. Queries

In [58]:
docs= df_docs['document'].tolist()
queries= df_docs['summary'].tolist()

In [23]:
in__= df_docs['summary'].tolist()

In [31]:
def get_query(doc):
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  input_ids = tokenizer.encode(doc, return_tensors='pt').to(device)
  my_set = set()
  outputs = Queriesmodel.generate(
      input_ids=input_ids,
      max_length=2000,
      do_sample=True,
      top_k=10,
      num_return_sequences=20)

  for i in range(20):
      my_set.add(tokenizer.decode(outputs[i], skip_special_tokens=True))
  return ". ".join(my_set)

In [32]:
Myfile=open('FinalOut2.csv','w')

In [33]:
for doc in in__:
  Myfile.write(doc)
  Myfile.write(",")
  Myfile.write(get_query(doc))
  Myfile.write('\n')
  torch.cuda.empty_cache()

Token indices sequence length is longer than the specified maximum sequence length for this model (576 > 512). Running this sequence through the model will result in indexing errors


In [100]:
df= pd.read_csv('FinalOut.csv')
df.head()

Unnamed: 0,summary,query
0,The unemployment rate dropped to last month bu...,why is unemployment rate dropping?. why did th...
1,Shelly Sterling plans eventually to divorce he...,how did shelly sterling get divorced. is shell...
2,A twin engine Embraer jet that the FAA describ...,what was the airplane in the blaze. how did th...
3,Tucker Carlson is in deep doodoo with conserva...,who was tweeting at sarah palin. who tweeted a...
4,What are the three most horrifying words in th...,what is the word for testicles. what is the mo...


In [101]:
#convert the query in each row to a list
df['query']= df['query'].apply(lambda x: x.split('.'))
df.head()

Unnamed: 0,summary,query
0,The unemployment rate dropped to last month bu...,"[why is unemployment rate dropping?, why did ..."
1,Shelly Sterling plans eventually to divorce he...,"[how did shelly sterling get divorced, is she..."
2,A twin engine Embraer jet that the FAA describ...,"[what was the airplane in the blaze, how did ..."
3,Tucker Carlson is in deep doodoo with conserva...,"[who was tweeting at sarah palin, who tweeted..."
4,What are the three most horrifying words in th...,"[what is the word for testicles, what is the ..."


In [102]:
#add each element in the list of queries to a new row with the same document
df2= df.explode('query')
df2.head()

Unnamed: 0,summary,query
0,The unemployment rate dropped to last month bu...,why is unemployment rate dropping?
0,The unemployment rate dropped to last month bu...,why did the unemployment rate drop in america
0,The unemployment rate dropped to last month bu...,why is labor market rate dropping
0,The unemployment rate dropped to last month bu...,why did the unemployment rate drop?
0,The unemployment rate dropped to last month bu...,why did the unemployment rate drop


In [103]:
df2.drop_duplicates()

Unnamed: 0,summary,query
0,The unemployment rate dropped to last month bu...,why is unemployment rate dropping?
0,The unemployment rate dropped to last month bu...,why did the unemployment rate drop in america
0,The unemployment rate dropped to last month bu...,why is labor market rate dropping
0,The unemployment rate dropped to last month bu...,why did the unemployment rate drop?
0,The unemployment rate dropped to last month bu...,why did the unemployment rate drop
...,...,...
56203,Alain Carpentier thinks he has an answer to ch...,who invented the first self regulating heart
56203,Alain Carpentier thinks he has an answer to ch...,who invented the first self regulation heart ...
56203,Alain Carpentier thinks he has an answer to ch...,which company has made the artificial heart
56203,Alain Carpentier thinks he has an answer to ch...,who created carmat heart


In [107]:
corpus= df2['summary'].tolist()
queries= df2['query'].tolist()

In [108]:
from sentence_transformers import  SentenceTransformer
model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L6-cos-v5')

In [110]:
corpusVectors= model.encode(corpus)
queriesVectors= model.encode(queries)

In [111]:
def search(queryVector, corpusVectors):
    from sklearn.neighbors import NearestNeighbors
    nbrs = NearestNeighbors(n_neighbors=5, algorithm='brute').fit(corpusVectors)
    distances, indices = nbrs.kneighbors([queryVector])
    return indices

#function to get the top 5 similar documents for each query
def getSimilarDocs(queryVector, corpusVectors, corpus):
    indices= search(queryVector, corpusVectors)
    return [corpus[i] for i in indices[0]]

right=0
for idx, query in enumerate(queries):
    outCorups= getSimilarDocs(queriesVectors[idx], corpusVectors, corpus)
    right+= corpus[idx] in outCorups

print((right/len(corpus))*100)

### Document Vs Queries

In [90]:
#inner join df and df_docs on summary
df= df.merge(df_docs, on='summary', how='inner')
df.head()

Unnamed: 0,summary,query,document
0,The unemployment rate dropped to last month bu...,why is unemployment rate dropping?. why did th...,National Archives Yes it is that time again fo...
1,Shelly Sterling plans eventually to divorce he...,how did shelly sterling get divorced. is shell...,LOS ANGELES AP In her first interview since th...
2,A twin engine Embraer jet that the FAA describ...,what was the airplane in the blaze. how did th...,GAITHERSBURG Md AP A small private jet has cra...
3,Tucker Carlson is in deep doodoo with conserva...,who was tweeting at sarah palin. who tweeted a...,Tucker Carlson Exposes His Own Sexism on Twitt...
4,What are the three most horrifying words in th...,what is the word for testicles. what is the mo...,A man accused of removing another man s testic...


In [91]:
corpus= df_docs['document'].tolist()
queries= df['query'].tolist()

In [92]:
corpusVectors= model.encode(corpus)
queriesVectors= model.encode(queries)

In [93]:
def search(queryVector, corpusVectors):
    from sklearn.neighbors import NearestNeighbors
    nbrs = NearestNeighbors(n_neighbors=5, algorithm='brute').fit(corpusVectors)
    distances, indices = nbrs.kneighbors([queryVector])
    return indices

#function to get the top 5 similar documents for each query
def getSimilarDocs(queryVector, corpusVectors, corpus):
    indices= search(queryVector, corpusVectors)
    return [corpus[i] for i in indices[0]]

right=0
for idx, query in enumerate(queries):
    outCorups= getSimilarDocs(queriesVectors[idx], corpusVectors, corpus)
    right+= corpus[idx] in outCorups

print((right/len(corpus))*100)

74.1984841475999
