In [None]:
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertForMaskedLM
from torch.nn import functional as F
import torch


In [None]:
import nltk

# Tokenizer and POS-tagger data needed by nltk.word_tokenize / nltk.pos_tag.
# Recent NLTK releases (3.9+) look up "punkt_tab" and
# "averaged_perceptron_tagger_eng" instead of the legacy pickled resources,
# so both generations are fetched to keep the notebook runnable on a fresh
# kernel regardless of the installed NLTK version. nltk.download() reports an
# unknown name and returns False rather than raising.
for resource in ('punkt', 'punkt_tab',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
  nltk.download(resource)

In [None]:
# Colab-only: mount Google Drive at /content/drive so the CSV files read
# below from /content/drive/MyDrive are accessible.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the first question set (df_5000.csv) from the mounted Drive folder.
first_csv_path = '/content/drive/MyDrive/df_5000.csv'
dataframe = pd.read_csv(first_csv_path)

In [None]:
# Work on the first 30 rows only.
df = dataframe.head(30)

In [None]:
# Split every question into the text before and after its "[BLANK]" placeholder.
# str.partition always yields exactly one "before" and one "after" piece, so a
# question with several placeholders can no longer produce more pieces than the
# two columns named below (the unbounded split() did, and the DataFrame
# constructor then raised). A question without a placeholder gets None as its
# second piece. Iterating over the column replaces the hard-coded range(30).
split_arr = []
for question in df['QUESTION']:
  before, placeholder, after = question.partition("[BLANK]")
  split_arr.append([before, after if placeholder else None])

# Reuse df's index so the later column-wise concat lines the rows up.
df_split = pd.DataFrame(split_arr, columns=['split1','split2'], index=df.index)

In [None]:
# Attach the split columns to the question/answer columns.
# QUESTION/ANSWER are selected *before* the concat so the cell can be re-run:
# concatenating onto the already-extended df would add split1/split2 a second
# time, and df['split1'] would then return a two-column frame instead of text.
df = pd.concat([df[['QUESTION','ANSWER']], df_split], axis=1)

In [None]:
# Display the question/answer frame together with its split1/split2 columns.
df

Unnamed: 0,QUESTION,ANSWER,split1,split2
0,Increased content consumption on mobile device...,viewing,Increased content consumption on mobile device...,reached 33% of total online video plays in th...
1,National Australia Bank's earnings prospects w...,offsetting,National Australia Bank's earnings prospects w...,a weakness in consumer banking. Market income...
2,The success of TransCanada's $15 billion claim...,overturns,The success of TransCanada's $15 billion claim...,the denial. Most NAFTA disputes settle for an...
3,German carmakers may continue to lead factory ...,ran,German carmakers may continue to lead factory ...,at full capacity last year as production fell...
4,New Cotai resorts have helped fuel citywide re...,attract,New Cotai resorts have helped fuel citywide re...,guests and revenue volatility as recent cityw...
5,The combination of higher interest rates and p...,purchasing,The combination of higher interest rates and p...,Chubb Corp. and boosted the acquired portfoli...
6,Saudi Arabia baby food sales will surge 11% a ...,speed,Saudi Arabia baby food sales will surge 11% a ...,up implementation of policies aimed at boosti...
7,Nortel Networks Inc. is likely to request by J...,denied,Nortel Networks Inc. is likely to request by J...,such a request. The 3rd Circuit now must deci...
8,Makers of brand-name drugs and biologics will ...,delayed,Makers of brand-name drugs and biologics will ...,"multiple Obamacare-imposed taxes, including t..."
9,Two California owners of the shuttered San Ono...,supplied,Two California owners of the shuttered San Ono...,"defective steam generators, was held liable b..."


In [None]:
# Load the tokenizer and the masked-language-model weights from the same checkpoint.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForMaskedLM.from_pretrained(checkpoint, return_dict = True)

In [None]:
# Fill each question's blank with BERT's five most likely tokens.
# labels[i] is the list of five decoded tokens for row i, best first.
#
# Changes from the previous version:
#   * the forward pass runs under torch.no_grad(): this is inference only, so
#     no autograd graph needs to be built or kept in memory;
#   * the loop follows the rows of df instead of a hard-coded range(30);
#   * the encoded batch is no longer stored in a variable named `input`,
#     which shadowed the Python builtin for the rest of the session.
top_k = 5

labels = []
for before_blank, after_blank in zip(df['split1'], df['split2']):
  text = before_blank + tokenizer.mask_token + after_blank

  encoded = tokenizer(text, return_tensors = "pt")
  # Token positions that hold [MASK]; only the first one is scored.
  mask_positions = torch.where(encoded["input_ids"][0] == tokenizer.mask_token_id)[0]

  with torch.no_grad():
    logits = model(**encoded).logits

  # Probability distribution over the vocabulary at the masked position.
  mask_probs = F.softmax(logits[0, mask_positions[0]], dim = -1)

  top_ids = torch.topk(mask_probs, top_k).indices.tolist()
  labels.append([tokenizer.decode([token_id]) for token_id in top_ids])

In [None]:
# One row per question, one column per predicted token (column 0 is the top prediction).
df_label = pd.DataFrame(labels)


In [None]:
# Put the reference answers side by side with the predicted tokens and show the result.
reference_answers = df['ANSWER']
df_answer = pd.concat([reference_answers, df_label], axis = "columns")
df_answer

Unnamed: 0,ANSWER,0,1,2,3,4
0,viewing,devices,content,videos,games,users
1,offsetting,despite,but,and,indicating,showing
2,overturns,accepts,wins,rejects,disputes,challenges
3,ran,was,were,worked,remained,stood
4,attract,its,attract,the,accommodate,cater
5,purchasing,acquiring,purchasing,buying,acquisition,creating
6,speed,speed,drive,keep,hold,speeds
7,denied,denied,rejected,refused,made,granted
8,delayed,removed,eliminated,increased,reduced,repealed
9,supplied,manufactured,produced,manufactures,built,installed


In [None]:
# Load the second question set (Tdf_5000.csv) from the mounted Drive folder.
second_csv_path = '/content/drive/MyDrive/Tdf_5000.csv'
df_test = pd.read_csv(second_csv_path)

In [None]:
# Take the first 30 questions of df_test and split each one around its
# "[BLANK]" placeholder.
df_T = df_test.iloc[:30,:]

# str.partition always yields exactly one "before" and one "after" piece, so a
# question with several placeholders can no longer produce more pieces than the
# two columns named below; a question without a placeholder gets None as its
# second piece. Iterating over df_T replaces the hard-coded range(30).
# (The stray bare `split_arr` expression of the old cell had no effect and was dropped.)
split_arr = []
for question in df_T['QUESTION']:
  before, placeholder, after = question.partition("[BLANK]")
  split_arr.append([before, after if placeholder else None])

# Share df_T's index so the column-wise concat pairs each row with its own pieces.
df_split = pd.DataFrame(split_arr,columns=['split1','split2'], index=df_T.index)
df_T = pd.concat([df_T[['QUESTION','ANSWER']],df_split],axis=1)
df_T

Unnamed: 0,QUESTION,ANSWER,split1,split2
0,While construction has begun on the Nord Strea...,grant,While construction has begun on the Nord Strea...,the necessary approvals. Nord Stream 2 is als...
1,"A case against 47 people, including OMV execut...",passed,"A case against 47 people, including OMV execut...","on to the buyer. Yet, the allegations could a..."
2,Property insurers are positioning themselves t...,tied,Property insurers are positioning themselves t...,to an unsustainable system. Modest reforms pr...
3,Low-cost European airlines averaged 9% capacit...,garner,Low-cost European airlines averaged 9% capacit...,"higher yields. EasyJet added only 2.5%, compa..."
4,Some of the junket operators integral to Macau...,plunged,Some of the junket operators integral to Macau...,42% in the first quarter. With close relation...
5,Recovering state budgets are supporting road-r...,led,Recovering state budgets are supporting road-r...,"by pavement spending, up 8.5%. Pavement const..."
6,An increase in the number of U.S. hotel rooms ...,experience,An increase in the number of U.S. hotel rooms ...,"this decline first, since expansion there has..."
7,"BlackRock, Prudential Financial and other mort...",narrowing,"BlackRock, Prudential Financial and other mort...",the $98 billion lawsuit against the bank in i...
8,Legislation that would give the U.S. Departmen...,controls,Legislation that would give the U.S. Departmen...,"either chamber of Congress, this measure favo..."
9,Neither side of the legal challenge to the EPA...,committed,Neither side of the legal challenge to the EPA...,to seeing the process through to the end. Law...


In [None]:
# Fill the blank of each df_T question with BERT's five most likely tokens.
# labels[i] is the list of five decoded tokens for row i, best first.
#
# Changes from the previous version:
#   * the forward pass runs under torch.no_grad(): this is inference only, so
#     no autograd graph needs to be built or kept in memory;
#   * the loop follows the rows of df_T instead of a hard-coded range(30);
#   * the encoded batch is no longer stored in a variable named `input`,
#     which shadowed the Python builtin for the rest of the session.
top_k = 5

labels = []
for before_blank, after_blank in zip(df_T['split1'], df_T['split2']):
  text = before_blank + tokenizer.mask_token + after_blank

  encoded = tokenizer(text, return_tensors = "pt")
  # Token positions that hold [MASK]; only the first one is scored.
  mask_positions = torch.where(encoded["input_ids"][0] == tokenizer.mask_token_id)[0]

  with torch.no_grad():
    logits = model(**encoded).logits

  # Probability distribution over the vocabulary at the masked position.
  mask_probs = F.softmax(logits[0, mask_positions[0]], dim = -1)

  top_ids = torch.topk(mask_probs, top_k).indices.tolist()
  labels.append([tokenizer.decode([token_id]) for token_id in top_ids])

In [None]:
# One row per df_T question, one column per predicted token (column 0 is the top prediction).
df_label = pd.DataFrame(labels)


In [None]:
# Put df_T's reference answers side by side with the predicted tokens and show the result.
reference_answers = df_T['ANSWER']
df_answer = pd.concat([reference_answers, df_label], axis = "columns")
df_answer

Unnamed: 0,ANSWER,0,1,2,3,4
0,grant,obtain,issue,get,receive,give
1,passed,passed,passing,pass,sold,passes
2,tied,exposed,vulnerable,connected,committed,susceptible
3,garner,achieve,provide,deliver,offer,produce
4,plunged,rose,increased,grew,fell,dropped
5,led,followed,accompanied,backed,aided,supported
6,experience,see,experience,notice,face,have
7,narrowing,in,ending,dismissing,regarding,ruling
8,controls,controls,in,wins,dominates,leads
9,committed,committed,dedicated,committing,devoted,close


Build levels 1–6 (takes about 4.37 min to run)

In [None]:
# Sanity check: vocabulary id of the word "rely".
# The tokenizer adds its special start token at position 0, so index 1 is the
# first word piece of the text itself.
# The result is kept in its own name rather than `input`, which shadowed the
# Python builtin.
rely_encoding = tokenizer("rely")
rely_encoding.input_ids[1]

11160

In [None]:
# Bucket every verb of every question into one of six "levels" according to
# how highly BERT ranks that verb when it is masked out of the sentence:
#   level 1: rank 1          level 2: ranks 2-10        level 3: ranks 11-100
#   level 4: ranks 101-1000  level 5: ranks 1001-10000  level 6: everything else
# answers<k>[j] collects the verbs of question j that land in level k;
# labels<k>[j] collects, for each such verb, the tokens of that whole rank band.
#
# Changes from the previous version:
#   * level 1 was taken from index 1 of the ranking (the 2nd-best token), so a
#     verb that BERT ranked first matched no level and was silently dropped.
#     Level 1 is now the top-ranked token.
#   * the verb was masked with questions[j].split(word, 1), i.e. at the first
#     *substring* match ("is" inside "this", or an earlier repeat of the same
#     word). The tagged occurrence is now located by scanning forward through
#     the question, and tokens that cannot be found verbatim are skipped
#     instead of raising IndexError.
#   * the verb's vocabulary id is computed once per verb, the ranking length is
#     taken from the model output instead of a hard-coded 30522, the forward
#     pass runs under torch.no_grad(), and a rank band is converted to tokens
#     in one call instead of one decode() per token.
#   * the builtin name `input` is no longer shadowed.
#
# NOTE(review): df['QUESTION'] still contains the literal "[BLANK]" placeholder
# (an earlier cell splits on it) - confirm that is the intended model input.
# NOTE(review): a verb that splits into several word pieces is compared by its
# first piece only, as before.

# Half-open rank bands (0-based positions in the ranking), one per level.
LEVEL_SLICES = (
  slice(0, 1),
  slice(1, 10),
  slice(10, 100),
  slice(100, 1000),
  slice(1000, 10000),
  slice(10000, None),
)

questions = df['QUESTION'].tolist()
answers_by_level = [[[] for _ in questions] for _ in LEVEL_SLICES]
labels_by_level = [[[] for _ in questions] for _ in LEVEL_SLICES]

for j, question in enumerate(questions):
  pos_tagged = nltk.pos_tag(nltk.word_tokenize(question))

  # Character offset up to which the question has been matched so far, so each
  # token is searched for *after* the previous one.
  cursor = 0
  for word, tag in pos_tagged:
    start = question.find(word, cursor)
    if start == -1:
      # NLTK rewrote this token (e.g. quote characters); it cannot be located.
      continue
    end = start + len(word)
    cursor = end

    if not tag.startswith('VB'):
      continue

    piece_ids = tokenizer.encode(word, add_special_tokens = False)
    if not piece_ids:
      continue
    target_id = piece_ids[0]

    masked_text = question[:start] + tokenizer.mask_token + question[end:]
    encoded = tokenizer(masked_text, return_tensors = "pt")
    mask_positions = torch.where(encoded["input_ids"][0] == tokenizer.mask_token_id)[0]

    with torch.no_grad():
      logits = model(**encoded).logits

    # Full vocabulary ranked by probability at the masked position, best first.
    mask_probs = F.softmax(logits[0, mask_positions[0]], dim = -1)
    ranked_ids = torch.topk(mask_probs, mask_probs.shape[-1]).indices

    # 0-based position of the original verb in that ranking.
    rank = int((ranked_ids == target_id).nonzero()[0])

    # The bands are contiguous and ascending, so the first band whose upper
    # bound exceeds the rank is the verb's level.
    for level, band in enumerate(LEVEL_SLICES):
      if band.stop is None or rank < band.stop:
        break

    answers_by_level[level][j].append(word)
    labels_by_level[level][j].append(
      tokenizer.convert_ids_to_tokens(ranked_ids[band].tolist())
    )

# Expose the per-level lists under the names the following cells read.
answers1, answers2, answers3, answers4, answers5, answers6 = answers_by_level
labels1, labels2, labels3, labels4, labels5, labels6 = labels_by_level

In [None]:
def _interleave_answers(question_texts, answers, labels):
  """Build one table row per question: [question, verb, tokens, verb, tokens, ...].

  question_texts: the question strings, one per row.
  answers:        per question, the verbs collected for one level.
  labels:         per question, the token list belonging to each of those verbs.

  Returns a list of rows; a question with no verb in this level yields a row
  that holds only the question text.
  """
  rows = []
  for question, question_answers, question_labels in zip(question_texts, answers, labels):
    row = [question]
    for answer, label in zip(question_answers, question_labels):
      row.append(answer)
      row.append(label)
    rows.append(row)
  return rows


# One row list per level. This replaces six copy-pasted loops and follows the
# actual number of questions instead of a hard-coded 30.
arr1 = _interleave_answers(questions, answers1, labels1)
arr2 = _interleave_answers(questions, answers2, labels2)
arr3 = _interleave_answers(questions, answers3, labels3)
arr4 = _interleave_answers(questions, answers4, labels4)
arr5 = _interleave_answers(questions, answers5, labels5)
arr6 = _interleave_answers(questions, answers6, labels6)


In [None]:
# Level-1 table: column 0 is the question, followed by alternating
# (verb, token list) columns built from answers1 / labels1.
df_level1 = pd.DataFrame(arr1)
df_level1

Unnamed: 0,0,1,2,3,4
0,Increased content consumption on mobile device...,shifting,[shifting],,
1,National Australia Bank's earnings prospects w...,rely,[rely],,
2,The success of TransCanada's $15 billion claim...,get,[get],resolve,[resolve]
3,German carmakers may continue to lead factory ...,have,[have],have,[have]
4,New Cotai resorts have helped fuel citywide re...,,,,
5,The combination of higher interest rates and p...,leads,[leads],,
6,Saudi Arabia baby food sales will surge 11% a ...,switching,[switching],,
7,Nortel Networks Inc. is likely to request by J...,go,[go],,
8,Makers of brand-name drugs and biologics will ...,,,,
9,Two California owners of the shuttered San Ono...,held,[held],,


In [None]:
# Level-2 table: column 0 is the question, followed by alternating
# (verb, token list) columns built from answers2 / labels2.
df_level2 = pd.DataFrame(arr2)
df_level2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Increased content consumption on mobile device...,debut,"[air, compete, follow, market, debut, premiere...",,,,,,,,
1,National Australia Bank's earnings prospects w...,contributed,"[produced, provided, generate, contributed, co...",proposed,"[new, section, proposed, initial, planned, usd...",,,,,,
2,The success of TransCanada's $15 billion claim...,,,,,,,,,,
3,German carmakers may continue to lead factory ...,fell,"[reached, was, fell, rose, grew, dropped, up, ...",,,,,,,,
4,New Cotai resorts have helped fuel citywide re...,faces,"[features, enjoys, boasts, had, experiences, i...",,,,,,,,
5,The combination of higher interest rates and p...,expanded,"[reduced, raised, doubled, cut, expanded, decr...",boosted,"[improved, enhanced, adjusted, doubled, raised...",,,,,,
6,Saudi Arabia baby food sales will surge 11% a ...,baby,"[', baby, child, household, maternal, ,, state...",working,"[unmarried, single, birth, young, new, working...",,,,,,
7,Nortel Networks Inc. is likely to request by J...,request,"[require, request, decide, ask, propose, recom...",send,"[take, send, appeal, allow, turn, move, make, ...",,,,,,
8,Makers of brand-name drugs and biologics will ...,affecting,"[on, for, from, of, affecting, covering, withi...",rises,"[increased, fell, dropped, grew, jumped, decre...",,,,,,
9,Two California owners of the shuttered San Ono...,had,"[make, struck, had, have, cut, wanted, strike,...",,,,,,,,


In [None]:
# Level-3 table: column 0 is the question, followed by alternating
# (verb, token list) columns built from answers3 / labels3.
df_level3 = pd.DataFrame(arr3)
df_level3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Increased content consumption on mobile device...,drive,"[require, pressure, drive, stimulate, spur, he...",release,"[develop, release, produce, support, embrace, ...",reached,"[reached, contributed, garnered, consumed, pro...",doubling,"[increase, improving, away, ranging, and, trai...",,
1,National Australia Bank's earnings prospects w...,showed,"[yield, had, feature, support, exhibit, ensure...",supports,"[influences, boost, decreases, raises, increas...",Reduced,"[higher, interest, external, government, curre...",suffer,"[draw, emerge, recover, fall, suffer, arise, p...",,
2,The success of TransCanada's $15 billion claim...,hinges,"[focuses, rests, centred, center, ,, hinges, c...",showing,"[denying, because, thinking, stating, feeling,...",remaining,"[tribunal, civil, international, same, bilater...",,,,
3,German carmakers may continue to lead factory ...,including,"[especially, alongside, replacing, while, as, ...",boosts,"[for, helps, means, decreases, threatens, stop...",comprise,"[exceeds, make, constitute, include, reaches, ...",match,"[prevent, accommodate, allow, have, maintain, ...",,
4,New Cotai resorts have helped fuel citywide re...,fuel,"[further, have, provide, drive, sustain, recor...",led,"[pushed, fueled, countered, absorbed, blocked,...",,,,,,
5,The combination of higher interest rates and p...,push,"[keep, screw, bring, move, turn, push, hold, a...",Excluding,"[excluding, from, at, in, including, for, by, ...",said,"[for, loses, estimates, realizes, means, gains...",,,,
6,Saudi Arabia baby food sales will surge 11% a ...,surge,"[rose, climb, raise, rate, peak, be, triple, j...",formula,"[cash, food, women, petroleum, nutrition, drug...",accelerate,"[grow, accelerate, reverse, return, resume, im...",boosting,"[promoting, reduced, enhancing, raising, expan...",opt,"[have, starting, refuse, returning, unable, tr..."
7,Nortel Networks Inc. is likely to request by J...,certified,"[opened, reviewed, submitted, recommended, dee...",take,"[drop, accept, withdraw, suspend, consider, or...",,,,,,
8,Makers of brand-name drugs and biologics will ...,left,"[included, raises, raising, rather, excluded, ...",look,"[choose, look, wish, want, expect, desire, wor...",highlight,"[combat, remedy, end, avoid, alleviate, contro...",,,,
9,Two California owners of the shuttered San Ono...,saying,"[after, where, when, that, meaning, claiming, ...",eliminated,"[limits, restricted, threatened, affects, was,...",is,"[##ko, subsidiary, ##we, ##ji, ・, ##j, ##ki, ....",get,"[the, cut, benefit, lose, avoid, go, last, its...",,


In [None]:
# Level-4 table: column 0 is the question, followed by alternating
# (verb, token list) columns built from answers4 / labels4.
df_level4 = pd.DataFrame(arr4)
df_level4

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Increased content consumption on mobile device...,,,,,,,,,,
1,National Australia Bank's earnings prospects w...,,,,,,,,,,
2,The success of TransCanada's $15 billion claim...,resulted,"[regime, pentagon, program, citizenship, congr...",,,,,,,,
3,German carmakers may continue to lead factory ...,lead,"[extend, force, close, curb, review, regulate,...",'s,"[rests, polled, historically, numbered, stars,...",preferred,"[representative, agile, coveted, economic, cre...",,,,
4,New Cotai resorts have helped fuel citywide re...,,,,,,,,,,
5,The combination of higher interest rates and p...,acquired,"[ongoing, large, required, its, bank, senior, ...",,,,,,,,
6,Saudi Arabia baby food sales will surge 11% a ...,,,,,,,,,,
7,Nortel Networks Inc. is likely to request by J...,allocating,"[the, issuing, banning, protecting, honoring, ...",,,,,,,,
8,Makers of brand-name drugs and biologics will ...,,,,,,,,,,
9,Two California owners of the shuttered San Ono...,,,,,,,,,,


In [None]:
# Level-5 table: column 0 is the question, followed by alternating
# (verb, token list) columns built from answers5 / labels5.
df_level5 = pd.DataFrame(arr5)
df_level5

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Increased content consumption on mobile device...,,,,,,,,
1,National Australia Bank's earnings prospects w...,bode,"[tissue, lock, ##ela, potent, patients, reflec...",,,,,,
2,The success of TransCanada's $15 billion claim...,be,"[##fl, chambers, fu, league, sandra, providenc...",win,"[can, ##sw, ##nka, ##ick, ##ger, mason, ##nc, ...",do,"[17, austrian, min, scheme, consolidated, dire...",,
3,German carmakers may continue to lead factory ...,is,"[##firm, sunday, kepler, ##tori, ##ox, ##thy, ...",,,,,,
4,New Cotai resorts have helped fuel citywide re...,loom,"[traveled, alberta, poll, know, swings, projec...",,,,,,
5,The combination of higher interest rates and p...,,,,,,,,
6,Saudi Arabia baby food sales will surge 11% a ...,breastfeed,"[unite, war, resolve, introduce, encourage, si...",breastfeed,"[unite, war, resolve, introduce, encourage, si...",,,,
7,Nortel Networks Inc. is likely to request by J...,,,,,,,,
8,Makers of brand-name drugs and biologics will ...,,,,,,,,
9,Two California owners of the shuttered San Ono...,,,,,,,,


In [None]:
# Level-6 table: column 0 is the question, followed by alternating
# (verb, token list) columns built from answers6 / labels6.
df_level6 = pd.DataFrame(arr6)
df_level6

Unnamed: 0,0,1,2
0,Increased content consumption on mobile device...,,
1,National Australia Bank's earnings prospects w...,,
2,The success of TransCanada's $15 billion claim...,,
3,German carmakers may continue to lead factory ...,,
4,New Cotai resorts have helped fuel citywide re...,,
5,The combination of higher interest rates and p...,,
6,Saudi Arabia baby food sales will surge 11% a ...,,
7,Nortel Networks Inc. is likely to request by J...,,
8,Makers of brand-name drugs and biologics will ...,,
9,Two California owners of the shuttered San Ono...,,
