In [13]:
import pandas as pd
import numpy as np
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split

In [11]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [3]:
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
inputs = tokenizer(question, text, return_tensors="pt")
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
loss = outputs.loss
start_scores = outputs.start_logits
end_scores = outputs.end_logits

In [5]:
print(start_scores)
print(end_scores)

tensor([[ 0.4227,  0.0370,  0.3075, -0.0382, -0.0138,  0.3623,  0.0157, -0.0264,
          0.0104,  0.2433,  0.3268,  0.5673,  0.6411,  0.1038]],
       grad_fn=<CloneBackward0>)
tensor([[-0.1095, -0.0266,  0.0456, -0.0952,  0.1681,  0.1389,  0.1902,  0.0355,
          0.2066,  0.0817, -0.0455,  0.1083,  0.2039,  0.3825]],
       grad_fn=<CloneBackward0>)


In [6]:
print(inputs)

{'input_ids': tensor([[  101,  2040,  2001,  3958, 27227,  1029,   102,  3958, 27227,  2001,
          1037,  3835, 13997,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [10]:
model_pretrained = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer_pretrained = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [3]:
df = pd.read_csv("dataset/amazon_data.csv")
df

Unnamed: 0,User,Company
0,Way to drop the ball on customer service @1158...,@115820 I'm sorry we've let you down! Without ...
1,@AmazonHelp 3 different people have given 3 di...,@115820 We'd like to take a further look into ...
2,@115823 I want my amazon payments account CLOS...,@115822 I am unable to affect your account via...
3,@115828 How about you guys figure out my Xbox ...,@115826 I'm sorry for the wait. You'll receive...
4,@AmazonHelp @115826 Yeah this is crazy we’re l...,@115827 Thanks for your patience. ^KM
...,...,...
131604,@AmazonHelp I sent you guys a DM regarding the...,@328597 We're unable to access customer accoun...
131605,This is happening in my area w/@115821 “Prime”...,"@777901 I'm sorry for the delay, Brenda! We st..."
131606,@132994 @132995 @115850 got my #OnePlus5T at 8...,@823783 Woohoo! That's awesome! Hope you love ...
131607,@115850 @132994 No exchange available for #One...,@823802 The Exchange Offer is currently availa...


In [18]:
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
merged_rows = train_data[train_data.columns].astype(str).apply(lambda x: ' '.join(x), axis = 1)

In [19]:
text = " ".join(list(merged_rows))

In [20]:
len(text)

26550262

In [22]:
train_data['Text_Data'] = text
test_data['Text_Data'] = text
train_data

Unnamed: 0,User,Company,Text_Data
122933,I think amazon delivered my order to the wrong...,@782783 I'm sorry for the trouble! I know how ...,I think amazon delivered my order to the wrong...
42859,@115850 @AmazonHelp @AmazonHelp I cancelled an...,@305441 on ensuring that such instances aren't...,I think amazon delivered my order to the wrong...
104375,@AmazonHelp Order number 3128 is going to arri...,@647669 I'm positive the product would be deli...,I think amazon delivered my order to the wrong...
99068,I ordered 18x18 pillows on @115821 prime then ...,@645659 Hm. That's strange! Were the items bei...,I think amazon delivered my order to the wrong...
56767,@AmazonHelp my right address and zipcode!,@364938 Can you please confirm who the carrier...,I think amazon delivered my order to the wrong...
...,...,...,...
54886,@115850 But I added the product in my cart and...,@254639 Please get in touch with our team usin...,I think amazon delivered my order to the wrong...
110268,"@AmazonHelp No, no reply to my email as yet. I...","@705551 Hi Ruth, glad to hear that you've been...",I think amazon delivered my order to the wrong...
119879,"@AmazonHelp Yes, but you don't want it\rAlso y...",@763955 You can get directly in contact with u...,I think amazon delivered my order to the wrong...
103694,@AmazonHelp I just pre-ordered it THANK YOU SO...,@147108 You're always more than welcome. Our h...,I think amazon delivered my order to the wrong...


In [37]:
row_questions = list(train_data['User'][:10])
row_text = list(train_data['User'][:10])
data_encoded = tokenizer_pretrained.encode(row_questions, row_text)
#print(data_encoded)
data_decoded = tokenizer_pretrained.decode(data_encoded)
#print(data_decoded)

In [38]:
print(data_encoded)
print(data_decoded)

[101, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 102, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 102]
[CLS] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [SEP] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [SEP]


In [36]:
row = list(train_data['User'][:10])
row

['I think amazon delivered my order to the wrong mailbox and imma cry, i need it for thanksgiving 😩😩😩😩',
 '@115850 @AmazonHelp @AmazonHelp I cancelled and Ordered the same product through @118702.\nI hope they dont follow the same practice.',
 "@AmazonHelp Order number 3128 is going to arrive today, let's see you people have enough potential to make this delivery a successful one or not😊👍",
 'I ordered 18x18 pillows on @115821 prime then why does it say its prime when they are coming from China?  🤦🏼\u200d♀️',
 '@AmazonHelp my right address and zipcode!',
 '@AmazonHelp yesterday via twitter. As we heard nothing, we emailed. You need to seriously think about @122232 and whether should use them.',
 "@AmazonHelp I did contact the same and I'm getting no useful reply. I just want a genuine answer from you",
 '@115830 so much for next day delivery - I am on day 2 and still my parcel has not been despatched despite chatting twice with your team?',
 '@AmazonHelp You people always escalated iss

In [39]:
from transformers import BertGenerationTokenizer, BertGenerationEncoder

#sentence_fuser = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse")
tokenizer = BertGenerationTokenizer.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
model = BertGenerationEncoder.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state

Downloading:   0%|          | 0.00/826k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/500 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

In [40]:
last_hidden_states

tensor([[[-0.2266, -0.3594, -0.5205,  ..., -0.2420, -0.0745,  0.1840],
         [ 0.0439, -0.0053, -0.0651,  ..., -0.0555,  0.2638,  0.1241],
         [ 0.0439, -0.0053, -0.0651,  ..., -0.0555,  0.2638,  0.1241],
         [-0.5158, -1.4121,  0.0573,  ..., -0.3035,  0.3464, -0.1623],
         [-0.0948, -1.0221,  0.3417,  ...,  0.1595,  0.2193,  0.4579],
         [-0.3677, -1.2449,  0.1894,  ...,  0.1278,  0.6253,  0.3886]]],
       grad_fn=<NativeLayerNormBackward0>)