In [2]:
!pip install transformers
#!pip install allennlp

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/4c/a0/32e3a4501ef480f7ea01aac329a716132f32f7911ef1c2fac228acc57ca7/transformers-2.6.0-py3-none-any.whl (540kB)
[K     |████████████████████████████████| 542kB 30.8MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K     |████████████████████████████████| 870kB 72.4MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 61.7MB/s 
[?25hCollecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K   

In [0]:
import torch.nn as nn
import torch.nn.functional as F
import torch
from transformers import DistilBertModel
 
 
class SpanBertClassificationModel(nn.Module):
    def __init__(self):
        super(SpanBertClassificationModel,self).__init__()
 
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased').cuda()
        for param in self.bert.parameters():
            param.requires_grad = True
 
        self.hide1 = nn.Linear(768*3,768)
        self.hide2 = nn.Linear(768,384)
 
        self.dropout = nn.Dropout(0.5)
        self.out = nn.Linear(384,2)
 
    def forward(self, indextokens_a,input_mask_a,indextokens_b,input_mask_b):
        embedding_a = self.bert(indextokens_a,input_mask_a)[0]
        embedding_b = self.bert(indextokens_b,input_mask_b)[0]
 
        embedding_a = torch.mean(embedding_a,1)
        embedding_b = torch.mean(embedding_b,1)
 
        abs = torch.abs(embedding_a - embedding_b)
 
 
        target_span_embedding = torch.cat((embedding_a, embedding_b,abs), dim=1)
 
 
        hide_1 = F.relu(self.hide1(target_span_embedding))
        hide_2 = self.dropout(hide_1)
        hide = F.relu(self.hide2(hide_2))
 
 
        out_put = self.out(hide)
        return out_put

In [0]:
from torch.utils.data import DataLoader,Dataset
from transformers import DistilBertModel,DistilBertTokenizer
from allennlp.data.dataset_readers.dataset_utils import enumerate_spans
import torch
from tqdm import tqdm
import time
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
 
class SpanClDataset(Dataset):
    def __init__(self,filename,repeat=1,max_len=512):
        self.max_sentence_length = max_len
        self.max_spans_num = len(enumerate_spans(range(self.max_sentence_length),max_span_width=3))
        self.repeat = repeat
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.data_list = self.read_file(filename)
        self.len = len(self.data_list)
        self.process_data_list = self.process_data() 
 
    def convert_into_indextokens_and_segment_id(self,text):
        tokeniz_text = self.tokenizer.tokenize(text)
        #tokeniz_text = self.tokenizer.encode(text,add_special_tokens=True,)
        indextokens = self.tokenizer.convert_tokens_to_ids(tokeniz_text)
        if len(indextokens)>512:
          indextokens = indextokens[0:512]
        # print(type(indextokens))
        # print(len(indextokens))
        #indextokens = pad_sequences(tokeniz_text,maxlen = max_len, dtype='long', value=0, truncating='post', padding='post')
        input_mask = [1] * len(indextokens) 
 
        pad_indextokens = [0]*(self.max_sentence_length-len(indextokens))
        indextokens.extend(pad_indextokens)
        input_mask_pad = [0]*(self.max_sentence_length-len(input_mask))
        input_mask.extend(input_mask_pad)
 
        segment_id = [0]*self.max_sentence_length
        return indextokens,segment_id,input_mask
 
 
    def read_file(self,filename):
        data_list = []
        df = pd.read_csv(filename)  # csv
        s1, s2, labels = df['descrip'], df['review'], df['label']
        
 
        for sentence_a, sentence_b, label in tqdm(list(zip(s1, s2, labels)),desc="load and process dataset："):
            #if len(sentence_a) <= self.max_sentence_length and len(sentence_b) <= self.max_sentence_length:
            data_list.append((sentence_a, sentence_b, label))
        return data_list
 
    def process_data(self):
        process_data_list = []
        for ele in tqdm(self.data_list,desc="process text："):
            res = self.do_process_data(ele)
            process_data_list.append(res)
        return process_data_list
 
    def do_process_data(self,params):
 
        res = []
        sentence_a = params[0]
        sentence_b = params[1]
        label = params[2]
 
        indextokens_a,segment_id_a,input_mask_a = self.convert_into_indextokens_and_segment_id(sentence_a)
        indextokens_a = torch.tensor(indextokens_a,dtype=torch.long)
        segment_id_a = torch.tensor(segment_id_a,dtype=torch.long)
        input_mask_a = torch.tensor(input_mask_a,dtype=torch.long)
 
        indextokens_b, segment_id_b, input_mask_b = self.convert_into_indextokens_and_segment_id(sentence_b)
        indextokens_b = torch.tensor(indextokens_b, dtype=torch.long)
        segment_id_b = torch.tensor(segment_id_b, dtype=torch.long)
        input_mask_b = torch.tensor(input_mask_b, dtype=torch.long)
 
        label = torch.tensor(int(label))
 
        res.append(indextokens_a)
        res.append(segment_id_a)
        res.append(input_mask_a)
 
 
        res.append(indextokens_b)
        res.append(segment_id_b)
        res.append(input_mask_b)
 
 
        res.append(label)
 
        return res
 
    def __getitem__(self, i):
        item = i
 
        indextokens_a = self.process_data_list[item][0]
        segment_id_a = self.process_data_list[item][1]
        input_mask_a = self.process_data_list[item][2]
 
 
 
        indextokens_b = self.process_data_list[item][3]
        segment_id_b = self.process_data_list[item][4]
        input_mask_b = self.process_data_list[item][5]
 
 
        label = self.process_data_list[item][6]
 
 
        return indextokens_a,input_mask_a,indextokens_b,input_mask_b,label
 
    def __len__(self):
        if self.repeat == None:
            data_len = 10000000
        else:
            data_len = len(self.process_data_list)
        return data_len

Using TensorFlow backend.


In [0]:
from torch.utils.data import DataLoader
import torch.nn as nn
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.nn.functional as F
 
 
from transformers import AdamW
from tqdm.auto import tqdm
 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 
def train(model,train_loader):
    loss_values = []
    model.to(device)
    model.train()
    criterion = nn.CrossEntropyLoss()
 
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    #设置模型参数的权重衰减
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    #学习率的设置
    optimizer_params = {'lr': 2e-5, 'eps': 1e-8, 'correct_bias': False}
    #AdamW 这个优化器是主流优化器
    optimizer = AdamW(optimizer_grouped_parameters, **optimizer_params)
 
    #学习率调整器，检测准确率的状态，然后衰减学习率
    scheduler = ReduceLROnPlateau(optimizer,mode='max',factor=0.5,min_lr=1e-7, patience=5,verbose= True, threshold=0.0001, eps=1e-08)
 
    t_total = len(train_loader)
    total_epochs = 5
    bestAcc = 0
    correct = 0
    total = 0
    print('Training begin!')
    for epoch in tqdm(range(total_epochs)):
        for step, (indextokens_a,input_mask_a,indextokens_b,input_mask_b,label) in enumerate(train_loader):
            indextokens_a,input_mask_a,indextokens_b,input_mask_b,label = indextokens_a.to(device),input_mask_a.to(device),indextokens_b.to(device),input_mask_b.to(device),label.to(device)
            optimizer.zero_grad()
            out_put = model(indextokens_a,input_mask_a,indextokens_b,input_mask_b)
            loss = criterion(out_put, label)
            _, predict = torch.max(out_put.data, 1)
            correct += (predict == label).sum().item()
            total += label.size(0)
            loss.backward()
            optimizer.step()
 

            train_acc = correct / total
            print("Train Epoch[{}/{}],step[{}/{}],tra_acc{:.6f} %,loss:{:.6f}".format(epoch + 1, total_epochs, step + 1, len(train_loader),train_acc*100,loss.item()))

            # if (step + 1) % 500 == 0:
            #     train_acc = correct / total
            #     acc = dev(model, dev_loader)
            #     if bestAcc < acc:
            #         bestAcc = acc
            #         path = 'span_bert_hide_model.pkl'
            #         torch.save(model, path)
            #     print("DEV Epoch[{}/{}],step[{}/{}],tra_acc{:.6f} %,bestAcc{:.6f}%,dev_acc{:.6f} %,loss:{:.6f}".format(epoch + 1, total_epochs, step + 1, len(train_loader),train_acc*100,bestAcc*100,acc*100,loss.item()))
        scheduler.step(bestAcc)
    

 
# def dev(model,dev_loader):
#     model.eval()
#     with torch.no_grad():
#         correct = 0
#         total = 0
#         for step, (
#                 indextokens_a, input_mask_a, indextokens_b, input_mask_b, label) in tqdm(enumerate(
#             dev_loader),desc='Dev Itreation:'):
#             print(step)
#             indextokens_a, input_mask_a, indextokens_b, input_mask_b, label = indextokens_a.to(device), input_mask_a.to(
#                 device), indextokens_b.to(device), input_mask_b.to(device), label.to(device)
#             out_put = model(indextokens_a, input_mask_a, indextokens_b, input_mask_b)
#             _, predict = torch.max(out_put.data, 1)
#             correct += (predict==label).sum().item()
#             total += label.size(0)
#         print(correct)
#         print(total)
#         res = correct / total
#         return res
 
def predict(model,test_loader):
    model.to(device)
    model.eval()
    predicts = []
    predict_probs = []
    with torch.no_grad():
        correct = 0
        total = 0
        for step, (
                indextokens_a, input_mask_a, indextokens_b, input_mask_b, label) in enumerate(
            test_loader):
            #print(step)
            indextokens_a, input_mask_a, indextokens_b, input_mask_b, label = indextokens_a.to(device), input_mask_a.to(
                device), indextokens_b.to(device), input_mask_b.to(device), label.to(device)
            out_put = model(indextokens_a, input_mask_a, indextokens_b, input_mask_b)
            _, predict = torch.max(out_put.data, 1)
 
            pre_numpy = predict.cpu().numpy().tolist()
            predicts.extend(pre_numpy)
            probs = F.softmax(out_put).detach().cpu().numpy().tolist()
            predict_probs.extend(probs)
 
            correct += (predict==label).sum().item()
            #print(correct)
            total += label.size(0)
            #print(total)
        res = correct / total
        print('predict_Accuracy : {} %'.format(100 * res))
        return predicts,predict_probs
 


In [0]:
#test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=True)
acc = dev(model,dev_loader)


HBox(children=(IntProgress(value=1, bar_style='info', description='Dev Itreation:', max=1, style=ProgressStyle…

0

8
14


In [0]:
train_data = SpanClDataset('/content/drive/My Drive/Colab Notebooks/small_review.csv')
train_loader = DataLoader(dataset=train_data, batch_size=16, shuffle=True)


HBox(children=(IntProgress(value=0, description='load and process dataset：', max=4638, style=ProgressStyle(des…




HBox(children=(IntProgress(value=0, description='process text：', max=192, style=ProgressStyle(description_widt…




<torch.utils.data.dataloader.DataLoader at 0x7fd173d31f98>

In [0]:
train_data[0]

(tensor([ 3357,  2046,  1996,  6879,  1997, 12164, 29053,  1010,  2797,  2034,
          2465,  1997,  1996,  1523,  2035,  1011,  2137,  1524,  6445,  4859,
         10519,  2407,  1010,  1998,  4329,  4697,  1996,  2126,  1996,  2162,
          2003,  4061,  1012,  2013,  1037,  6857,  2927,  1999, 12071,  2000,
          2162,  3045, 10911,  2015,  1999,  1996,  2540,  1997,  2762,  1010,
          2954,  1996,  8680, 25755,  7465,  2008,  2357,  2637,  1521,  1055,
          2034, 11498, 13181, 27342,  2046,  4337,  9489,  1012,  4088,  2169,
          3260,  2013,  1996,  2250,  1998,  2369,  4099,  3210,  1010,  2059,
          5376,  3202,  2046,  1996,  2895,  1012,  2006,  1996,  2598,  1010,
         11147, 11100,  1999,  2151,  2344,  2408,  2019, 25145,  1010,  2489,
          1011, 24430,  4044,  1010,  2478,  1037,  2898,  3528,  1997, 12200,
          3085,  4255,  2000,  2115,  6143,  5056,  1012,  2330, 11686,  2015,
          7471,  4337,  8984,  6651,  9932, 12200,  

In [0]:
if __name__ == '__main__':
    batch_size = 4
    train_data = SpanClDataset('/content/drive/My Drive/Colab Notebooks/small_review.csv')
    #dev_data = SpanClDataset('/content/drive/My Drive/Colab Notebooks/small_val.csv')
    test_data = SpanClDataset('/content/drive/My Drive/Colab Notebooks/small_tes.csv')
 
 
    train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
    #print(train_loader)
    #dev_loader = DataLoader(dataset=dev_data, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)
 
 
    model = SpanBertClassificationModel()
    train(model,train_loader)
    # path = '/content/drive/My Drive/Colab Notebooks/span_bert_hide_model.pkl'
    # model1 = torch.load(path)
    predicts,predict_probs = predict(model,test_loader)

HBox(children=(IntProgress(value=0, description='load and process dataset：', max=4638, style=ProgressStyle(des…




HBox(children=(IntProgress(value=0, description='process text：', max=4638, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='load and process dataset：', max=450, style=ProgressStyle(desc…




HBox(children=(IntProgress(value=0, description='process text：', max=450, style=ProgressStyle(description_widt…


Training begin!


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
Train Epoch[1/5],step[803/1160],tra_acc48.381071 %,loss:0.686365
Train Epoch[1/5],step[804/1160],tra_acc48.351990 %,loss:0.705681
Train Epoch[1/5],step[805/1160],tra_acc48.322981 %,loss:0.724633
Train Epoch[1/5],step[806/1160],tra_acc48.325062 %,loss:0.692907
Train Epoch[1/5],step[807/1160],tra_acc48.389095 %,loss:0.674429
Train Epoch[1/5],step[808/1160],tra_acc48.360149 %,loss:0.722148
Train Epoch[1/5],step[809/1160],tra_acc48.362176 %,loss:0.664262
Train Epoch[1/5],step[810/1160],tra_acc48.395062 %,loss:0.687046
Train Epoch[1/5],step[811/1160],tra_acc48.366215 %,loss:0.709678
Train Epoch[1/5],step[812/1160],tra_acc48.368227 %,loss:0.702790
Train Epoch[1/5],step[813/1160],tra_acc48.400984 %,loss:0.682130
Train Epoch[1/5],step[814/1160],tra_acc48.433661 %,loss:0.687976
Train Epoch[1/5],step[815/1160],tra_acc48.435583 %,loss:0.710741
Train Epoch[1/5],step[816/1160],tra_acc48.437500 %,loss:0.695748
Train Epoch[1/5],step[817/1160],tra_acc48.439412 



predict_Accuracy : 52.22222222222223 %


In [0]:
path = '/content/drive/My Drive/Colab Notebooks/span_bert_hide_model.pkl'
torch.save(model, path)

  "type " + obj.__name__ + ". It won't be checked "


In [0]:
model.eval()

SpanBertClassificationModel(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): L

In [0]:
descrip = pd.read_csv('/content/drive/My Drive/Colab Notebooks/small_review.csv')
len(descrip['descrip'].unique())

2319

In [0]:
review = descrip['review'].sample(n=2319, random_state=5)

In [0]:
review.shape

(2319,)

In [0]:
full_tokens=[]
full_ids=[]
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
game_descrip = descrip['descrip'].unique()
for i in range(len(game_descrip)):
  text=game_descrip[i].replace('*','')
  marked_text = "[CLS] " + text + " [SEP]"
  tokenized_text = tokenizer.tokenize(marked_text)
  if len(tokenized_text) > 512:
    tokenized_text = tokenized_text[:512]
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

  segments_ids = [1]* len(tokenized_text)

  full_tokens.append(indexed_tokens)

  full_ids.append(segments_ids)


In [0]:
full_tokens_b=[]
full_ids_b=[]

for i in range(len(review)):
  #text=review[i].replace('*','')
  marked_text = "[CLS] " + text + " [SEP]"
  tokenized_text = tokenizer.tokenize(marked_text)
  if len(tokenized_text) > 512:
    tokenized_text = tokenized_text[:512]
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

  segments_ids = [1]* len(tokenized_text)

  full_tokens_b.append(indexed_tokens)

  full_ids_b.append(segments_ids)

In [0]:
final=[]
from tqdm import trange
for i in trange(len(full_ids)):
  tokens_tensor = torch.tensor([full_tokens[i]]).to(device)
  segments_tensors = torch.tensor([full_ids[i]]).to(device)
  tokens_tensor_b = torch.tensor([full_tokens_b[i]]).to(device)
  segments_tensors_b = torch.tensor([full_ids_b[i]]).to(device)
  with torch.no_grad():
      encoded_layers= model(tokens_tensor, segments_tensors,tokens_tensor_b,segments_tensors_b)
  token_vecs = encoded_layers[4][0]
  sentence_embedding = torch.mean(token_vecs, dim=0)
  final.append(sentence_embedding.numpy())

len(final)

  0%|          | 0/2319 [00:00<?, ?it/s]


IndexError: ignored

In [0]:
import torch
import pickle
final = pickle.load(open('/content/drive/My Drive/Colab Notebooks/span_bert_hide_model.pkl','rb'))
final.eval()

AttributeError: ignored

In [4]:
!pip install pytorch_pretrained_bert

Collecting pytorch_pretrained_bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 27.5MB/s eta 0:00:01[K     |█████▎                          | 20kB 33.0MB/s eta 0:00:01[K     |████████                        | 30kB 38.0MB/s eta 0:00:01[K     |██████████▋                     | 40kB 41.6MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 37.6MB/s eta 0:00:01[K     |███████████████▉                | 61kB 35.1MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 36.6MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 30.0MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 29.6MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 29.4MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 29.4MB/s eta 0:00:01[K     |████████████

In [5]:
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Load pre-trained model (weights)
final = BertModel.from_pretrained('bert-base-uncased')

# Put the model in "evaluation" mode, meaning feed-forward operation.
final.eval()

100%|██████████| 231508/231508 [00:00<00:00, 1252465.14B/s]
100%|██████████| 407873900/407873900 [00:09<00:00, 43385913.38B/s]


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [6]:
import pandas as pd
import numpy as np
descrip = pd.read_csv('/content/drive/My Drive/Colab Notebooks/des_scale.csv')
descrip.head()

Unnamed: 0,appid,descrip,num_word
0,10,Play the world's number 1 online action game. ...,48.0
1,20,One of the most popular online action games of...,53.0
2,30,Enlist in an intense brand of Axis vs. Allied ...,67.0
3,40,Enjoy fast-paced multiplayer gaming with Death...,37.0
4,50,Return to the Black Mesa Research Facility as ...,49.0


In [0]:
text = descrip['descrip'].astype('str')

In [8]:
from tqdm.auto import tqdm
full_tokens=[]
full_ids=[]
for i in tqdm(range(len(text))):
  #print(i)   
  #text=summary[i].replace('*','')
  marked_text = "[CLS] " + text[i] + " [SEP]"
  tokenized_text = tokenizer.tokenize(marked_text)
  if len(tokenized_text) > 512:
    tokenized_text = tokenized_text[:512]
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

  segments_ids = [1]* len(tokenized_text)

  full_tokens.append(indexed_tokens)

  full_ids.append(segments_ids)

HBox(children=(IntProgress(value=0, max=2994), HTML(value='')))




In [10]:
des2vec=[]
import torch
for i in tqdm(range(len(full_ids))):
  tokens_tensor = torch.tensor([full_tokens[i]])
  segments_tensors = torch.tensor([full_ids[i]])
  with torch.no_grad():
      encoded_layers, _ = final(tokens_tensor, segments_tensors)
  token_vecs = encoded_layers[11][0]
  sentence_embedding = torch.mean(token_vecs, dim=0)
  des2vec.append(sentence_embedding.numpy())

HBox(children=(IntProgress(value=0, max=2994), HTML(value='')))




In [12]:
len(des2vec)

2994

In [0]:
des2vec = pd.DataFrame({'des2vec':des2vec})

In [0]:
a = pd.concat([des2vec,descrip],axis=1)

In [0]:
a = a[['appid','des2vec']]

In [16]:
a.head()

Unnamed: 0,appid,des2vec
0,10,"[-0.1585612, -0.27853778, 0.19277042, 0.117113..."
1,20,"[-0.33341014, -0.1603566, 0.34206828, 0.043778..."
2,30,"[-0.25786415, -0.08540806, 0.2038776, 0.038228..."
3,40,"[0.10915119, -0.12645361, 0.17961764, 0.118649..."
4,50,"[-0.2700375, -0.1939105, 0.071002774, 0.130684..."


In [0]:
import pickle

pickle.dump(a, open('/content/drive/My Drive/Colab Notebooks/des2vec.pkl','wb'))

In [0]:
test = pickle.load(open('/content/drive/My Drive/Colab Notebooks/des2vec.pkl','rb'))

In [19]:
test.head()

Unnamed: 0,appid,des2vec
0,10,"[-0.1585612, -0.27853778, 0.19277042, 0.117113..."
1,20,"[-0.33341014, -0.1603566, 0.34206828, 0.043778..."
2,30,"[-0.25786415, -0.08540806, 0.2038776, 0.038228..."
3,40,"[0.10915119, -0.12645361, 0.17961764, 0.118649..."
4,50,"[-0.2700375, -0.1939105, 0.071002774, 0.130684..."


In [0]:
a.to_csv('/content/drive/My Drive/Colab Notebooks/des2vec.csv',index=False)

In [20]:
import pandas as pd
import numpy as np
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from scipy.spatial.distance import cosine
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()
import pickle
a = pickle.load(open('/content/drive/My Drive/Colab Notebooks/des2vec.pkl','rb'))

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [0]:
def recomend(demand, dataframe, n):
  marked_text = "[CLS] " + demand + " [SEP]"
  tokenized_text = tokenizer.tokenize(marked_text)
  if len(tokenized_text) > 512:
    tokenized_text = tokenized_text[:512]
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
  segments_ids = [1]* len(tokenized_text)
  tokens_tensor = torch.tensor([indexed_tokens])
  segments_tensors = torch.tensor([segments_ids])
  with torch.no_grad():
      encoded_layers, _ = model(tokens_tensor, segments_tensors)
  token_vecs = encoded_layers[11][0]
  sentence_embedding = torch.mean(token_vecs, dim=0)
  cos = []
  for i in range(len(dataframe)):
    tmp = cosine(sentence_embedding, dataframe.iloc[i][1])
    cos.append(tmp)
  dataframe['cos'] = cos
  dataframe.sort_values(by=['cos'],ascending=False,inplace = True)
  return dataframe[:n]['appid'].values

In [22]:
demand = 'I want a shoot game named counter-strike'
recomend(demand=demand, dataframe=test, n = 5)

array([802200,    320, 367580,    360, 423230])