In [253]:
import re
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch import tensor
from bpemb import BPEmb
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from collections import Counter
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

import dask.dataframe as dd
from dask.multiprocessing import get

In [2]:
bpemb_en = BPEmb(lang="en", dim=300)

In [None]:
issues_df = pd.read_pickle('issues_df.pkl', 'gzip')
dataset_df = pd.read_pickle('dataset_df.pkl', 'gzip')
dataset_df = dataset_df[dataset_df['tokenized'].map(lambda d: len(d)) > 0]

In [3]:
issues_df = pd.read_csv('./filtered_data/issues_data_etlegacy.csv')
dataset_df = pd.read_csv('./filtered_data/file_data_etlegacy.csv')

In [4]:
dataset_df = dataset_df.sample(frac=1).reset_index(drop=True)
issues_df = issues_df.set_index('id')

In [24]:
dataset_df.head()

Unnamed: 0,filename,file_content,issue,related,tokenized
0,g_systemmsg.c,b'/*\n * Wolfenstein: Enemy Territory GPL Sour...,332,False,"[20, 9937, 9952, 9939, 0, 9917, 62, 5707, 19, ..."
1,be_interface.c,b'/*\n * Wolfenstein: Enemy Territory GPL Sour...,244,False,"[20, 9937, 9952, 9939, 0, 9917, 62, 5707, 19, ..."
2,snd_codec_ogg.c,/**\n * Wolfenstein: Enemy Territory GPL Sourc...,610,True,"[2781, 9939, 9939, 0, 62, 5707, 19, 3521, 9948..."
3,g_missile.c,b'/*\n * Wolfenstein: Enemy Territory GPL Sour...,3,False,"[20, 9937, 9952, 9939, 0, 9917, 62, 5707, 19, ..."
4,Omni-Bot.h,b'////////////////////////////////////////////...,1247,False,"[20, 9937, 9952, 9952, 9952, 9952, 9952, 9952,..."


In [25]:
issues_df.head()

Unnamed: 0_level_0,text,tokenized
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,make cURL optional Author: @JanSimek (Radegast...,"[1166, 838, 9922, 2250, 853, 1305, 9948, 9912,..."
12,Replace Sys_OpenURL Author: @JanSimek (Radegas...,"[3675, 378, 9920, 9976, 126, 19, 67, 9922, 130..."
9,Fix compatibility with Jaymod 2.2.0 Author: @J...,"[4055, 9260, 2758, 97, 6763, 3154, 1672, 246, ..."
13,Improve error message of sv_init.c SV_TouchCGa...,"[5815, 7541, 6049, 27, 10, 9936, 9976, 6, 25, ..."
6,Remove +button4 cmd Author: @IR4T4 (IR4T4)\nDa...,"[7709, 4489, 5274, 460, 9925, 6247, 9923, 1305..."


In [7]:
dataset_df['file_content'] = dataset_df['file_content'].fillna('')
issues_df['title'] = issues_df['title'].fillna('')
issues_df['body'] = issues_df['body'].fillna('')

In [8]:
issues_df['text'] = issues_df['title'] + ' ' + issues_df['body']
issues_df = issues_df[['text']]

In [9]:
def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return bpemb_en.encode_ids(text)

In [15]:
issues_df['tokenized'] = issues_df.apply(lambda row: tokenize(row['text']), axis=1)
ddata = dd.from_pandas(dataset_df, npartitions=30)
dataset_df['tokenized'] = ddata.map_partitions(lambda df: df.apply(lambda row: tokenize(row['file_content']), axis=1)).compute(scheduler='processes')  

In [None]:
vocab = set()
issues_df.apply(lambda x: vocab.update(str(x['title']).split()), axis=1)
issues_df.apply(lambda x: vocab.update(str(x['body']).split()), axis=1)
dataset_df.apply(lambda x: vocab.update(str(x['file_content']).split()), axis=1)
word_to_ix = {word: i for i, word in enumerate(vocab)}

In [189]:
class SourceCodeDataset(Dataset):
    def __init__(self, sourceCode, issues):
        self.source_code = sourceCode
        self.issues = issues
    
    def __len__(self):
        return len(self.source_code)
    
    def __getitem__(self, idx):
        y = 1 if self.source_code.iloc[idx]['related'] == True else 0
        y = tensor(y).float()
        # Return shape : (code, issue, is_related)
        return (tensor(self.source_code.iloc[idx]['tokenized']),
                tensor(self.issues.loc[self.source_code.iloc[idx]['issue']]['tokenized'])), y

In [190]:
dataset = SourceCodeDataset(dataset_df, issues_df)
train_size = int(0.9 * len(dataset))
valid_size = len(dataset) - train_size
train_dataset, valid_dataset = torch.utils.data.random_split(dataset, [train_size, valid_size])

In [270]:
batch_size = 16
# vocab_size = len(words)

def pad_collate(batch):
    (xx, yy) = zip(*batch)
    (x1, x2) = zip(*xx)
    
    x1_lens = [len(x) for x in x1]
    x2_lens = [len(x) for x in x2]
    
    x1_pad = pad_sequence(x1, batch_first=True, padding_value=0)
    x2_pad = pad_sequence(x2, batch_first=True, padding_value=0)
    
    return x1_pad, x2_pad, tensor(yy), x1_lens, x2_lens, len(yy)

train_dl = DataLoader(train_dataset, batch_size=batch_size, 
                      shuffle=True, collate_fn=pad_collate)

val_dl = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=pad_collate)

In [375]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class IssueLanguageModel(nn.Module):
    def __init__(self, embedding_dim=300):
        super(IssueLanguageModel, self).__init__()        
        
        self.embeddings = nn.Embedding.from_pretrained(tensor(bpemb_en.vectors))
        
        self.gru1 = nn.GRU(embedding_dim, 128, batch_first=True)
        self.gru2 = nn.GRU(embedding_dim, 256, batch_first=True)
        
        self.linear = nn.Linear(384, 128)
        self.linear2 = nn.Linear(128, 1)
        
        # Unfreeze embedding layer
        # self.embeddings.weight.requires_grad=True
    def forward(self, issue, source_code, issue_len, code_len):
        issue_em = self.embeddings(issue)
        code_em = self.embeddings(source_code)
        
        issue_packed = pack_padded_sequence(issue_em, issue_len, batch_first=True, enforce_sorted=False)
        code_packed = pack_padded_sequence(code_em, code_len, batch_first=True, enforce_sorted=False)
        
        output1, hidden1 = self.gru1(issue_packed)
        output2, hidden2 = self.gru2(code_packed)
        
        
        hidden = torch.cat((hidden1, hidden2), dim=2)
        
        out = self.linear(hidden[-1])
        out = F.relu(out)
        out = self.linear2(out)
        return out

In [376]:
model = IssueLanguageModel().to(device)
loss_function = nn.BCEWithLogitsLoss(pos_weight=tensor(2))
optimizer = optim.Adam(model.parameters(), 0.001)

In [288]:
def train(epoch):
    losses = []
    for epoch in tqdm(range(epoch)):
        model.train()
        total_loss = 0
        index = 0
        for code, issue, labels, code_len, issue_len, y_len in tqdm(train_dl):
            index += 1
            code = code.long().to(device)
            issue = issue.long().to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad()
            log_probs = model(issue, code, issue_len, code_len)
            
            loss = loss_function(log_probs.view(-1), labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            
            if index%20 == 0:
                print("Loss at {}: {}".format(index, loss.item()))
            if index%50 == 0:
                validation_metrics()
        losses.append(total_loss)
        print("Loss: {}".format(sum(losses)/len(losses)))
    return losses

In [344]:
def validation_metrics():
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0
    with torch.no_grad():
        for code, issue, labels, code_len, issue_len, y_len in tqdm(val_dl):
            code = code.long().to(device)
            issue = issue.long().to(device)
            labels = labels.to(device)
            log_probs = model(issue, code, issue_len, code_len).view(-1)
            loss = loss_function(log_probs, labels)
            
            pred = torch.sigmoid(log_probs)
            correct += (pred==labels).sum()
            total += labels.shape[0]
            sum_loss += loss.item()*labels.shape[0]
            break
    return sum_loss/total, (correct/total).cpu().item() * 100

In [377]:
sum(p.numel() for p in model.parameters() if p.requires_grad)

3643073

In [333]:
loss_function = nn.BCEWithLogitsLoss(pos_weight=tensor(3))
input = tensor([900,-900,-900,900]).float()
target = tensor([1,0,0,0]).float()
loss_function(input, target).item()

225.0

In [289]:
losses = train(200)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=200.0), HTML(value='')))

torch.Size([2, 2225]) torch.Size([2, 2225, 300]) torch.Size([1, 2, 256])
torch.Size([2, 13964]) torch.Size([2, 13964, 300]) torch.Size([1, 2, 256])
torch.Size([2, 512])
torch.Size([2, 1]) torch.Size([2])



KeyboardInterrupt: 

In [None]:
plt.plot(losses)
plt.title('Train BCE Losses')
plt.ylabel('Binary Cross Entropy')
plt.xlabel('Epoch #')
plt.legend(['Train loss'], loc='upper right')
plt.show()

In [None]:
predict = model("delete test")

In [None]:
torch.argmax(predict)

In [176]:
issues_df.to_pickle('issues_df.pkl', 'gzip', 4)

In [175]:
dataset_df.to_pickle('dataset_df.pkl', 'gzip', 4)