In [3]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
############################### imports
from sklearn.metrics import hamming_loss
from sklearn.metrics import jaccard_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn import metrics
from sklearn.svm import LinearSVC
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
import seaborn as sns
import shutil, sys
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
import torch
import os

In [4]:

import warnings

warnings.filterwarnings("ignore")


# Read CSV files to get questions and tags
df_questions = pd.read_csv("./Questions.csv", encoding="ISO-8859-1")
df_tags = pd.read_csv("./Tags.csv", encoding="ISO-8859-1", dtype={'Tag': str})


df_questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


In [5]:
# Group tags by id and join them
df_tags['Tag'] = df_tags['Tag'].astype(str)
grouped_tags = df_tags.groupby("Id")['Tag'].apply(lambda tags: ' '.join(tags))
# Reset index for making simpler dataframe
grouped_tags.reset_index()
grouped_tags_final = pd.DataFrame({'Id':grouped_tags.index, 'Tags':grouped_tags.values})
# Drop unnecessary columns
df_questions.drop(columns=['OwnerUserId', 'CreationDate', 'ClosedDate'], inplace=True)

# Merge questions and tags into one dataframe
df = df_questions.merge(grouped_tags_final, on='Id')

In [6]:
import nltk

# Filter out questions with a score lower than 5
new_df = df[df['Score']>5]

# Split tags in order to get a list of tags
new_df['Tags'] = new_df['Tags'].apply(lambda x: x.split())
all_tags = [item for sublist in new_df['Tags'].values for item in sublist]

flat_list = [item for sublist in new_df['Tags'].values for item in sublist]

keywords = nltk.FreqDist(flat_list)
keywords = nltk.FreqDist(keywords)

# Get most frequent tags
frequencies_words = keywords.most_common(25)
tags_features = [word[0] for word in frequencies_words]
# Drop unnecessary columns at this point
new_df.drop(columns=['Id', 'Score'], inplace=True)
print(tags_features)

['c#', 'java', 'javascript', 'android', 'python', 'c++', 'php', 'jquery', '.net', 'ios', 'html', 'css', 'c', 'iphone', 'objective-c', 'ruby-on-rails', 'sql', 'asp.net', 'mysql', 'ruby', 'r', 'git', 'asp.net-mvc', 'linux', 'sql-server']


In [7]:
def most_common(tags):
    """Function to check if tag is in most common tag list"""
    tags_filtered = []
    for i in range(0, len(tags)):
        if tags[i] in tags_features:
            tags_filtered.append(tags[i])
    return tags_filtered

# Change Tags column into None for questions that don't have a most common tag
new_df['Tags'] = new_df['Tags'].apply(lambda x: most_common(x))
new_df['Tags'] = new_df['Tags'].apply(lambda x: x if len(x)>0 else None)

# Drop rows that contain None in Tags column
new_df.dropna(subset=['Tags'], inplace=True)
new_df.shape

(52418, 3)

In [8]:
#First, you're going to need to import wordnet:
import nltk

nltk.download('wordnet')
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nisar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
from bs4 import BeautifulSoup
import lxml
import re

from nltk.corpus import stopwords
from nltk.tokenize import ToktokTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

# Filter out HTML
new_df['Body'] = new_df['Body'].apply(lambda x: BeautifulSoup(x, "html.parser").get_text()) 

token = ToktokTokenizer()
lemma = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def strip_list_noempty(mylist):
    newlist = (item.strip() if hasattr(item, 'strip') else item for item in mylist)
    return [item for item in newlist if item != '']

def removeStopWords(text):
    words = token.tokenize(text)
    filtered = [w for w in words if not w in stop_words]
    return ' '.join(map(str, filtered))

def removePunctuation(text):
    punct = '!"$%&\'()*,./:;<=>?@[\\]^_`{|}~'
    words=token.tokenize(text)
    punctuation_filtered = []
    regex = re.compile('[%s]' % re.escape(punct))
    remove_punctuation = str.maketrans(' ', ' ', punct)
    for w in words:
        if w in tags_features:
            punctuation_filtered.append(w)
        else:
            punctuation_filtered.append(regex.sub('', w))
  
    filtered_list = strip_list_noempty(punctuation_filtered)
        
    return ' '.join(map(str, filtered_list))

def lemmatizeWords(text):
    words=token.tokenize(text)
    listLemma=[]
    for w in words:
        x=lemma.lemmatize(w, pos="v")
        listLemma.append(x.lower())
    return ' '.join(map(str, listLemma))


# Remove stopwords, punctuation and lemmatize for text in body
new_df['Body'] = new_df['Body'].apply(lambda x: removeStopWords(x))
new_df['Body'] = new_df['Body'].apply(lambda x: removePunctuation(x))
new_df['Body'] = new_df['Body'].apply(lambda x: lemmatizeWords(x))

# Remove stopwords, punctuation and lemmatize for title. Also weight title 3 times
new_df['Title'] = new_df['Title'].apply(lambda x: str(x)) 
new_df['Title'] = new_df['Title'].apply(lambda x: removePunctuation(x)) 
new_df['Title'] = new_df['Title'].apply(lambda x: removeStopWords(x)) 
new_df['Title'] = new_df['Title'].apply(lambda x: lemmatizeWords(x)) 
new_df['Title'] = new_df['Title'].apply(lambda x: ' '.join(x.split()*3))
new_df['Title']

2          aspnet site maps aspnet site maps aspnet site ...
4          adding script functionality net applications a...
5          should i use nest class case should i use nest...
6          homegrown consumption web service homegrown co...
7          deploying sql server databases test live deplo...
                                 ...                        
1262668    using lambda default initializer gcc vs clang ...
1262834    stl list bad performance stl list bad performa...
1262915    how use dict subset dataframe how use dict sub...
1263065    is way use itertools python clean nest iterati...
1263454    why result data return void get break why resu...
Name: Title, Length: 52418, dtype: object

In [10]:
############################### CONFIG
MAX_LEN = 225
TRAIN_BATCH_SIZE = 36
VALID_BATCH_SIZE = 36
EPOCHS = 5
LEARNING_RATE = 1e-05

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

In [11]:
################################ INIT
warnings.simplefilter("ignore")
sns.set_style("darkgrid")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [12]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3050 Laptop GPU


In [13]:
new_df.head()

Unnamed: 0,Title,Body,Tags
2,aspnet site maps aspnet site maps aspnet site maps,has anyone get experience create sql-based aspnet site-map providers i get default xml file websitemap work properly menu sitemappath control i need way users site create modify page dynamically i need tie page view permissions standard aspnet membership system well,"[sql, asp.net]"
4,adding script functionality net applications adding script functionality net applications adding script functionality net applications,i little game write c# it use database back-end it trade card game i want implement function card script what i mean i essentially interface icard card class implement public class card056 icard contain function call game now make thing maintainablemoddable i would like class card source code database essentially compile first use so i addchange card i add database tell application refresh without need assembly deployment especially since would talk 1 assembly per card mean hundreds assemblies is possible register class source file instantiate etc icard cards current new mygamecardlibrarycard056 cards current onenterplay ref currentgamestate the language c# extra bonus possible write script net language,"[c#, .net]"
5,should i use nest class case should i use nest class case should i use nest class case,i work collection class use video playback record i one main class act like public interface methods like play stop pause record etc then i workhorse class video decode video encode i learn existence nest class c++ i curious know programmers think use them i little wary really sure benefitsdrawbacks seem accord book i read use case mine the book suggest scenario like mine good solution would nest workhorse class inside interface class separate file class client mean use avoid possible name conflict i know justifications nested class new concept me just want see programmers think issue,[c++]
6,homegrown consumption web service homegrown consumption web service homegrown consumption web service,i write web service .net app i ready consume them i see numerous examples homegrown code consume service oppose use auto generate methods visual studio create add web reference is advantage,[.net]
7,deploying sql server databases test live deploying sql server databases test live deploying sql server databases test live,i wonder guy manage deployment database 2 sql servers specifically sql server 2005 now development live one as part buildscript standard windows batch even current complexity script might switch powershell later enterprise managermanagement studio express count would copy mdf file attach i always bite careful work binary data seem compatiblity issue even though development live run version server time or - give lack explain create table t-sql - something export exist database sql-scripts run target server if yes tool automatically dump give database sql queries run command line again enterprise managermanagement studio express count and lastly - give fact live database already contain data deployment may involve create table rather check difference structure alter table live ones instead may also need data verificationconversion exist field change now hear lot great stuff red gate products hobby project price bite steep so use automatically deploy sql server databases test live,[sql-server]


In [14]:
new_df['Combo'] = new_df['Title'] + ". " + new_df['Body']

In [15]:
new_df.head()

Unnamed: 0,Title,Body,Tags,Combo
2,aspnet site maps aspnet site maps aspnet site maps,has anyone get experience create sql-based aspnet site-map providers i get default xml file websitemap work properly menu sitemappath control i need way users site create modify page dynamically i need tie page view permissions standard aspnet membership system well,"[sql, asp.net]",aspnet site maps aspnet site maps aspnet site maps. has anyone get experience create sql-based aspnet site-map providers i get default xml file websitemap work properly menu sitemappath control i need way users site create modify page dynamically i need tie page view permissions standard aspnet membership system well
4,adding script functionality net applications adding script functionality net applications adding script functionality net applications,i little game write c# it use database back-end it trade card game i want implement function card script what i mean i essentially interface icard card class implement public class card056 icard contain function call game now make thing maintainablemoddable i would like class card source code database essentially compile first use so i addchange card i add database tell application refresh without need assembly deployment especially since would talk 1 assembly per card mean hundreds assemblies is possible register class source file instantiate etc icard cards current new mygamecardlibrarycard056 cards current onenterplay ref currentgamestate the language c# extra bonus possible write script net language,"[c#, .net]",adding script functionality net applications adding script functionality net applications adding script functionality net applications. i little game write c# it use database back-end it trade card game i want implement function card script what i mean i essentially interface icard card class implement public class card056 icard contain function call game now make thing maintainablemoddable i would like class card source code database essentially compile first use so i addchange card i add database tell application refresh without need assembly deployment especially since would talk 1 assembly per card mean hundreds assemblies is possible register class source file instantiate etc icard cards current new mygamecardlibrarycard056 cards current onenterplay ref currentgamestate the language c# extra bonus possible write script net language
5,should i use nest class case should i use nest class case should i use nest class case,i work collection class use video playback record i one main class act like public interface methods like play stop pause record etc then i workhorse class video decode video encode i learn existence nest class c++ i curious know programmers think use them i little wary really sure benefitsdrawbacks seem accord book i read use case mine the book suggest scenario like mine good solution would nest workhorse class inside interface class separate file class client mean use avoid possible name conflict i know justifications nested class new concept me just want see programmers think issue,[c++],should i use nest class case should i use nest class case should i use nest class case. i work collection class use video playback record i one main class act like public interface methods like play stop pause record etc then i workhorse class video decode video encode i learn existence nest class c++ i curious know programmers think use them i little wary really sure benefitsdrawbacks seem accord book i read use case mine the book suggest scenario like mine good solution would nest workhorse class inside interface class separate file class client mean use avoid possible name conflict i know justifications nested class new concept me just want see programmers think issue
6,homegrown consumption web service homegrown consumption web service homegrown consumption web service,i write web service .net app i ready consume them i see numerous examples homegrown code consume service oppose use auto generate methods visual studio create add web reference is advantage,[.net],homegrown consumption web service homegrown consumption web service homegrown consumption web service. i write web service .net app i ready consume them i see numerous examples homegrown code consume service oppose use auto generate methods visual studio create add web reference is advantage
7,deploying sql server databases test live deploying sql server databases test live deploying sql server databases test live,i wonder guy manage deployment database 2 sql servers specifically sql server 2005 now development live one as part buildscript standard windows batch even current complexity script might switch powershell later enterprise managermanagement studio express count would copy mdf file attach i always bite careful work binary data seem compatiblity issue even though development live run version server time or - give lack explain create table t-sql - something export exist database sql-scripts run target server if yes tool automatically dump give database sql queries run command line again enterprise managermanagement studio express count and lastly - give fact live database already contain data deployment may involve create table rather check difference structure alter table live ones instead may also need data verificationconversion exist field change now hear lot great stuff red gate products hobby project price bite steep so use automatically deploy sql server databases test live,[sql-server],deploying sql server databases test live deploying sql server databases test live deploying sql server databases test live. i wonder guy manage deployment database 2 sql servers specifically sql server 2005 now development live one as part buildscript standard windows batch even current complexity script might switch powershell later enterprise managermanagement studio express count would copy mdf file attach i always bite careful work binary data seem compatiblity issue even though development live run version server time or - give lack explain create table t-sql - something export exist database sql-scripts run target server if yes tool automatically dump give database sql queries run command line again enterprise managermanagement studio express count and lastly - give fact live database already contain data deployment may involve create table rather check difference structure alter table live ones instead may also need data verificationconversion exist field change now hear lot great stuff red gate products hobby project price bite steep so use automatically deploy sql server databases test live


In [16]:
############################### BINARIZATION
mlb = MultiLabelBinarizer()
tag_df = pd.DataFrame(mlb.fit_transform(new_df['Tags']), columns=mlb.classes_, index=new_df.index)
class_names = mlb.classes_

In [17]:
class_names

array(['.net', 'android', 'asp.net', 'asp.net-mvc', 'c', 'c#', 'c++',
       'css', 'git', 'html', 'ios', 'iphone', 'java', 'javascript',
       'jquery', 'linux', 'mysql', 'objective-c', 'php', 'python', 'r',
       'ruby', 'ruby-on-rails', 'sql', 'sql-server'], dtype=object)

In [19]:
len(class_names)

25

In [18]:
df = new_df
df.head()

Unnamed: 0,Title,Body,Tags,Combo
2,aspnet site maps aspnet site maps aspnet site maps,has anyone get experience create sql-based aspnet site-map providers i get default xml file websitemap work properly menu sitemappath control i need way users site create modify page dynamically i need tie page view permissions standard aspnet membership system well,"[sql, asp.net]",aspnet site maps aspnet site maps aspnet site maps. has anyone get experience create sql-based aspnet site-map providers i get default xml file websitemap work properly menu sitemappath control i need way users site create modify page dynamically i need tie page view permissions standard aspnet membership system well
4,adding script functionality net applications adding script functionality net applications adding script functionality net applications,i little game write c# it use database back-end it trade card game i want implement function card script what i mean i essentially interface icard card class implement public class card056 icard contain function call game now make thing maintainablemoddable i would like class card source code database essentially compile first use so i addchange card i add database tell application refresh without need assembly deployment especially since would talk 1 assembly per card mean hundreds assemblies is possible register class source file instantiate etc icard cards current new mygamecardlibrarycard056 cards current onenterplay ref currentgamestate the language c# extra bonus possible write script net language,"[c#, .net]",adding script functionality net applications adding script functionality net applications adding script functionality net applications. i little game write c# it use database back-end it trade card game i want implement function card script what i mean i essentially interface icard card class implement public class card056 icard contain function call game now make thing maintainablemoddable i would like class card source code database essentially compile first use so i addchange card i add database tell application refresh without need assembly deployment especially since would talk 1 assembly per card mean hundreds assemblies is possible register class source file instantiate etc icard cards current new mygamecardlibrarycard056 cards current onenterplay ref currentgamestate the language c# extra bonus possible write script net language
5,should i use nest class case should i use nest class case should i use nest class case,i work collection class use video playback record i one main class act like public interface methods like play stop pause record etc then i workhorse class video decode video encode i learn existence nest class c++ i curious know programmers think use them i little wary really sure benefitsdrawbacks seem accord book i read use case mine the book suggest scenario like mine good solution would nest workhorse class inside interface class separate file class client mean use avoid possible name conflict i know justifications nested class new concept me just want see programmers think issue,[c++],should i use nest class case should i use nest class case should i use nest class case. i work collection class use video playback record i one main class act like public interface methods like play stop pause record etc then i workhorse class video decode video encode i learn existence nest class c++ i curious know programmers think use them i little wary really sure benefitsdrawbacks seem accord book i read use case mine the book suggest scenario like mine good solution would nest workhorse class inside interface class separate file class client mean use avoid possible name conflict i know justifications nested class new concept me just want see programmers think issue
6,homegrown consumption web service homegrown consumption web service homegrown consumption web service,i write web service .net app i ready consume them i see numerous examples homegrown code consume service oppose use auto generate methods visual studio create add web reference is advantage,[.net],homegrown consumption web service homegrown consumption web service homegrown consumption web service. i write web service .net app i ready consume them i see numerous examples homegrown code consume service oppose use auto generate methods visual studio create add web reference is advantage
7,deploying sql server databases test live deploying sql server databases test live deploying sql server databases test live,i wonder guy manage deployment database 2 sql servers specifically sql server 2005 now development live one as part buildscript standard windows batch even current complexity script might switch powershell later enterprise managermanagement studio express count would copy mdf file attach i always bite careful work binary data seem compatiblity issue even though development live run version server time or - give lack explain create table t-sql - something export exist database sql-scripts run target server if yes tool automatically dump give database sql queries run command line again enterprise managermanagement studio express count and lastly - give fact live database already contain data deployment may involve create table rather check difference structure alter table live ones instead may also need data verificationconversion exist field change now hear lot great stuff red gate products hobby project price bite steep so use automatically deploy sql server databases test live,[sql-server],deploying sql server databases test live deploying sql server databases test live deploying sql server databases test live. i wonder guy manage deployment database 2 sql servers specifically sql server 2005 now development live one as part buildscript standard windows batch even current complexity script might switch powershell later enterprise managermanagement studio express count would copy mdf file attach i always bite careful work binary data seem compatiblity issue even though development live run version server time or - give lack explain create table t-sql - something export exist database sql-scripts run target server if yes tool automatically dump give database sql queries run command line again enterprise managermanagement studio express count and lastly - give fact live database already contain data deployment may involve create table rather check difference structure alter table live ones instead may also need data verificationconversion exist field change now hear lot great stuff red gate products hobby project price bite steep so use automatically deploy sql server databases test live


In [20]:
############################### DATAFRAME HOUSEKEEPING
df = df.join(tag_df)
df = df.drop(columns='Tags')
df.head()

Unnamed: 0,Title,Body,Combo,.net,android,asp.net,asp.net-mvc,c,c#,c++,css,git,html,ios,iphone,java,javascript,jquery,linux,mysql,objective-c,php,python,r,ruby,ruby-on-rails,sql,sql-server
2,aspnet site maps aspnet site maps aspnet site maps,has anyone get experience create sql-based aspnet site-map providers i get default xml file websitemap work properly menu sitemappath control i need way users site create modify page dynamically i need tie page view permissions standard aspnet membership system well,aspnet site maps aspnet site maps aspnet site maps. has anyone get experience create sql-based aspnet site-map providers i get default xml file websitemap work properly menu sitemappath control i need way users site create modify page dynamically i need tie page view permissions standard aspnet membership system well,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,adding script functionality net applications adding script functionality net applications adding script functionality net applications,i little game write c# it use database back-end it trade card game i want implement function card script what i mean i essentially interface icard card class implement public class card056 icard contain function call game now make thing maintainablemoddable i would like class card source code database essentially compile first use so i addchange card i add database tell application refresh without need assembly deployment especially since would talk 1 assembly per card mean hundreds assemblies is possible register class source file instantiate etc icard cards current new mygamecardlibrarycard056 cards current onenterplay ref currentgamestate the language c# extra bonus possible write script net language,adding script functionality net applications adding script functionality net applications adding script functionality net applications. i little game write c# it use database back-end it trade card game i want implement function card script what i mean i essentially interface icard card class implement public class card056 icard contain function call game now make thing maintainablemoddable i would like class card source code database essentially compile first use so i addchange card i add database tell application refresh without need assembly deployment especially since would talk 1 assembly per card mean hundreds assemblies is possible register class source file instantiate etc icard cards current new mygamecardlibrarycard056 cards current onenterplay ref currentgamestate the language c# extra bonus possible write script net language,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,should i use nest class case should i use nest class case should i use nest class case,i work collection class use video playback record i one main class act like public interface methods like play stop pause record etc then i workhorse class video decode video encode i learn existence nest class c++ i curious know programmers think use them i little wary really sure benefitsdrawbacks seem accord book i read use case mine the book suggest scenario like mine good solution would nest workhorse class inside interface class separate file class client mean use avoid possible name conflict i know justifications nested class new concept me just want see programmers think issue,should i use nest class case should i use nest class case should i use nest class case. i work collection class use video playback record i one main class act like public interface methods like play stop pause record etc then i workhorse class video decode video encode i learn existence nest class c++ i curious know programmers think use them i little wary really sure benefitsdrawbacks seem accord book i read use case mine the book suggest scenario like mine good solution would nest workhorse class inside interface class separate file class client mean use avoid possible name conflict i know justifications nested class new concept me just want see programmers think issue,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,homegrown consumption web service homegrown consumption web service homegrown consumption web service,i write web service .net app i ready consume them i see numerous examples homegrown code consume service oppose use auto generate methods visual studio create add web reference is advantage,homegrown consumption web service homegrown consumption web service homegrown consumption web service. i write web service .net app i ready consume them i see numerous examples homegrown code consume service oppose use auto generate methods visual studio create add web reference is advantage,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,deploying sql server databases test live deploying sql server databases test live deploying sql server databases test live,i wonder guy manage deployment database 2 sql servers specifically sql server 2005 now development live one as part buildscript standard windows batch even current complexity script might switch powershell later enterprise managermanagement studio express count would copy mdf file attach i always bite careful work binary data seem compatiblity issue even though development live run version server time or - give lack explain create table t-sql - something export exist database sql-scripts run target server if yes tool automatically dump give database sql queries run command line again enterprise managermanagement studio express count and lastly - give fact live database already contain data deployment may involve create table rather check difference structure alter table live ones instead may also need data verificationconversion exist field change now hear lot great stuff red gate products hobby project price bite steep so use automatically deploy sql server databases test live,deploying sql server databases test live deploying sql server databases test live deploying sql server databases test live. i wonder guy manage deployment database 2 sql servers specifically sql server 2005 now development live one as part buildscript standard windows batch even current complexity script might switch powershell later enterprise managermanagement studio express count would copy mdf file attach i always bite careful work binary data seem compatiblity issue even though development live run version server time or - give lack explain create table t-sql - something export exist database sql-scripts run target server if yes tool automatically dump give database sql queries run command line again enterprise managermanagement studio express count and lastly - give fact live database already contain data deployment may involve create table rather check difference structure alter table live ones instead may also need data verificationconversion exist field change now hear lot great stuff red gate products hobby project price bite steep so use automatically deploy sql server databases test live,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [21]:
len(df.columns)

28

In [22]:
df['target_list'] = df.iloc[:, 3:].values.tolist()

In [23]:
df.head()

Unnamed: 0,Title,Body,Combo,.net,android,asp.net,asp.net-mvc,c,c#,c++,css,git,html,ios,iphone,java,javascript,jquery,linux,mysql,objective-c,php,python,r,ruby,ruby-on-rails,sql,sql-server,target_list
2,aspnet site maps aspnet site maps aspnet site maps,has anyone get experience create sql-based aspnet site-map providers i get default xml file websitemap work properly menu sitemappath control i need way users site create modify page dynamically i need tie page view permissions standard aspnet membership system well,aspnet site maps aspnet site maps aspnet site maps. has anyone get experience create sql-based aspnet site-map providers i get default xml file websitemap work properly menu sitemappath control i need way users site create modify page dynamically i need tie page view permissions standard aspnet membership system well,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"
4,adding script functionality net applications adding script functionality net applications adding script functionality net applications,i little game write c# it use database back-end it trade card game i want implement function card script what i mean i essentially interface icard card class implement public class card056 icard contain function call game now make thing maintainablemoddable i would like class card source code database essentially compile first use so i addchange card i add database tell application refresh without need assembly deployment especially since would talk 1 assembly per card mean hundreds assemblies is possible register class source file instantiate etc icard cards current new mygamecardlibrarycard056 cards current onenterplay ref currentgamestate the language c# extra bonus possible write script net language,adding script functionality net applications adding script functionality net applications adding script functionality net applications. i little game write c# it use database back-end it trade card game i want implement function card script what i mean i essentially interface icard card class implement public class card056 icard contain function call game now make thing maintainablemoddable i would like class card source code database essentially compile first use so i addchange card i add database tell application refresh without need assembly deployment especially since would talk 1 assembly per card mean hundreds assemblies is possible register class source file instantiate etc icard cards current new mygamecardlibrarycard056 cards current onenterplay ref currentgamestate the language c# extra bonus possible write script net language,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5,should i use nest class case should i use nest class case should i use nest class case,i work collection class use video playback record i one main class act like public interface methods like play stop pause record etc then i workhorse class video decode video encode i learn existence nest class c++ i curious know programmers think use them i little wary really sure benefitsdrawbacks seem accord book i read use case mine the book suggest scenario like mine good solution would nest workhorse class inside interface class separate file class client mean use avoid possible name conflict i know justifications nested class new concept me just want see programmers think issue,should i use nest class case should i use nest class case should i use nest class case. i work collection class use video playback record i one main class act like public interface methods like play stop pause record etc then i workhorse class video decode video encode i learn existence nest class c++ i curious know programmers think use them i little wary really sure benefitsdrawbacks seem accord book i read use case mine the book suggest scenario like mine good solution would nest workhorse class inside interface class separate file class client mean use avoid possible name conflict i know justifications nested class new concept me just want see programmers think issue,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
6,homegrown consumption web service homegrown consumption web service homegrown consumption web service,i write web service .net app i ready consume them i see numerous examples homegrown code consume service oppose use auto generate methods visual studio create add web reference is advantage,homegrown consumption web service homegrown consumption web service homegrown consumption web service. i write web service .net app i ready consume them i see numerous examples homegrown code consume service oppose use auto generate methods visual studio create add web reference is advantage,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,deploying sql server databases test live deploying sql server databases test live deploying sql server databases test live,i wonder guy manage deployment database 2 sql servers specifically sql server 2005 now development live one as part buildscript standard windows batch even current complexity script might switch powershell later enterprise managermanagement studio express count would copy mdf file attach i always bite careful work binary data seem compatiblity issue even though development live run version server time or - give lack explain create table t-sql - something export exist database sql-scripts run target server if yes tool automatically dump give database sql queries run command line again enterprise managermanagement studio express count and lastly - give fact live database already contain data deployment may involve create table rather check difference structure alter table live ones instead may also need data verificationconversion exist field change now hear lot great stuff red gate products hobby project price bite steep so use automatically deploy sql server databases test live,deploying sql server databases test live deploying sql server databases test live deploying sql server databases test live. i wonder guy manage deployment database 2 sql servers specifically sql server 2005 now development live one as part buildscript standard windows batch even current complexity script might switch powershell later enterprise managermanagement studio express count would copy mdf file attach i always bite careful work binary data seem compatiblity issue even though development live run version server time or - give lack explain create table t-sql - something export exist database sql-scripts run target server if yes tool automatically dump give database sql queries run command line again enterprise managermanagement studio express count and lastly - give fact live database already contain data deployment may involve create table rather check difference structure alter table live ones instead may also need data verificationconversion exist field change now hear lot great stuff red gate products hobby project price bite steep so use automatically deploy sql server databases test live,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"


In [24]:
df = df.drop(df.columns[3:-1], axis=1)
df.head()

Unnamed: 0,Title,Body,Combo,target_list
2,aspnet site maps aspnet site maps aspnet site maps,has anyone get experience create sql-based aspnet site-map providers i get default xml file websitemap work properly menu sitemappath control i need way users site create modify page dynamically i need tie page view permissions standard aspnet membership system well,aspnet site maps aspnet site maps aspnet site maps. has anyone get experience create sql-based aspnet site-map providers i get default xml file websitemap work properly menu sitemappath control i need way users site create modify page dynamically i need tie page view permissions standard aspnet membership system well,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"
4,adding script functionality net applications adding script functionality net applications adding script functionality net applications,i little game write c# it use database back-end it trade card game i want implement function card script what i mean i essentially interface icard card class implement public class card056 icard contain function call game now make thing maintainablemoddable i would like class card source code database essentially compile first use so i addchange card i add database tell application refresh without need assembly deployment especially since would talk 1 assembly per card mean hundreds assemblies is possible register class source file instantiate etc icard cards current new mygamecardlibrarycard056 cards current onenterplay ref currentgamestate the language c# extra bonus possible write script net language,adding script functionality net applications adding script functionality net applications adding script functionality net applications. i little game write c# it use database back-end it trade card game i want implement function card script what i mean i essentially interface icard card class implement public class card056 icard contain function call game now make thing maintainablemoddable i would like class card source code database essentially compile first use so i addchange card i add database tell application refresh without need assembly deployment especially since would talk 1 assembly per card mean hundreds assemblies is possible register class source file instantiate etc icard cards current new mygamecardlibrarycard056 cards current onenterplay ref currentgamestate the language c# extra bonus possible write script net language,"[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5,should i use nest class case should i use nest class case should i use nest class case,i work collection class use video playback record i one main class act like public interface methods like play stop pause record etc then i workhorse class video decode video encode i learn existence nest class c++ i curious know programmers think use them i little wary really sure benefitsdrawbacks seem accord book i read use case mine the book suggest scenario like mine good solution would nest workhorse class inside interface class separate file class client mean use avoid possible name conflict i know justifications nested class new concept me just want see programmers think issue,should i use nest class case should i use nest class case should i use nest class case. i work collection class use video playback record i one main class act like public interface methods like play stop pause record etc then i workhorse class video decode video encode i learn existence nest class c++ i curious know programmers think use them i little wary really sure benefitsdrawbacks seem accord book i read use case mine the book suggest scenario like mine good solution would nest workhorse class inside interface class separate file class client mean use avoid possible name conflict i know justifications nested class new concept me just want see programmers think issue,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
6,homegrown consumption web service homegrown consumption web service homegrown consumption web service,i write web service .net app i ready consume them i see numerous examples homegrown code consume service oppose use auto generate methods visual studio create add web reference is advantage,homegrown consumption web service homegrown consumption web service homegrown consumption web service. i write web service .net app i ready consume them i see numerous examples homegrown code consume service oppose use auto generate methods visual studio create add web reference is advantage,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,deploying sql server databases test live deploying sql server databases test live deploying sql server databases test live,i wonder guy manage deployment database 2 sql servers specifically sql server 2005 now development live one as part buildscript standard windows batch even current complexity script might switch powershell later enterprise managermanagement studio express count would copy mdf file attach i always bite careful work binary data seem compatiblity issue even though development live run version server time or - give lack explain create table t-sql - something export exist database sql-scripts run target server if yes tool automatically dump give database sql queries run command line again enterprise managermanagement studio express count and lastly - give fact live database already contain data deployment may involve create table rather check difference structure alter table live ones instead may also need data verificationconversion exist field change now hear lot great stuff red gate products hobby project price bite steep so use automatically deploy sql server databases test live,deploying sql server databases test live deploying sql server databases test live deploying sql server databases test live. i wonder guy manage deployment database 2 sql servers specifically sql server 2005 now development live one as part buildscript standard windows batch even current complexity script might switch powershell later enterprise managermanagement studio express count would copy mdf file attach i always bite careful work binary data seem compatiblity issue even though development live run version server time or - give lack explain create table t-sql - something export exist database sql-scripts run target server if yes tool automatically dump give database sql queries run command line again enterprise managermanagement studio express count and lastly - give fact live database already contain data deployment may involve create table rather check difference structure alter table live ones instead may also need data verificationconversion exist field change now hear lot great stuff red gate products hobby project price bite steep so use automatically deploy sql server databases test live,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"


In [25]:
df = df.drop(df.columns[0:2], axis=1)
df.head()

Unnamed: 0,Combo,target_list
2,aspnet site maps aspnet site maps aspnet site maps. has anyone get experience create sql-based aspnet site-map providers i get default xml file websitemap work properly menu sitemappath control i need way users site create modify page dynamically i need tie page view permissions standard aspnet membership system well,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"
4,adding script functionality net applications adding script functionality net applications adding script functionality net applications. i little game write c# it use database back-end it trade card game i want implement function card script what i mean i essentially interface icard card class implement public class card056 icard contain function call game now make thing maintainablemoddable i would like class card source code database essentially compile first use so i addchange card i add database tell application refresh without need assembly deployment especially since would talk 1 assembly per card mean hundreds assemblies is possible register class source file instantiate etc icard cards current new mygamecardlibrarycard056 cards current onenterplay ref currentgamestate the language c# extra bonus possible write script net language,"[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5,should i use nest class case should i use nest class case should i use nest class case. i work collection class use video playback record i one main class act like public interface methods like play stop pause record etc then i workhorse class video decode video encode i learn existence nest class c++ i curious know programmers think use them i little wary really sure benefitsdrawbacks seem accord book i read use case mine the book suggest scenario like mine good solution would nest workhorse class inside interface class separate file class client mean use avoid possible name conflict i know justifications nested class new concept me just want see programmers think issue,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
6,homegrown consumption web service homegrown consumption web service homegrown consumption web service. i write web service .net app i ready consume them i see numerous examples homegrown code consume service oppose use auto generate methods visual studio create add web reference is advantage,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,deploying sql server databases test live deploying sql server databases test live deploying sql server databases test live. i wonder guy manage deployment database 2 sql servers specifically sql server 2005 now development live one as part buildscript standard windows batch even current complexity script might switch powershell later enterprise managermanagement studio express count would copy mdf file attach i always bite careful work binary data seem compatiblity issue even though development live run version server time or - give lack explain create table t-sql - something export exist database sql-scripts run target server if yes tool automatically dump give database sql queries run command line again enterprise managermanagement studio express count and lastly - give fact live database already contain data deployment may involve create table rather check difference structure alter table live ones instead may also need data verificationconversion exist field change now hear lot great stuff red gate products hobby project price bite steep so use automatically deploy sql server databases test live,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"


In [26]:
############################### SPLIT
# cross checking that my train and test split is exatcly the same with
# the train test split i did for the model A.
# reason for the check is the difference in data structures (df vs array)
# (for example pd.sample(random_state=0) returns different split than sklearn for the same state)

# so the split is 80/20 for train-val/test
# and another 80/20 for train/val
# so train: 72%, val 8%, and test 20%

# Splitting the dataframe
train_dataset, test_dataset = train_test_split(df, test_size=0.2, random_state=0)
train_dataset, val_dataset = train_test_split(train_dataset, test_size=0.2, random_state=0)

train_dataset = train_dataset.reset_index(drop=True)
val_dataset = val_dataset.reset_index(drop=True)
test_dataset = test_dataset.reset_index(drop=True)

# Xy_train, Xy_test = train_test_split(df, test_size = 0.2, random_state = 0)
# Xy_train, Xy_val = train_test_split(Xy_train, test_size = 0.1, random_state = 0)

# # Resetting the indices
# train_dataset = test_dataset.reset_index(drop=True)
# val_dataset = valid_dataset.reset_index(drop=True)


# DEBUG
# print(Xy_train.head(1))
# print(Xy_test.head(1))
# quit()

print("[PROGRAM]: full-set shape: {}".format(df.shape))
print("[PROGRAM]: train-set shape: {}".format(train_dataset.shape))
print("[PROGRAM]: val-set shape: {}".format(val_dataset.shape))
print("[PROGRAM]: test-set shape: {}".format(test_dataset.shape))

[PROGRAM]: full-set shape: (52418, 2)
[PROGRAM]: train-set shape: (33547, 2)
[PROGRAM]: val-set shape: (8387, 2)
[PROGRAM]: test-set shape: (10484, 2)


In [27]:
############################### TORCH DATASET
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.combo = dataframe['Combo']
        self.targets = self.data.target_list
        self.max_len = max_len

    def __len__(self):
        return len(self.combo)

    def __getitem__(self, index):
        combo = str(self.combo[index])
        combo = " ".join(combo.split())

        inputs = self.tokenizer.encode_plus(
            combo,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }


train_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
val_set = CustomDataset(val_dataset, tokenizer, MAX_LEN)
test_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

# DEBUG
# print(train_set[0])

In [28]:
############################### TORCH DATALOADER
training_loader = DataLoader(train_set, **train_params)
validation_loader = DataLoader(val_set, **test_params)
test_loader = DataLoader(test_set, **test_params)

len(training_loader)

932

In [29]:
############################### TRAIN FUNCS
# chckpoint and save funcs from here
# https://towardsdatascience.com/how-to-save-and-load-a-model-in-pytorch-with-a-complete-example-c2920e617dee

def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

def load_ckp(checkpoint_fpath, model, optimizer):
    # load check point
    # initialize state_dict from checkpoint to model
    # initialize optimizer from checkpoint to optimizer
    # initialize valid_loss_min from checkpoint to valid_loss_min
    # return model, optimizer, epoch value, min validation loss

    checkpoint = torch.load(checkpoint_fpath)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['valid_loss_min']
    return model, optimizer, checkpoint['epoch'], valid_loss_min

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    # save checkpoint data to the path given, checkpoint_path
    # if it is a best model, min validation loss
    # copy that checkpoint file to best path given, best_model_path

    f_path = checkpoint_path
    torch.save(state, f_path)
    if is_best:
        best_fpath = best_model_path
        shutil.copyfile(f_path, best_fpath)

In [30]:
############################### MODEL
# base : bert
# extra dropout + linear layer
# ending in 100 neurons, just like our classes
# after i extract the propabillities of each of the 100 neurons
# i select the proba >0.5 and bin the results to (0,1) (like sigmoid but manual)

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased', return_dict=False)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 25)

    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)
print("[INFO]: model loaded to device")



[INFO]: model loaded to device


In [31]:
############################### LOOP
# globals
val_targets=[]
val_outputs=[]
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

print("[INFO]: starting training")
def train_model(start_epochs,  n_epochs, valid_loss_min_input,
                training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path):

    # initialize tracker for minimum validation loss
    valid_loss_min = valid_loss_min_input

    for epoch in range(start_epochs, n_epochs+1):
        train_loss = 0
        valid_loss = 0

        model.train()
        print('[PROGRAM]: epoch', epoch)
        print('[PROGRAM]: TRAINING START')
        for batch_idx, data in enumerate(training_loader):
            print('[PROGRAM]: TRAINING batch ', batch_idx, ' /932')
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            optimizer.zero_grad()
            loss = loss_fn(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

        model.eval()
        print('[PROGRAM]: epoch', epoch)
        print('[PROGRAM]: VALIDATION START')

        with torch.no_grad():
            for batch_idx, data in enumerate(validation_loader, 0):
                print('[PROGRAM]: VALIDATION batch ', batch_idx)
                ids = data['ids'].to(device, dtype = torch.long)
                mask = data['mask'].to(device, dtype = torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
                targets = data['targets'].to(device, dtype = torch.float)
                outputs = model(ids, mask, token_type_ids)
                loss = loss_fn(outputs, targets)
                valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
                val_targets.extend(targets.cpu().detach().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

            print('[PROGRAM]: VALIDATION END')
            # calculate average losses
            train_loss = train_loss/len(training_loader)
            valid_loss = valid_loss/len(validation_loader)
            print('[PROGRAM]: Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
                epoch,
                train_loss,
                valid_loss
                ))

            # create checkpoint variable and add important data
            checkpoint = {
                'epoch': epoch + 1,
                'valid_loss_min': valid_loss,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
            }

            # save checkpoint
            save_ckp(checkpoint, False, checkpoint_path, best_model_path)
            if valid_loss <= valid_loss_min:
                print('[PROGRAM]: Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
                # save checkpoint as best model
                save_ckp(checkpoint, True, checkpoint_path, best_model_path)
                valid_loss_min = valid_loss

        print('[PROGRAM]: Epoch {}  Done \n'.format(epoch))


    return model

[INFO]: starting training


In [32]:
############################### TRAIN FUNCS
# chckpoint and save funcs from here (joe)
# https://towardsdatascience.com/how-to-save-and-load-a-model-in-pytorch-with-a-complete-example-c2920e617dee

def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

def load_ckp(checkpoint_fpath, model, optimizer):
    # load check point
    # initialize state_dict from checkpoint to model
    # initialize optimizer from checkpoint to optimizer
    # initialize valid_loss_min from checkpoint to valid_loss_min
    # return model, optimizer, epoch value, min validation loss

    checkpoint = torch.load(checkpoint_fpath)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['valid_loss_min']
    return model, optimizer, checkpoint['epoch'], valid_loss_min

In [33]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased', return_dict=False)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 25)

    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)
model,_,_,_ = load_ckp("best_model.pt", model, optimizer)
model.to(device)
print("[INFO]: BERT finetuned model loaded from best checkpoint")
print("[INFO]: model loaded to device")



[INFO]: BERT finetuned model loaded from best checkpoint
[INFO]: model loaded to device


In [34]:
def score_avg(y_pred, y_test):
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')
    hamming = hamming_loss(y_test, y_pred)
    jacard = jaccard_score(y_test, y_pred, average='micro')

    print("[PROGRAM]: classifier -> BERT finetuned")
    print("[PROGRAM]: avg precision: {}".format(precision))
    print("[PROGRAM]: avg recall: {}".format(recall))
    print("[PROGRAM]: avg f1-score: {}".format(f1))
    print("[PROGRAM]: avg hamming loss: {}".format(hamming))
    print("[PROGRAM]: avg jacard score: {}".format(jacard))

    return [precision, recall, f1, hamming, jacard]

def score_per_tag(y_pred, y_test):
    hamming = []
    jaccard = []
    precision, recall, fscore, support = score(y_test, y_pred)
    for i, (test, pred) in enumerate(zip(y_test.T, y_pred.T)):
        hamming.append(hamming_loss(test, pred))
        jaccard.append(jaccard_score(test,pred))

    # DEBUG
    # print(len(precision))
    # print(len(recall))
    # print(len(fscore))
    # print(len(support))
    # print(len(hamming))
    # print(len(jaccard))
    # print(len(y_classes))

    return pd.DataFrame(data=[precision, recall, fscore, support, hamming, jaccard],
                         index=["Precision", "Recall", "F-1 score", "True count", "Hamming loss", "Jaccard score"],
                         columns=mlb.classes_)
     

In [35]:

################################ INFERENCE TEST-SET
model.eval()
y_test = []
y_pred = []
with torch.no_grad():
    for batch_idx, data in enumerate(test_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        outputs = model(ids, mask, token_type_ids)
        y_test.extend(targets.cpu().detach().numpy().tolist())
        y_pred.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
        print("[PROGRAM]: INFERENCE BATCH ", batch_idx," /446")

# applying hard map of probas into (0,1)
y_pred = (np.array(y_pred) > 0.5).astype(int)
y_pred = np.array(y_pred)
y_test = np.array(y_test)

[PROGRAM]: INFERENCE BATCH  0  /446
[PROGRAM]: INFERENCE BATCH  1  /446
[PROGRAM]: INFERENCE BATCH  2  /446
[PROGRAM]: INFERENCE BATCH  3  /446
[PROGRAM]: INFERENCE BATCH  4  /446
[PROGRAM]: INFERENCE BATCH  5  /446
[PROGRAM]: INFERENCE BATCH  6  /446
[PROGRAM]: INFERENCE BATCH  7  /446
[PROGRAM]: INFERENCE BATCH  8  /446
[PROGRAM]: INFERENCE BATCH  9  /446
[PROGRAM]: INFERENCE BATCH  10  /446
[PROGRAM]: INFERENCE BATCH  11  /446
[PROGRAM]: INFERENCE BATCH  12  /446
[PROGRAM]: INFERENCE BATCH  13  /446
[PROGRAM]: INFERENCE BATCH  14  /446
[PROGRAM]: INFERENCE BATCH  15  /446
[PROGRAM]: INFERENCE BATCH  16  /446
[PROGRAM]: INFERENCE BATCH  17  /446
[PROGRAM]: INFERENCE BATCH  18  /446
[PROGRAM]: INFERENCE BATCH  19  /446
[PROGRAM]: INFERENCE BATCH  20  /446
[PROGRAM]: INFERENCE BATCH  21  /446
[PROGRAM]: INFERENCE BATCH  22  /446
[PROGRAM]: INFERENCE BATCH  23  /446
[PROGRAM]: INFERENCE BATCH  24  /446
[PROGRAM]: INFERENCE BATCH  25  /446
[PROGRAM]: INFERENCE BATCH  26  /446
[PROGRAM]: 

In [36]:

################################# METRICS (micro-average)
print("[INFO]: computing micro-average metrics for all tags")
metrics_avg = score_avg(y_pred, y_test)
metrics_per_tag = score_per_tag(y_pred, y_test)

[INFO]: computing micro-average metrics for all tags
[PROGRAM]: classifier -> BERT finetuned
[PROGRAM]: avg precision: 0.8535921306868084
[PROGRAM]: avg recall: 0.7378530215608867
[PROGRAM]: avg f1-score: 0.7915139669354183
[PROGRAM]: avg hamming loss: 0.01953452880579931
[PROGRAM]: avg jacard score: 0.6549632724577128


In [37]:

################################ METRICS ON TOP TEN TAGS
top_ten_tags = ["javascript", "java", "c#", "php", "android", "jquery", "python", "html", "c++", "ios"]
print("[INFO]: computing top-ten tag metrics")
print(metrics_per_tag[top_ten_tags])
print("[INFO]: computing top-ten tag metrics averaged")
print(metrics_per_tag[top_ten_tags].apply(np.mean, axis=1))

[INFO]: computing top-ten tag metrics
                javascript         java           c#         php      android  \
Precision         0.829060     0.924860     0.823572    0.940741     0.964066   
Recall            0.727273     0.818825     0.811103    0.818035     0.909884   
F-1 score         0.774838     0.868619     0.817290    0.875108     0.936191   
True count     1067.000000  1413.000000  1387.000000  621.000000  1032.000000   
Hamming loss      0.043018     0.033384     0.047978    0.013831     0.012209   
Jaccard score     0.632437     0.767750     0.691032    0.777948     0.880037   

                   jquery      python        html         c++         ios  
Precision        0.880086    0.936916    0.535238    0.884024    0.779570  
Recall           0.740541    0.920781    0.643021    0.850797    0.786618  
F-1 score        0.804305    0.928778    0.584200    0.867092    0.783078  
True count     555.000000  871.000000  437.000000  878.000000  553.000000  
Hamming loss  