# **StackOverflow Top10 tags**

The aim is here to build a model to predict the tag associated with the text. (Here we will build a model for the top 10 tags only)

In [0]:
#Intially we will import the dataset into the Notebook..

In [0]:
!pip install -U -q PyDrive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
downloaded= drive.CreateFile({'id': '1h4Xd0WgRd14Nv0PRVypkqOt11hNteCnS'})
downloaded.GetContentFile('stacksample.zip')

In [7]:
!unzip stacksample.zip

Archive:  stacksample.zip
  inflating: Answers.csv             
  inflating: Questions.csv           
  inflating: Tags.csv                


In [0]:
#Importing required packages..
import numpy as np
import pandas as pd

In [0]:
questions_data= pd.read_csv('Questions.csv', encoding='iso-8859-1')
tags_data= pd.read_csv('Tags.csv', encoding='iso-8859-1')

In [0]:
#filtering the top_10 most associated tags from the entire dataset
top_10= list(tags_data['Tag'].value_counts().head(10).index)

In [4]:
top_10

['javascript',
 'java',
 'c#',
 'php',
 'android',
 'jquery',
 'python',
 'html',
 'c++',
 'ios']

In [0]:
tags_data= tags_data[tags_data['Tag'].isin(top_10)]

In [0]:
questions= pd.merge(tags_data, questions_data, how='inner', on=['Id'])

# **Data Preprocessing**

In [0]:
#Now, I'll take only quesions with a score greater than 5. I'm doing that for 2 reasons:
  #1- I'll require less computational resources.
  #2- The posts will probably be with a better quality and will be better tagged since they have lots of upvotes.

In [8]:
questions.shape

(826739, 8)

In [0]:
questions= questions[questions['Score']>5]

In [10]:
questions.shape

(43800, 8)

In [11]:
questions.head(3) #Here we see that [Id, OwnerUserId, CreationDate, ClosedDate, Score] will have no impact on predicting the tags;
                  #hence we will not consider those for prediction.

Unnamed: 0,Id,Tag,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,260,c#,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...
1,330,c++,63.0,2008-08-02T02:51:36Z,,29,Should I use nested classes in this case?,<p>I am working on a collection of classes use...
2,650,c#,143.0,2008-08-03T11:12:52Z,,79,Automatically update version number,<p>I would like the version property of my app...


In [0]:
questions= questions[['Tag','Title','Body']]

In [13]:
questions.head(3)

Unnamed: 0,Tag,Title,Body
0,c#,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...
1,c++,Should I use nested classes in this case?,<p>I am working on a collection of classes use...
2,c#,Automatically update version number,<p>I would like the version property of my app...


In [14]:
#We will check whether we have any null values in the data set
questions.isnull().mean(axis=0) #No null values

Tag      0.0
Title    0.0
Body     0.0
dtype: float64

In [15]:
#We will check whether we have any duplicate values in the data set
questions.duplicated().sum() #No duplicate values

0

In [0]:
# #In the next two columns: Body and Title, I'll use lots of text processing:
#     Removing html format
#     Lowering text
#     Transforming abbreviations
#     Removing punctuation (but keeping words like c# since it's the most popular tag)
#     Lemmatizing words
#     Removing stop words

In [0]:
#Importing the required package.
import bs4
from bs4 import BeautifulSoup
import re

In [0]:
#Converting html to text in the BODY column.
questions['Body']= questions['Body'].apply(lambda x: bs4.BeautifulSoup(x).get_text())

In [0]:
#Converting html to text in the TITLE column.
questions['Title']= questions['Title'].apply(lambda x: bs4.BeautifulSoup(x).get_text())
questions['Title']= questions['Title'].apply(lambda x: str(x))

In [0]:
#Defining a function to clean the texts.
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r"\'\n", " ", text)
    text = re.sub(r"\'\xa0", " ", text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

In [0]:
#Cleaning the texts in the BODY column.
questions['Body']= questions['Body'].apply(lambda x: clean_text(x))

In [0]:
#Cleaning the texts in the TITLE column.
questions['Title']= questions['Title'].apply(lambda x: clean_text(x))

In [23]:
questions.head(3)

Unnamed: 0,Tag,Title,Body
0,c#,adding scripting functionality to .net applica...,i have a little game written in c#. it uses a ...
1,c++,should i use nested classes in this case?,i am working on a collection of classes used f...
2,c#,automatically update version number,i would like the version property of my applic...


In [0]:
#punct= '!"$%&\'()*,./:;<=>?@[\\]^_`{|}~'

In [0]:
#Importing the required package
#import nltk
#from nltk import tokenize

In [0]:
#token= tokenize.ToktokTokenizer()

In [0]:
# def strip_list_noempty(mylist):
#   newlist= (item.strip() if hasattr(item, 'strip') else item for item in mylist)
#   return [item for item in newlist if item !='']

In [0]:
# def clean_punct(text):
#   words= token.tokenize(text)
#   punctuation_filtered=[]
#   regex= re.compile('[%s]' % re.escape(punct))
#   remove_punctuation= str.maketrans(' ', ' ', punct)

#   for w in words:
#     if w in top_10 :
#       punctuation_filtered.append(w)
#     else:
#       punctuation_filtered.append(regex.sub('',w))
#   filtered_list= strip_list_noempty(punctuation_filtered)
#   return ' '.join(map(str, filtered_list))

In [0]:
#Cleaning the punctuations from the BODY column.
#questions['Body']= questions['Body'].apply(lambda x: clean_punct(x))

In [0]:
#Cleaning the punctuations from the TITLE column.
#questions['Title']= questions['Title'].apply(lambda x: clean_punct(x))

In [0]:
#Validation of text in the BODY post cleaning...
#questions['Body'][1]

In [0]:
#Validation of the text in the TITLE post cleaning...
#questions['Title'][1]

# **Model Builing**

In [0]:
#Importing the required packages.
import nltk
nltk.download('punkt')
from nltk import word_tokenize

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, BatchNormalization, GRU, concatenate
from keras.models import Model

In [0]:
questions_train, questions_test= train_test_split(questions, test_size=0.2, random_state=100)

In [0]:
x_train_t= questions_train['Title']
x_train_b= questions_train['Body']
y_train= pd.get_dummies(questions_train['Tag'])

x_test_t= questions_test['Title']
x_test_b= questions_test['Body']
y_test= pd.get_dummies(questions_test['Tag'])

***For Title...***

In [36]:
sent_lens_t=[]
for sent in questions_train['Title']:
  sent_lens_t.append(len(word_tokenize(sent)))

max(sent_lens_t)

39

In [37]:
np.quantile(sent_lens_t,0.95)

17.0

In [0]:
#As we see that 95& of the word has lenth of 17, hence we will set the max length to 17
max_len_t=17

In [0]:
tok_t= Tokenizer(char_level=False, split=' ')
tok_t.fit_on_texts(x_train_t)
sequences_train_t= tok_t.texts_to_sequences(x_train_t)
sequences_test_t= tok_t. texts_to_sequences(x_test_t) #Applying same for Test data

In [65]:
vocab_len_t= len(tok_t.index_word.keys()) #verifying vocabulory length.
vocab_len_t

15812

In [0]:
sequences_matrix_train_t= sequence.pad_sequences(sequences_train_t, maxlen= max_len_t)
sequences_matrix_test_t= sequence.pad_sequences(sequences_test_t, maxlen= max_len_t) #Applying the same for Test data.

In [67]:
sequences_matrix_train_t

array([[   0,    0,    0, ...,  100,    2,   51],
       [   0,    0,    0, ...,   30,  162,   48],
       [   0,    0,    0, ...,  230,    1,  859],
       ...,
       [   0,    0,    0, ..., 2535,   29,  379],
       [   0,    0,    0, ...,   64,   14,   98],
       [   0,    0,    0, ...,    5,  386,  250]], dtype=int32)

In [68]:
sequences_matrix_test_t

array([[   0,    0,    0, ...,  873,   42, 1104],
       [ 137,   39,  581, ...,  902,  581, 1257],
       [   0,    0,    0, ...,  226,    8,   84],
       ...,
       [   0,    0,    0, ...,  560,    2,   50],
       [   0,    0,    0, ...,   59,    9,  159],
       [   0,    0,    0, ...,   17,   22, 3800]], dtype=int32)

***For Body...***

In [69]:
sent_lens_b=[]
for sent in questions_train['Body']:
  sent_lens_b.append(len(word_tokenize(sent)))

max(sent_lens_b)

10972

In [71]:
np.quantile(sent_lens_b, 0.92)

495.0

In [0]:
#As we see that 92& of the word has length of 495, hence we will set the max length to 500
max_len_b=500

In [0]:
tok_b= Tokenizer(char_level=False, split=' ')
tok_b.fit_on_texts(x_train_b)
sequences_train_b= tok_b.texts_to_sequences(x_train_b)
sequences_test_b= tok_b.texts_to_sequences(x_test_b)  #Applying same for Test data

In [74]:
vocab_len_b= len(tok_b.index_word.keys())      #verifying vocabulory length.
vocab_len_b

149531

In [0]:
sequences_matrix_train_b= sequence.pad_sequences(sequences_train_b, maxlen= max_len_b)
sequences_matrix_test_b= sequence.pad_sequences(sequences_test_b, maxlen= max_len_b)     #Applying the same for Test data.

In [76]:
sequences_matrix_train_b

array([[    0,     0,     0, ...,    12,  1710,    10],
       [    0,     0,     0, ...,   532,     8,   305],
       [    0,     0,     0, ...,     3,    57, 87314],
       ...,
       [    0,     0,     0, ...,   343,   861,    33],
       [    0,     0,     0, ...,  5746,    94,   178],
       [    0,     0,     0, ...,    36,   236,   152]], dtype=int32)

In [77]:
sequences_matrix_test_b

array([[    0,     0,     0, ...,    20,     6,  2033],
       [    0,     0,     0, ...,   710,     9,   433],
       [    0,     0,     0, ...,     1,   247,   381],
       ...,
       [    0,     0,     0, ...,   208, 10069,   831],
       [    0,     0,     0, ...,    90,   243,   293],
       [   14,   227,   139, ...,  1665,     3,   254]], dtype=int32)

Verifying the shapes of the matrices...

In [78]:
sequences_matrix_train_t.shape, sequences_matrix_train_b.shape, y_train.shape

((35040, 17), (35040, 500), (35040, 10))

In [79]:
sequences_matrix_test_t.shape, sequences_matrix_test_b.shape, y_test.shape

((8760, 17), (8760, 500), (8760, 10))

In [80]:
print(max_len_t)
print(max_len_b)

17
500


In [0]:
#Downloading the Embedding to use the pre-trained weights.. 
!wget http://nlp.stanford.edu/data/glove.twitter.27B.zip

In [63]:
#unzipping it..
!unzip glove.twitter.27B.zip

Archive:  glove.twitter.27B.zip
  inflating: glove.twitter.27B.25d.txt  
  inflating: glove.twitter.27B.50d.txt  
  inflating: glove.twitter.27B.100d.txt  
  inflating: glove.twitter.27B.200d.txt  


In [0]:
embeddings_index= {}
f= open('glove.twitter.27B.200d.txt')
for line in f:
  values= line.split()
  word= values[0]
  coefs= np.asarray(values[1:], dtype='float32')
  embeddings_index[word]= coefs
f.close()

In [0]:
embedding_matrix_t= np.zeros((len(tok_t.word_index)+1,200))
for word, i in tok_t.word_index.items():
  embedding_vector= embeddings_index.get(word)
  if embedding_vector is not None:
    #words not found in embedding index will all be set to zeros.
    embedding_matrix_t[i]= embedding_vector

In [0]:
embedding_matrix_b= np.zeros((len(tok_b.word_index)+1,200))
for word, i in tok_b.word_index.items():
  embedding_vector= embeddings_index.get(word)
  if embedding_vector is not None:
    #words not found in embedding index will all be set to zeros.
    embedding_matrix_b[i]= embedding_vector

# **Model Creation**

In [0]:
def RNN():
  #Title only...
  title_input= Input(shape=[max_len_t], name='title_input')
  title_Embed= Embedding(vocab_len_t+1, 200, weights=[embedding_matrix_t], trainable=False, input_length= max_len_t, mask_zero=True, name='title_Embed') (title_input)
  lstm_out_t= LSTM(100) (title_Embed)
  #Auxiliary output to tune LSTM weights smoothly...
  auxiliary_output= Dense(10, activation= 'sigmoid', name= 'auxiliary_output') (lstm_out_t)

  #Body only...
  body_input= Input(shape=[max_len_b], name='body_input')
  body_Embed= Embedding(vocab_len_b+1, 200, weights=[embedding_matrix_b], trainable=False, input_length= max_len_b, mask_zero=True, name='body_Embed') (body_input)
  lstm_out_b= LSTM(100) (body_Embed)
  
  #Combined with LSTM output...
  com= concatenate([lstm_out_t, lstm_out_b])

  #Now combined data is being fed to dense layers...
  dense1= Dense(50, activation='relu') (com)
  dp1= Dropout(0.3) (dense1)
  bn= BatchNormalization() (dp1)
  dense2= Dense(30, activation='relu') (bn)  
  main_output= Dense(10, activation='sigmoid', name='main_output') (dense2)

  model= Model(inputs=[title_input, body_input], outputs=[main_output, auxiliary_output])
  return model

In [87]:
model= RNN()
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title_input (InputLayer)        (None, 17)           0                                            
__________________________________________________________________________________________________
body_input (InputLayer)         (None, 500)          0                                            
__________________________________________________________________________________________________
title_Embed (Embedding)         (None, 17, 200)      3162600     title_input[0][0]                
__________________________________________________________________________________________________
body_Embed (Embedding)          (None, 500, 200)     29906400    body_input[0][0]                 
____________________________________________________________________________________________

In [0]:
model.compile(optimizer='adam', loss={'main_output':'categorical_crossentropy', 'auxiliary_output': 'categorical_crossentropy'}, metrics=['accuracy'])

**Defining ModelCheckpoint to save our model every 3 epoch...**

In [0]:
from keras.callbacks import ModelCheckpoint
import os

output_folder= './stackoverflow_output'
if not os.path.exists(output_folder):
  os.makedirs(output_folder)

filepath= output_folder+"/weights-{epoch:02d}-accuracy-{main_output_acc:.4f}.h5"
checkpoint= ModelCheckpoint(filepath, verbose=1, monitor='main_output_acc',
                            save_best_only=False,
                            save_weights_only=True,
                            mode='auto', period=10) #This will save the weights every 10 epochs

In [90]:
results= model.fit({'title_input': sequences_matrix_train_t, 'body_input': sequences_matrix_train_b},
                   {'main_output': y_train, 'auxiliary_output': y_train},
                   validation_data= [{'title_input': sequences_matrix_test_t, 'body_input': sequences_matrix_test_b},
                                     {'main_output': y_test, 'auxiliary_output': y_test}],
                   epochs= 80, batch_size= 1000, callbacks=[checkpoint]
                   )



KeyboardInterrupt: ignored

In [0]:
#Now load the saved model for prediction..
model.load_weights('/content/stackoverflow_output/weights-30-accuracy-0.8252.h5')

In [93]:
#Predicting the model..
(predict_main, predict_auxiliary)= model.predict({'title_input': sequences_matrix_test_t, 'body_input': sequences_matrix_test_b}, verbose=1)



In [0]:
#Importing required packages...
from sklearn.metrics import classification_report, f1_score

In [99]:
print(f1_score(y_test, predict_main>0.55, average='weighted'))

0.6097369926162607


In [100]:
print(classification_report(y_test, predict_main>0.55))

              precision    recall  f1-score   support

           0       0.85      0.80      0.83      1014
           1       0.97      0.21      0.34      1420
           2       0.87      0.66      0.75       891
           3       0.55      0.37      0.44       429
           4       0.88      0.66      0.75       535
           5       0.96      0.42      0.58      1317
           6       0.66      0.38      0.48      1140
           7       0.62      0.60      0.61       505
           8       0.91      0.61      0.73       580
           9       0.97      0.62      0.76       929

   micro avg       0.83      0.51      0.63      8760
   macro avg       0.83      0.53      0.63      8760
weighted avg       0.86      0.51      0.61      8760
 samples avg       0.50      0.51      0.50      8760



  _warn_prf(average, modifier, msg_start, len(result))


In [101]:
predict_main[24].round(decimals=2)

array([0.02, 0.  , 0.  , 0.  , 0.91, 0.  , 0.01, 0.  , 0.  , 0.  ],
      dtype=float32)

In [102]:
top_10

['javascript',
 'java',
 'c#',
 'php',
 'android',
 'jquery',
 'python',
 'html',
 'c++',
 'ios']

In [108]:
predict_main[].round(decimals=2)

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.01, 0.99],
      dtype=float32)

In [0]:
#Saving the model..
model.save('./stackoverflow_tags_model.h5')