## Stackoverflow tag recommendation

In [1]:
import re
import time
import warnings; warnings.simplefilter('ignore')

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils import shuffle
from sklearn.utils.class_weight import compute_sample_weight

from model import ConvText

# Seed shared by the shuffle and the train/test split so reruns select the same rows.
random_state = 747

## Question Data

Stackoverflow question data downloaded from Google BigQuery and aggregated into a single csv file. 

In [None]:
# Path to the aggregated question dump, relative to this notebook.
file = '../../../Data/stackoverflow/questions.csv'
# Only the title (model input) and tags (labels) columns are read.
df = pd.read_csv(file, usecols=['title', 'tags'])
df.head(10)

Data size

In [None]:
# (rows, columns) of the loaded frame.
df.shape

The data is too big for training. We will only consider 1,000,000 randomly selected rows.

In [None]:
# Shuffle with the fixed seed so the retained rows are a reproducible random sample.
df = shuffle(df, random_state=random_state)
# Keep the first 1,000,000 rows of the shuffled frame.
df = df[:1000000]

### Clean title text

In [None]:
def clean_text(text):
    """Lower-case a title and blank out every character outside a-z, A-Z, 0-9, '#', '+', '-'.

    The input is passed through ``str`` first, so non-string values are
    cleaned as their string representation. Each disallowed character
    (including whitespace) is replaced by a single space; '#', '+' and '-'
    are kept.
    """
    lowered = str(text).lower()
    return re.sub(r"[^a-zA-Z0-9#+-]", " ", lowered)

# Overwrite the title column with its cleaned, lower-cased form.
df['title'] = df['title'].apply(clean_text)
df.head(10)

### Save for later use

In [None]:
# index=False keeps the shuffled row index out of the file; otherwise it is
# written as an unnamed first column and comes back as 'Unnamed: 0' on reload.
df.to_csv('../../../Data/stackoverflow/clean_questions.csv', index=False)

### Load data

In [None]:
# Read only the two columns the notebook uses, mirroring the raw load. This
# also ignores an index column if the file was saved with one.
df = pd.read_csv('../../../Data/stackoverflow/clean_questions.csv',
                 usecols=['title', 'tags'])
df.head(10)

In [None]:
# Number of missing values per column after the csv round trip.
df.isnull().sum()

### Save data as list and basic exploration

In [None]:
# read_csv turns fields such as 'nan' (what clean_text produces for a missing
# title) back into float NaN, and the tokenizers below call .split() on every
# entry. Replace missing values with an empty string so each element is a str.
questions = df['title'].fillna('').tolist()
tags = df['tags'].fillna('').tolist()

Find the number of words in the vocabulary

In [None]:
def tokenize_question(text):
    """Split a cleaned title into tokens on runs of whitespace."""
    return text.split()

# Total token count over all titles (with repeats).
print('The total number of words in the data is: ', sum(len(title.split()) for title in questions))

# Fitting the vectorizer only to learn the set of distinct tokens.
question_vect = CountVectorizer(tokenizer=tokenize_question)
question_vect.fit(questions)

print('The number of words in the vocabulary is: ', len(question_vect.vocabulary_))

Find the number of tags

In [None]:
def tokenize_tags(text):
    """Split a '|'-delimited tag string into its individual tags."""
    return text.split('|')

# fit() returns the vectorizer itself, so construction and fitting can be chained.
tags_vect = CountVectorizer(tokenizer=tokenize_tags).fit(tags)

print('The total number of tags is: ', len(tags_vect.vocabulary_))

#### Save tag label as vectorized tokens.

There are too many tags to predict. In our model we will only look at the top 100 tags and save the result

In [None]:
# Number of most frequent tags kept as prediction targets.
max_tags = 100

def tokenize_tags(text):
    """Split a '|'-delimited tag string into its individual tags."""
    return text.split('|')

# max_features restricts the vocabulary to the max_tags most frequent tags.
tags_vect = CountVectorizer(tokenizer=tokenize_tags, max_features=max_tags)
# From here on `tags` is the dense (questions x max_tags) count matrix,
# no longer the list of raw tag strings.
tags = tags_vect.fit_transform(tags)
tags = tags.toarray()
print('Number of tags: ', len(tags_vect.vocabulary_))

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
_tag_names = getattr(tags_vect, 'get_feature_names_out', None) or tags_vect.get_feature_names
tags_token = [str(name) for name in _tag_names()]
# Column sums give the number of occurrences of each retained tag.
tag_frequency = tags.sum(axis=0)
print('The list of tags with frequency is: ')
print(dict(zip(tags_token, tag_frequency)))

Histogram of number of tags in each question

In [None]:
# Row sums of the tag matrix: how many of the retained tags each question carries.
plt.hist(tags.sum(axis=1))
plt.xlabel('Number of tags for a question')
plt.ylabel('Number of questions')

### Split test and train data set

In [None]:
# Hold out 10% of the questions for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    questions,
    tags,
    test_size=0.1,
    random_state=random_state,
)

### Sample weight
Evaluate sample weight from y_train data

In [None]:
# Weights are meant for fitting, so they must be derived from (and have one
# entry per row of) the training labels, not the held-out test labels.
sample_weight = compute_sample_weight('balanced', y_train)
sample_weight

In [None]:
# Total of the computed sample weights.
np.sum(sample_weight)

### Evaluation metrics

In [None]:
def eval_metrics(y_test, y_predicted, print_metrics=True):
    """Score predictions against the true labels.

    Computes accuracy plus support-weighted precision, recall and F1, and
    optionally prints them on one line.

    Args:
        y_test: true labels.
        y_predicted: predicted labels, same layout as ``y_test``.
        print_metrics: when True, print the four scores to three decimals.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    accuracy = accuracy_score(y_test, y_predicted)
    # The three remaining scorers share the same call signature.
    precision, recall, f1 = (
        scorer(y_test, y_predicted, average='weighted')
        for scorer in (precision_score, recall_score, f1_score)
    )
    scores = (accuracy, precision, recall, f1)

    if print_metrics:
        print("accuracy: %.3f - precision: %.3f - recall: %.3f - f1: %.3f" % scores)
    return scores

# Simple first order model: Bag of words with logistic regression


### Bag of words embedding for questions

Remove common words and words that appear only rarely in the corpus.

In [None]:
def tokenize_question(text):
    """Split a cleaned title into tokens on runs of whitespace."""
    return text.split()

# English stop words are removed; min_df / max_df bound how rare or how
# common a term may be and still enter the vocabulary.
bag_vect = CountVectorizer(
    tokenizer=tokenize_question,
    stop_words='english',
    min_df=3,
    max_df=0.5,
)

# Vocabulary is learned on the training titles only and reused for the test titles.
X_train_bag = bag_vect.fit_transform(X_train)
X_test_bag = bag_vect.transform(X_test)
print('The number of words in the vocabulary is: ', len(bag_vect.vocabulary_))

### Logistic regression with binary relevance
Since this is a multi-label classification problem, we will use binary relevance on top of logistic regression. This treats each label as a separate binary classification.

In [None]:
%%time
# One independent logistic regression per tag column.
bag_log_clf = OneVsRestClassifier(LogisticRegression())
bag_log_clf.fit(X_train_bag, y_train)

# Predict both splits up front, then report them in train/test order.
y_train_bag_predict = bag_log_clf.predict(X_train_bag)
y_test_bag_predict = bag_log_clf.predict(X_test_bag)

print('Train score')
eval_metrics(y_train, y_train_bag_predict)

print('Test score')
eval_metrics(y_test, y_test_bag_predict)

### Important words

In [None]:
# Sum the per-tag coefficient vectors into one score per vocabulary term.
# np.vstack copies the rows, so the fitted estimators stay untouched. The
# earlier in-place `+=` on estimators_[0].coef_[0] (a view into the model)
# overwrote the first tag's coefficients with the running total.
coef_matrix = np.vstack([est.coef_[0] for est in bag_log_clf.estimators_])
importances = coef_matrix.sum(axis=0)
importances = importances/importances.sum()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
_feature_names = getattr(bag_vect, 'get_feature_names_out', None) or bag_vect.get_feature_names
feature_imps = {'importances':importances, 'feature':[str(name) for name in _feature_names()]}
feature_imps = pd.DataFrame(feature_imps)
feature_imps = feature_imps.sort_values('importances', ascending=False)
# Cumulative share is taken before rescaling so it is in units of the
# sum-normalised scores; afterwards importances are rescaled so the top term is 1.
feature_imps['cum_imp'] = feature_imps['importances'].cumsum()
feature_imps['importances'] = feature_imps['importances']/feature_imps['importances'].max()
feature_imps = feature_imps.reset_index(drop=True)
feature_imps['no_features'] = feature_imps.index + 1
feature_imps[['feature', 'importances', 'cum_imp']].head(20)

## TFIDF with logistic regression and binary relevance

### TFIDF on bag of words embedding for questions

In [None]:
def tokenize_question(text):
    """Split a cleaned title into tokens on runs of whitespace."""
    return text.split()

# Same filtering idea as the bag-of-words model, with TF-IDF weighting.
tfidf_vect = TfidfVectorizer(
    tokenizer=tokenize_question,
    stop_words='english',
    min_df=4,
    max_df=0.5,
)

# Vocabulary and IDF weights come from the training titles only.
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)
print('The number of words in the vocabulary is: ', len(tfidf_vect.vocabulary_))

### Logistic regression with binary relevance

In [None]:
%%time
# One independent logistic regression per tag column, on TF-IDF features.
tfidf_log_clf = OneVsRestClassifier(LogisticRegression())
tfidf_log_clf.fit(X_train_tfidf, y_train)

# Predict both splits up front, then report them in train/test order.
y_train_tfidf_predict = tfidf_log_clf.predict(X_train_tfidf)
y_test_tfidf_predict = tfidf_log_clf.predict(X_test_tfidf)

print('Train score')
eval_metrics(y_train, y_train_tfidf_predict)

print('Test score')
eval_metrics(y_test, y_test_tfidf_predict)

### Importance words

In [None]:
# Sum the per-tag coefficient vectors into one score per vocabulary term.
# np.vstack copies the rows, so the fitted estimators stay untouched. The
# earlier in-place `+=` on estimators_[0].coef_[0] (a view into the model)
# overwrote the first tag's coefficients with the running total.
coef_matrix = np.vstack([est.coef_[0] for est in tfidf_log_clf.estimators_])
importances = coef_matrix.sum(axis=0)
importances = importances/importances.sum()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
_feature_names = getattr(tfidf_vect, 'get_feature_names_out', None) or tfidf_vect.get_feature_names
feature_imps = {'importances':importances, 'feature':[str(name) for name in _feature_names()]}
feature_imps = pd.DataFrame(feature_imps)
feature_imps = feature_imps.sort_values('importances', ascending=False)
# Cumulative share is taken before rescaling so it is in units of the
# sum-normalised scores; afterwards importances are rescaled so the top term is 1.
feature_imps['cum_imp'] = feature_imps['importances'].cumsum()
feature_imps['importances'] = feature_imps['importances']/feature_imps['importances'].max()
feature_imps = feature_imps.reset_index(drop=True)
feature_imps['no_features'] = feature_imps.index + 1
feature_imps[['feature', 'importances', 'cum_imp']].head(20)

## TFIDF with Naive Bayes Classifier and binary relevance

### TFIDF on bag of words embedding for questions

In [None]:
def tokenize_question(text):
    """Split a cleaned title into tokens on runs of whitespace."""
    return text.split()

# Rebuilds the TF-IDF features with the same settings used for the
# logistic-regression model.
tfidf_vect = TfidfVectorizer(
    tokenizer=tokenize_question,
    stop_words='english',
    min_df=4,
    max_df=0.5,
)

X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)
print('The number of words in the vocabulary is: ', len(tfidf_vect.vocabulary_))

### Naive Bayes with binary relevance

In [None]:
%%time
# GaussianNB only accepts dense arrays and fails on the sparse TF-IDF matrix,
# which is far too large to densify. MultinomialNB is the naive Bayes variant
# that fits sparse, non-negative term weights directly.
# The variable name is kept (it rebinds the logistic model) because the
# importance cell below reads it.
tfidf_log_clf = OneVsRestClassifier(MultinomialNB())

tfidf_log_clf.fit(X_train_tfidf, y_train)

print('Train score')
y_train_tfidf_predict = tfidf_log_clf.predict(X_train_tfidf)
eval_metrics(y_train, y_train_tfidf_predict)

print('Test score')
y_test_tfidf_predict = tfidf_log_clf.predict(X_test_tfidf)
eval_metrics(y_test, y_test_tfidf_predict)

### Importance Words

In [None]:
# NOTE(review): this cell reads `coef_` from each fitted estimator; not every
# naive Bayes estimator / scikit-learn version exposes that attribute - confirm
# for the classifier fitted above.
# Sum the per-tag vectors into one score per vocabulary term. np.vstack copies
# the rows, so the fitted estimators stay untouched (the earlier in-place `+=`
# on estimators_[0].coef_[0] accumulated into the first estimator's own array).
coef_matrix = np.vstack([est.coef_[0] for est in tfidf_log_clf.estimators_])
importances = coef_matrix.sum(axis=0)
importances = importances/importances.sum()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
_feature_names = getattr(tfidf_vect, 'get_feature_names_out', None) or tfidf_vect.get_feature_names
feature_imps = {'importances':importances, 'feature':[str(name) for name in _feature_names()]}
feature_imps = pd.DataFrame(feature_imps)
feature_imps = feature_imps.sort_values('importances', ascending=False)
# Cumulative share is taken before rescaling so it is in units of the
# sum-normalised scores; afterwards importances are rescaled so the top term is 1.
feature_imps['cum_imp'] = feature_imps['importances'].cumsum()
feature_imps['importances'] = feature_imps['importances']/feature_imps['importances'].max()
feature_imps = feature_imps.reset_index(drop=True)
feature_imps['no_features'] = feature_imps.index + 1
feature_imps[['feature', 'importances', 'cum_imp']].head(20)

# Convolutional sentence classification model

## Load Word2Vec

In [None]:
import gensim

# Pretrained vectors in word2vec binary format (gzip-compressed), loaded as read-only KeyedVectors.
word2vec_path = '../../../Data/stackoverflow/GoogleNews-vectors-negative300.bin.gz'
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

### Sequence length

In [None]:
# NOTE(review): X_train holds the title strings, so len(x) here is the number
# of characters per title, not the number of tokens - confirm this is intended.
plt.hist([len(x) for x in X_train], bins=50)
plt.xlabel('Length of sequence')
plt.ylabel('number')

### Hyperparameters related to sentence creation

In [None]:
# Length every token sequence is padded/truncated to.
max_seq_length = 35
# Upper bound on the token indices kept by the tokenizer and embedded.
vocab_size = 100000
# Width of each embedding row; 300 matches the '...negative300' word2vec file loaded above.
embedding_size = 300

### Text preprocessing

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Titles are already cleaned, so no extra character filtering is applied.
tokenizer = Tokenizer(lower=True, split=' ', filters='')
tokenizer.fit_on_texts(X_train)
# Cap applied after fitting: it limits which indices texts_to_sequences emits.
tokenizer.num_words = vocab_size

seq_train = tokenizer.texts_to_sequences(X_train)
seq_test = tokenizer.texts_to_sequences(X_test)

# Token count per training title, kept for the histogram in the next cell.
seq_length = [len(x) for x in seq_train]

# Pad/truncate to the configured max_seq_length. This cell used to rebind
# max_seq_length to max(seq_length), but later cells reset it to 35 and build
# the network with that input width, so the padded arrays must use it too.
seq_train = pad_sequences(seq_train, maxlen=max_seq_length, padding='post', truncating='post')
seq_test = pad_sequences(seq_test, maxlen=max_seq_length, padding='post', truncating='post')

# Seeded generator so the random rows for out-of-word2vec tokens are reproducible.
embedding_rng = np.random.RandomState(random_state)

# Row 0 stays all-zero; it is the value pad_sequences fills with.
embedding_weights = np.zeros((vocab_size+1, embedding_size))
word2vec_counter, randword_counter = 0, 0
for word, index in tokenizer.word_index.items():
    if index > vocab_size:
        break

    if word in word2vec:
        # Known word: start from its pretrained vector.
        embedding_weights[index,:] = word2vec[word]
        word2vec_counter += 1
    else:
        # Unknown word: small random initialisation.
        embedding_weights[index,:] = embedding_rng.normal(0, 0.15, embedding_size)
        randword_counter += 1

print(embedding_weights.shape)

In [None]:
# Distribution of token counts per training title (before padding).
plt.hist(seq_length, bins=50)
plt.xlabel('Length of sequence')
plt.ylabel('number')

In [2]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense, Input, Flatten
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, concatenate, Dropout
from tensorflow.keras.models import Model

In [7]:
# All-zero placeholder used only to sketch and summarise the architecture.
# It has its own name so the word2vec-initialised `embedding_weights` built
# earlier (and passed to ConvText later) is not overwritten.
sketch_embedding_weights = np.zeros((35000+1, 300))

max_seq_length=35
embedding_layer = Embedding(sketch_embedding_weights.shape[0],
                            sketch_embedding_weights.shape[1],
                            weights=[sketch_embedding_weights],
                            input_length=max_seq_length,
                            trainable=True,
                            name='Embedding_lookup')

sequence_input = Input(shape=(max_seq_length,), dtype='int32', name='Question_Sequence')
embedded_sequences = embedding_layer(sequence_input)

# One parallel convolution branch per filter width over the embedded tokens.
convs = []
filter_sizes = [3,4,5]
for filter_size in filter_sizes:
    l_conv = Conv1D(32, filter_size, activation='relu', strides=1,
                    name='layer1_filtsz_%d_stride_1' % filter_size)(embedded_sequences)
    convs.append(l_conv)

# Branch outputs are joined along axis 1.
l_merge = concatenate(convs, axis=1, name='layer1_merge')

l_cov1= Conv1D(32, 5, activation='relu',strides=2, name='layer2_filtsz_5_stride_2')(l_merge)

l_flat = Flatten(name='layer3')(l_cov1)
l_dense = Dense(250, activation='relu',name='layer3_dense')(l_flat)
# A question can carry several tags at once, so each of the 100 outputs is an
# independent sigmoid trained with binary cross-entropy. Softmax with
# categorical cross-entropy would force the tag scores to compete and sum to 1.
preds = Dense(100, activation='sigmoid',name='logit_dense_sigmoid')(l_dense)


model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("model fitting - simplified convolutional neural network")
model.summary()

model fitting - simplified convolutional neural network
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Question_Sequence (InputLayer)  (None, 35)           0                                            
__________________________________________________________________________________________________
Embedding_lookup (Embedding)    (None, 35, 300)      10500300    Question_Sequence[0][0]          
__________________________________________________________________________________________________
layer1_filtsz_3_stride_1 (Conv1 (None, 33, 32)       28832       Embedding_lookup[0][0]           
__________________________________________________________________________________________________
layer1_filtsz_4_stride_1 (Conv1 (None, 32, 32)       38432       Embedding_lookup[0][0]           
_____________________________________________________

In [8]:
import tensorflow as tf
# Render the layer graph, with output shapes and layer names, to model.png.
tf.keras.utils.plot_model(
    model,
    to_file='model.png',
    show_shapes=True,
    show_layer_names=True
)

In [None]:
# Element type of the training label matrix.
y_train.dtype

In [None]:
# NOTE(review): this value is not displayed because it is not the last
# expression in the cell.
np.max(tags)
# Re-state the sequence/embedding hyperparameters before building the model.
max_seq_length = 35
vocab_size = 100000
embedding_size = 300

In [None]:
from model import ConvText
# ConvText comes from the project-local `model` module; it is given the padded
# sequence length, the number of tags and the initial embedding matrix.
conv_model = ConvText(max_seq_length, max_tags, embedding_weights)

In [None]:
# Train on the first 200,000 training rows only; the full test split is passed as validation data.
conv_model.fit(x=seq_train[:200000,:], y=y_train[:200000,:], batch_size=100, val_x=seq_test, val_y=y_test, epochs=3)

In [None]:
# Element type of the training label matrix.
y_train.dtype

In [None]:
from model2 import ConvText
# Here ConvText is the class imported from `model2` just above, which shadows
# the earlier import from `model`. Same arguments and training subset as before.
conv_model = ConvText(max_seq_length, max_tags, embedding_weights)
conv_model.fit(x=seq_train[:200000,:], y=y_train[:200000,:], batch_size=100, val_x=seq_test, val_y=y_test, epochs=3)