In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
import spacy
from tqdm import tqdm # it is handly library which provide percentage progress bar while executing for loops
import matplotlib.pyplot as plt

%matplotlib inline
pd.set_option('display.max_colwidth', 200)

In [3]:
df_questions = pd.read_hdf('auto_tagging_data_v2.h5')

In [4]:
df_questions['Text'] = df_questions['Title'] + " " + df_questions['Body']

In [5]:
def clean_text(text):
    
    text = re.sub(r'<.*?>', '', text) # this removes html tags and url links in the text
    
    text = re.sub("[^a-zA-Z]"," ",text) # this will remove everything except alphabets
    
    text = ' '.join(text.split()) # # this will remove extra or white spaces in the text
    
    return text

In [6]:
df_questions['Text'] = df_questions['Text'].apply(lambda x: clean_text(x))

In [7]:
df_questions['Text'] = df_questions['Text'].str.lower()

In [8]:
df_questions[['Id','Text','Tags']].sample(5)

Unnamed: 0,Id,Text,Tags
46536,11315,how does the distribution of the error term affect the distribution of the response so when i assume that the error terms are normally distributed in a linear regression what does it mean for the ...,"[regression, distributions]"
25706,48063,distribution expected length of the shortest path in infinite random geometric graphs consider an infinite random geometric graph g rho d in which vertices are uniformly and independently scattere...,"[probability, stochastic-processes, pdf]"
50808,105171,confidence intervals for predictors in multivariate logistic regression i ve got a question i am dealing with medical data which contain predictors and binary outcome when i try to classify the da...,"[regression, logistic, confidence-interval]"
33445,152369,does rank of observation matrix tell anything useful when applying machine learning suppose i have an observation matrix of size n times m where n is the number of samples and m is the number of v...,[machine-learning]
39207,141400,how many people initially had apples story problem assume apples are distributed across x unknown people where each person has at least one apple for each apple a biased coin is flipped to see if ...,"[estimation, sampling, nonparametric]"


### Encode Text to Numbers

In [9]:
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences

# Without running in the beginning, writing at the later stage is throwing error

In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_questions['Text']) # It tokenizes the text as well as keeps the index of the resultant tokens

                                             # This is really useful as it helps in accessing the words easily

In [11]:
# check unique words count
len(tokenizer.word_index)

81956

### Converting the text into sequence of integers

In [12]:
sequences = tokenizer.texts_to_sequences(df_questions['Text'])

In [13]:
# Let us see few sample sentences and their corresponding sequence integers

i = 0 # index
print(df_questions['Text'][i], '\n'), print(sequences[i])

the two cultures statistics vs machine learning last year i read a blog post from brendan o connor entitled statistics vs machine learning fight that discussed some of the differences between the two fields andrew gelman responded favorably to this simon blomberg from r s fortunes package to paraphrase provocatively machine learning is statistics minus any checking of models and assumptions brian d ripley about the difference between machine learning and statistics user vienna may season s greetings andrew gelman in that case maybe we should get rid of checking of models and assumptions more often then maybe we d be able to solve some of the problems that the machine learning people can solve but we can t there was also the statistical modeling the two cultures paper by leo breiman in which argued that statisticians rely too heavily on data modeling and that machine learning techniques are making progress by instead relying on the predictive accuracy of models has the statistics field 

(None, None)

In [14]:
i = 3 # index
print(df_questions['Text'][i], '\n'), print(sequences[i])

what is the meaning of p values and t values in statistical tests after taking a statistics course and then trying to help fellow students i noticed one subject that inspires much head desk banging is interpreting the results of statistical hypothesis tests it seems that students easily learn how to perform the calculations required by a given test but get hung up on interpreting the results many computerized tools report test results in terms of p values or t values how would you explain the following points to college students taking their first course in statistics what does a p value mean in relation to the hypothesis being tested are there cases when one should be looking for a high p value or a low p value what is the relationship between a p value and a t value 

[39, 6, 1, 995, 5, 32, 71, 7, 17, 71, 8, 235, 267, 209, 671, 3, 255, 555, 7, 81, 158, 4, 162, 8771, 863, 2, 1786, 50, 302, 11, 31537, 237, 1114, 12413, 15216, 6, 1182, 1, 133, 5, 235, 260, 267, 16, 218, 11, 863, 1209, 6

(None, None)

We can see the integers corresponding to the text of each

In [15]:
# padding : Making sequences of same length

# The model will accept the inputs of the same size i.e. the integer sequences derive from text need to be of same size or length


In [16]:
# Let us check the distribution of sequences

# Finding right maximum length for the sequences

seq_lengths = []

for i in sequences:
    seq_lengths.append(len(i))

In [17]:
print("30th percentile: ", pd.Series(seq_lengths).quantile(0.3))
print("40th percentile: ", pd.Series(seq_lengths).quantile(0.4))
print("50th percentile: ", pd.Series(seq_lengths).quantile(0.5))
print("60th percentile: ", pd.Series(seq_lengths).quantile(0.6))
print("70th percentile: ", pd.Series(seq_lengths).quantile(0.7))
print("80th percentile: ", pd.Series(seq_lengths).quantile(0.8))
print("90th percentile: ", pd.Series(seq_lengths).quantile(0.9))
print("95th percentile: ", pd.Series(seq_lengths).quantile(0.95))
print("99th percentile: ", pd.Series(seq_lengths).quantile(0.99))

30th percentile:  97.0
40th percentile:  116.0
50th percentile:  137.0
60th percentile:  162.0
70th percentile:  193.0
80th percentile:  238.0
90th percentile:  320.0
95th percentile:  411.0
99th percentile:  678.0


Half of the sequences at 50th percentile are less than 124 and half of them are larger than it.

Study well --> Bit confusing

Max_length is a hyper parameter. Having less number leads to loss of information and takes less time to train the model. Higher value for the max length leads to less loss of information and time confusing due to padding with zeros. Hence it is a hyper paramter to tune.

In [18]:
max_length = 125

# padding
padded_seq = pad_sequences(sequences, maxlen=max_length)

### Reshape Target Variable

In [19]:
from sklearn.preprocessing import MultiLabelBinarizer

multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df_questions['Tags'])
y = multilabel_binarizer.transform(df_questions['Tags'])

In [20]:
padded_seq.shape, y.shape

((76365, 125), (76365, 100))

In [21]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(padded_seq, y, 
                                                    test_size=0.2, 
                                                    random_state=9)

### Model Building

In [22]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, GlobalMaxPool1D, Dropout, Conv1D
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Sequential - Model Building
# load_model - laading and save model to memory
# Earlystoppin & ModelCheckpoint - To save time while training model

In the Spam-Ham classification, we have used functional API of Keras here we are going to use Sequential API to buid the multilabel classification model.

In [23]:
model = Sequential() 

# this layer is useful to create word vectors. This job is similar to word2vec or Glove
# len(tokenizer.word_index)+1 ==> size of the vocabulary
# the additional 1 is for the integer we get by the zeros used for padding
# 128 ==> size of the embedding which means the size of the word vectors

model.add(Embedding(len(tokenizer.word_index)+1, 128, input_length = max_length)) 

model.add(Dropout(0.15)) #15% dropping out which helps in reducing overfitting 

model.add(Conv1D(300, 3, padding = 'valid', activation = "relu", strides = 1)) #

model.add(GlobalMaxPool1D()) # This extracts valuable features from sequences

model.add(Dense(100, activation = "sigmoid")) # Since we need to predict the probabilities, we are using sigmoid

#model.add(Activation('sigmoid'))

In [24]:
model.compile(optimizer='adam',loss = 'binary_crossentropy', metrics = ['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 125, 128)          10490496  
_________________________________________________________________
dropout (Dropout)            (None, 125, 128)          0         
_________________________________________________________________
conv1d (Conv1D)              (None, 123, 300)          115500    
_________________________________________________________________
global_max_pooling1d (Global (None, 300)               0         
_________________________________________________________________
dense (Dense)                (None, 100)               30100     
Total params: 10,636,096
Trainable params: 10,636,096
Non-trainable params: 0
_________________________________________________________________


We can see 1 million trainable parameters(10,636,096). More number of Parameters takes more time to train the model, since Deep Learning model takes more time train as compare to ML Models it is advisable to use the features like early stopping and saving the best model for later use. We can use this features using callbacks from keras.

In [25]:
callbacks = [
             EarlyStopping(patience=3), # It will wait for 3 epochs, even after 3 ephocs performance doesn't then training of the model will stop
             ModelCheckpoint(filepath='model-conv1d_v1.h5', save_best_only=True)
            ]   # our best model will be saved with the name 'model-conv1d_v1.h5' in our working directory

                # save_best_only = True, it saves best model on performance using validation set

In [None]:
# train model
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=512,
                    validation_split=0.1,
                    callbacks=callbacks)

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
  1/108 [..............................] - ETA: 0s - loss: 0.6934 - accuracy: 0.0020

### Predictions and Performane Evaluation

In [None]:
# use the code below to load the saved model. It appears similar to pickle
# model = load_model('model-conv1d_v1.h5') 

In [None]:
preds = model.predict(x_val) # These predictions are probabilities. Therefor we will be converting to binary using threshold

In [None]:
preds.shape

In [None]:
# set threshold to 0.45

# we can also try different values

preds_int = (preds >= 0.45).astype(int)

In [None]:
from sklearn.metrics import f1_score

# calculate F1 score
f1_score(y_val, preds_int, average="micro")

f1score() of 0.510 is observer which is good improvement...We can keep increase the number of epochs as long as the validation loss continues to decline.

We can also try changing the architecture of the model (add more convolutional layers, play around with number of units in each layer, change the value of dropout etc). We can also try completely different architectures such as RNN/LSTM...

### Inference

In [None]:
# model = load_model('model-conv1d_v1.h5')

In [None]:
def infer_tags(q):
    q = clean_text(q)
    q = q.lower()
    q_seq = tokenizer.texts_to_sequences([q])
    q_seq_padded = pad_sequences(q_seq, maxlen=300)
    q_pred = model.predict(q_seq_padded)
    q_pred = (q_pred >= 0.3).astype(int)
    
    return multilabel_binarizer.inverse_transform(q_pred)

In [None]:
# give new question
new_q = "Regression line in ggplot doesn't match computed regression Im using R and created a chart using ggplot2. I then create a regression so I can make some predicitions I pass my data frame of to the predict function predict(regression, Measures) I'd expect the predictions to be the same as if I used the regression line on the chart, but they aren't the same. Why would this be the case? Is there a setting in ggplot or is my expectation incorrect?"

# get tags
infer_tags(new_q)