In [1]:
'''
# Steps to Follow
1. Load Data and Import Libraries
2. Text Cleaning
3. Merge Tags with Questions
4. Dataset Preparation
5. Text Representation
6. Model Building
    1. Define Model Architecture
    2. Train the Model
7. Model Predictions
8. Model Evaluation
9. Inference
'''

'\n# Steps to Follow\n1. Load Data and Import Libraries\n2. Text Cleaning\n3. Merge Tags with Questions\n4. Dataset Prepartion\n5. Text Representation\n6. Model Building\n    1. Define Model Architecture\n    2. Train the Model\n7. Model Predictions\n8. Model Evaluation\n9. Inference\n'

In [2]:
# Load Data and Import Libraries
#string matching
import re 

#reading files
import pandas as pd
import numpy as np
#handling html data
from bs4 import BeautifulSoup
import zipfile
import os
#visualization
import matplotlib.pyplot as plt  


In [4]:
# Load the questions and tags datasets.
# Path to the downloaded archive and the directory it is extracted into.
zip_file_path = 'data/archive (2).zip'
extract_to_dir = 'data/unzipped_contents'

# Extract everything, then list the archive members.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to_dir)

    print(f"Contents of the zip file '{zip_file_path}':")
    for file_name in zip_ref.namelist():
        print(file_name)

# Build the CSV paths from extract_to_dir so that changing the extraction
# directory cannot leave the readers pointing at a stale location.
questions_df = pd.read_csv(os.path.join(extract_to_dir, 'Questions.csv'), encoding='latin-1')
print(questions_df.head())

tags_df = pd.read_csv(os.path.join(extract_to_dir, 'Tags.csv'))
print(tags_df.head())

print(questions_df.columns)

In [5]:

# Preview the first five rows of the questions table.
print(questions_df.head())


   Id  OwnerUserId          CreationDate  Score  \
0   6          5.0  2010-07-19T19:14:44Z    272   
1  21         59.0  2010-07-19T19:24:36Z      4   
2  22         66.0  2010-07-19T19:25:39Z    208   
3  31         13.0  2010-07-19T19:28:44Z    138   
4  36          8.0  2010-07-19T19:31:47Z     58   

                                                                Title  \
0                  The Two Cultures: statistics vs. machine learning?   
1                                      Forecasting demographic census   
2                 Bayesian and frequentist reasoning in plain English   
3  What is the meaning of p values and t values in statistical tests?   
4          Examples for teaching: Correlation does not mean causation   

                                                                                                                                                                                                      Body  
0  <p>Last year, I read a blog post from <a href=

   Id            Tag
0   1       bayesian
1   1          prior
2   1    elicitation
3   2  distributions
4   2      normality


In [7]:
# Print the first 5 rows and the column names of the questions table.
# (The bare `questions_df.head()` that used to sit here was evaluated and
# discarded, because only a cell's last expression is displayed.)
print(questions_df.head())

print(questions_df.columns)



   Id  OwnerUserId          CreationDate  Score  \
0   6          5.0  2010-07-19T19:14:44Z    272   
1  21         59.0  2010-07-19T19:24:36Z      4   
2  22         66.0  2010-07-19T19:25:39Z    208   
3  31         13.0  2010-07-19T19:28:44Z    138   
4  36          8.0  2010-07-19T19:31:47Z     58   

                                                                Title  \
0                  The Two Cultures: statistics vs. machine learning?   
1                                      Forecasting demographic census   
2                 Bayesian and frequentist reasoning in plain English   
3  What is the meaning of p values and t values in statistical tests?   
4          Examples for teaching: Correlation does not mean causation   

                                                                                                                                                                                                      Body  
0  <p>Last year, I read a blog post from <a href=

In [8]:
# Text Cleaning

#Let's define a function to clean the text data.
def cleaner(text):
  """Strip HTML markup from `text` and reduce it to lowercase words.

  The markup is parsed with BeautifulSoup's "html.parser", every character
  outside a-z / A-Z is replaced by a space, the result is lower-cased and
  runs of whitespace are collapsed to a single space.

  Args:
    text: Raw HTML string. ``None`` and float NaN (the value pandas uses
      for a missing cell) are treated as empty text.

  Returns:
    The lowercase alphabetic tokens joined by single spaces; "" when the
    input is missing or contains no alphabetic characters.
  """
  # Missing cells arrive through .apply() as None/NaN rather than markup;
  # return early instead of handing a non-string to the HTML parser.
  if text is None or (isinstance(text, float) and text != text):
    return ""

  text = BeautifulSoup(text, features="html.parser").get_text()

  # keep alphabetic characters only
  text = re.sub("[^a-zA-Z]", " ", text)

  # lower-case, then split/join to collapse repeated whitespace
  return " ".join(text.lower().split())


In [9]:
# Clean every question body; the raw HTML stays available in 'Body'.
questions_df['cleaned_text'] = questions_df['Body'].apply(cleaner)

# Show one raw body (row label 1) for comparison with its cleaned version.
print(questions_df['Body'][1])

# Merge Tags with Questions
# Explore the tags data first.
print(tags_df.head())

# count of unique tags
print(len(tags_df['Tag'].unique()))

# number of rows per tag, most frequent first
print(tags_df['Tag'].value_counts())



<p>What are some of the ways to forecast demographic census with some validation and calibration techniques?</p>

<p>Some of the concerns:</p>

<ul>
<li>Census blocks vary in sizes as rural
areas are a lot larger than condensed
urban areas. Is there a need to account for the area size difference?</li>
<li>if let's say I have census data
dating back to 4 - 5 census periods,
how far can i forecast it into the
future?</li>
<li>if some of the census zone change
lightly in boundaries, how can i
account for that change?</li>
<li>What are the methods to validate
census forecasts? for example, if i
have data for existing 5 census
periods, should I model the first 3
and test it on the latter two? or is
there another way?</li>
<li>what's the state of practice in
forecasting census data, and what are
some of the state of the art methods?</li>
</ul>

   Id            Tag
0   1       bayesian
1   1          prior
2   1    elicitation
3   2  distributions
4   2      normality
1315
Tag
r             

In [10]:
from collections import Counter

# Replace "-" in the tag names with a space (literal replacement, no regex
# needed) and collect the tags of each question Id into one array.
# NOTE: tags_df is rebound to the grouped frame (columns 'Id', 'tags'), so
# this cell needs Tags.csv to be reloaded before it can be run again.
tags_df = (
    tags_df
    .assign(Tag=tags_df['Tag'].str.replace('-', ' ', regex=False))
    .groupby('Id')['Tag']
    .apply(lambda tags: tags.values)
    .reset_index(name='tags')
)

print(tags_df.head())

# merge tags and questions
df = pd.merge(questions_df, tags_df, how='inner', on='Id')

df = df[['Id', 'Body', 'cleaned_text', 'tags']]
print(df.head())

print(df.shape)

# Dataset Preparation
# Count how often each tag occurs. most_common() returns the pairs sorted by
# count in descending order; dict() preserves that order.
freq = dict(Counter(tag for tags in df['tags'] for tag in tags).most_common())

print(freq.items())

# Top 10 most frequent tags
common_tags = list(freq)[:10]
print(common_tags)


   Id                                       tags
0   1             [bayesian, prior, elicitation]
1   2                 [distributions, normality]
2   3                    [software, open source]
3   4  [distributions, statistical significance]
4   6                         [machine learning]
   Id  \
0   6   
1  21   
2  22   
3  31   
4  36   

                                                                                                                                                                                                      Body  \
0  <p>Last year, I read a blog post from <a href="http://anyall.org/">Brendan O'Connor</a> entitled <a href="http://anyall.org/blog/2008/12/statistics-vs-machine-learning-fight/">"Statistics vs. Mach...   
1  <p>What are some of the ways to forecast demographic census with some validation and calibration techniques?</p>\n\n<p>Some of the concerns:</p>\n\n<ul>\n<li>Census blocks vary in sizes as rural\n...   
2                                

In [11]:
from sklearn.preprocessing import MultiLabelBinarizer

# Keep only the questions that carry more than one of the 10 common tags,
# and keep only those common tags as their labels.
# NOTE(review): `len(kept) > 1` drops questions with exactly one common tag;
# confirm that this is intended rather than `>= 1`.
common_tag_set = set(common_tags)  # set gives constant-time membership tests

x = []
y = []

for text, tags in zip(df['cleaned_text'], df['tags']):
  kept = [tag for tag in tags if tag in common_tag_set]

  if len(kept) > 1:
    x.append(text)
    y.append(kept)

# number of questions left
print(len(x))

print(y[:10])

# Turn each list of tags into a 0/1 indicator row, one column per tag.
mlb = MultiLabelBinarizer()

y = mlb.fit_transform(y)

print(y.shape)

print(y[0,:])

print(mlb.classes_)


11106
[['r', 'time series'], ['regression', 'distributions'], ['distributions', 'probability', 'hypothesis testing'], ['hypothesis testing', 'self study'], ['r', 'regression', 'time series'], ['r', 'time series', 'self study'], ['probability', 'hypothesis testing'], ['r', 'regression'], ['r', 'regression'], ['regression', 'logistic']]
(11106, 10)
[0 0 0 0 0 0 1 0 0 1]
['classification' 'distributions' 'hypothesis testing' 'logistic'
 'machine learning' 'probability' 'r' 'regression' 'self study'
 'time series']


In [12]:
# Hold out 20% of the questions as a validation set (shuffled, fixed seed).

from sklearn.model_selection import train_test_split
x_tr, x_val, y_tr, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

# Text Representation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences 



In [13]:
# First pass: fit a tokenizer on the training texts only, to measure the
# size of the vocabulary.
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(x_tr)

print(x_tokenizer.word_index)

print(len(x_tokenizer.word_index))

# Count the tokens that occur at least `thresh` (= 3) times in the
# training texts.
thresh = 3

cnt = sum(1 for occurrences in x_tokenizer.word_counts.values() if occurrences >= thresh)

print(cnt)

# Second pass: rebuild the tokenizer with the vocabulary capped at `cnt`;
# words outside the cap are mapped to the 'unk' token.
# NOTE(review): Keras documents `num_words` as keeping the num_words-1 most
# common words, and the OOV token also takes an index, so slightly fewer
# than `cnt` real words may survive - confirm this is acceptable.
x_tokenizer = Tokenizer(num_words=cnt, oov_token='unk')
x_tokenizer.fit_on_texts(x_tr)

'''
Now that we have encoded every token to an integer, let's convert the text sequences to integer sequences. After that we will pad the integer sequences to the maximum sequence length, i.e., 100.


'''


25315
12575


"\nNow that we have encoded every token to an integer, let's convert the text sequences to integer sequences. After that we will pad the integer sequences to the maximum sequence length, i.e., 100.\n\n\n"

In [14]:
# Every question is represented by a sequence of max_len token ids.
max_len = 100

# Encode the texts as integer sequences and pad them with zeros at the end
# ('post') up to max_len.
x_tr_seq = pad_sequences(
    x_tokenizer.texts_to_sequences(x_tr),
    maxlen=max_len,
    padding='post',
)
x_val_seq = pad_sequences(
    x_tokenizer.texts_to_sequences(x_val),
    maxlen=max_len,
    padding='post',
)

# Index 0 is used for padding, so the vocabulary size is one larger than
# the tokenizer's word limit.
x_voc_size = 1 + x_tokenizer.num_words

print(x_voc_size)

print(x_tr_seq[0])


12576
[1953 5711  416 2023    1  226 1747 3740  609   43  181 1953  372   19
  100  416    9 1747 3839  238   27   27   27   27   27   70    6 6919
    8 1163   70    6   43   43 1802 1802 1802   36   36   36   36 4308
 5410    4  124  592  107   22    2 1747 4065   27   10 1309   10 6415
   10  190   10  416   10   27   10 1309   10 6415   10  190   10  416
   10  456  139   15    7    2 4610  164   27   10 1309   10 6415   10
  190   10  416   10   27   76   27 1309   76   27 6415   76   27  190
   76   27]


In [15]:
# Model Building
from keras.models import *
from keras.layers import *
from keras.callbacks import *
import keras.backend as k

### Define Model Architecture
# Start from a clean Keras session, then declare the layer stack in one go.
k.clear_session()
model = Sequential([
    # token ids -> 50-dimensional trainable embeddings
    Embedding(x_voc_size, 50, trainable=True, input_shape=(max_len,)),
    # 64 filters of width 3 (no activation argument is passed)
    Conv1D(64, 3, padding='same'),
    Dropout(0.1),
    # keep the maximum of each filter over the sequence
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    # 10 sigmoid outputs, matching the 10 label columns
    Dense(10, activation='sigmoid'),
])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 50)           628800    
                                                                 
 conv1d (Conv1D)             (None, 100, 64)           9664      
                                                                 
 dropout (Dropout)           (None, 100, 64)           0         
                                                                 
 global_max_pooling1d (Glob  (None, 64)                0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 10)                1290      
                                                        

In [16]:
# Adam optimizer with binary cross-entropy over the 10 sigmoid outputs.
model.compile(loss='binary_crossentropy', optimizer='adam')

# Write the weights to disk only when the validation loss improves.
mc = ModelCheckpoint(
    "weights.best.hdf5",
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

### Train the Model
model.fit(
    x_tr_seq,
    y_tr,
    validation_data=(x_val_seq, y_val),
    epochs=10,
    batch_size=128,
    callbacks=[mc],
    verbose=1,
)

# Model Predictions
# Restore the weights saved by the checkpoint callback.
model.load_weights("weights.best.hdf5")

# Predicted probabilities for the validation set, one column per tag.
pred_prob = model.predict(x_val_seq)

print(pred_prob[0])


Epoch 1/10
Epoch 1: val_loss improved from inf to 0.47185, saving model to weights.best.hdf5
Epoch 2/10
13/70 [====>.........................] - ETA: 0s - loss: 0.4666

  saving_api.save_model(


Epoch 2: val_loss improved from 0.47185 to 0.37552, saving model to weights.best.hdf5
Epoch 3/10
Epoch 3: val_loss improved from 0.37552 to 0.30594, saving model to weights.best.hdf5
Epoch 4/10
Epoch 4: val_loss improved from 0.30594 to 0.28040, saving model to weights.best.hdf5
Epoch 5/10
Epoch 5: val_loss improved from 0.28040 to 0.27012, saving model to weights.best.hdf5
Epoch 6/10
Epoch 6: val_loss improved from 0.27012 to 0.26541, saving model to weights.best.hdf5
Epoch 7/10
Epoch 7: val_loss did not improve from 0.26541
Epoch 8/10
Epoch 8: val_loss did not improve from 0.26541
Epoch 9/10
Epoch 9: val_loss did not improve from 0.26541
Epoch 10/10
Epoch 10: val_loss did not improve from 0.26541
[0.05632282 0.03528995 0.02115313 0.04292109 0.13188846 0.01419459
 0.97529477 0.21525486 0.03452519 0.26283965]


In [17]:

'''
The predictions are in terms of probabilities for each of the 10 tags. Hence we need to have a threshold value to convert these probabilities to 0 or 1.

Let's specify a set of candidate threshold values. We will select the threshold value that performs the best for the validation set.

'''

# Candidate decision thresholds: 0.00, 0.01, ..., 0.49
threshold = np.arange(start=0, stop=0.5, step=0.01)
print(threshold)


[0.   0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08 0.09 0.1  0.11 0.12 0.13
 0.14 0.15 0.16 0.17 0.18 0.19 0.2  0.21 0.22 0.23 0.24 0.25 0.26 0.27
 0.28 0.29 0.3  0.31 0.32 0.33 0.34 0.35 0.36 0.37 0.38 0.39 0.4  0.41
 0.42 0.43 0.44 0.45 0.46 0.47 0.48 0.49]


In [18]:

'''
Let's define a function that takes a threshold value and uses it to convert probabilities into 1 or 0.

'''

# convert probabilities into classes or tags based on a threshold value
def classify(pred_prob,thresh):
  """Binarise predicted probabilities with a cut-off.

  Args:
    pred_prob: iterable of rows, each row an iterable of probabilities.
    thresh: cut-off; a probability >= thresh becomes 1, anything else 0.

  Returns:
    A list of lists of 0/1 ints laid out like `pred_prob`.
  """
  return [[1 if prob >= thresh else 0 for prob in row] for row in pred_prob]


In [19]:

from sklearn import metrics

# Flatten the validation labels to 1-D so that F1 is computed over every
# individual (question, tag) decision.
y_true = np.array(y_val).ravel()

# F1 score on the validation set for each candidate threshold.
# NOTE: the threshold is tuned on the same validation predictions that the
# evaluation report uses, so that report is optimistic.
score = []
for thresh in threshold:
    y_pred_seq = classify(pred_prob, thresh)
    y_pred = np.array(y_pred_seq).ravel()
    score.append(metrics.f1_score(y_true, y_pred))

# Pick the first threshold that reaches the highest F1.
best_idx = score.index(max(score))
opt = threshold[best_idx]
print(opt)



0.39


In [20]:
# Model Evaluation
# Binarise the validation predictions with the selected threshold.
y_pred_seq = classify(pred_prob, opt)
y_pred = np.array(y_pred_seq).ravel()

# Recompute the flattened ground truth here so the report does not depend
# on state left behind by another cell.
y_true = np.array(y_val).ravel()

print(metrics.classification_report(y_true, y_pred))

# Decode the 0/1 rows back into tag names. The decoded values get their own
# names: rebinding y_true / y_pred / df would leave this cell (and the cells
# that read the merged `df`) unable to run a second time.
y_pred_tags = mlb.inverse_transform(np.array(y_pred_seq))
y_true_tags = mlb.inverse_transform(np.array(y_val))

results_df = pd.DataFrame({'comment': x_val, 'actual': y_true_tags, 'predictions': y_pred_tags})

print(results_df.sample(10))

# Inference
def predict_tag(comment):
  """Predict the tags for one raw piece of text.

  The text goes through the same steps as the training data: `cleaner`,
  `x_tokenizer.texts_to_sequences` and post-padding to `max_len`. The
  model's probabilities are binarised by `classify` at threshold `opt`
  and decoded with `mlb.inverse_transform`.

  Args:
    comment: raw text to tag.

  Returns:
    The result of `mlb.inverse_transform` for the single input row (for
    scikit-learn's MultiLabelBinarizer: a one-element list holding the
    tuple of predicted tag names).
  """
  # preprocess; the tokenizer expects a list of texts
  text = [cleaner(comment)]

  # convert to integer sequences
  seq = x_tokenizer.texts_to_sequences(text)

  # pad the sequence
  pad_seq = pad_sequences(seq,  padding='post', maxlen=max_len)

  # make predictions; `probs` has one row because there is one input text
  probs = model.predict(pad_seq)

  # classify() already returns one row per input, i.e. the 2-D layout that
  # inverse_transform expects
  classes = np.array(classify(probs, opt))
  return mlb.inverse_transform(classes)

# Run the tagger on an example sentence.
comment = "For example, in the case of logistic regression, the learning function is a Sigmoid function that tries to separate the 2 classes"

# The input is printed first; predict_tag() is evaluated as part of the
# second print call.
print("Comment:",comment)
print("Predicted Tags:",predict_tag(comment))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93     17520
           1       0.74      0.74      0.74      4700

    accuracy                           0.89     22220
   macro avg       0.84      0.84      0.84     22220
weighted avg       0.89      0.89      0.89     22220

                                                                                                                                                                                                      comment  \
557   suppose i ve done an experiment and i have a distribution of observations x that vary between pi and pi now suppose each x is associated with a second observation y that may or may not influence t...   
25    first cross validated question so please be gentle o i have two datasets all gathered and managed in r dataset news corpus contains entries from the period apr to mar there are often multiple stor...   
1412  i am looking for an r package that will 