# In this notebook, non-deep-learning classification models are trained on linguistic features of the data such as POS tags, named entities, syntactic dependencies, etc.
- The best accuracy on the test set is **0.87**, obtained with Linear SVC. For comparison, the accuracy using BERT is **0.978**.

In [1]:
import pandas as pd
import random
import os
import time
import numpy as np

def set_seed(seed):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, then
    stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # NOTE(review): setting PYTHONHASHSEED here is presumably only picked up by
    # child processes; the running interpreter fixed its hash seed at startup.
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed(2020) # for reproducibility of random modules if used

# Data Loading and Preprocessing

In [2]:
# Open read-only ('r+' also requested write access that is never used) and
# inside `with` blocks so both file handles are closed once the lines are read.
with open('data/train_5500.txt', 'r') as f_train:
    train = pd.DataFrame(f_train.readlines(), columns = ['Question'])
with open('data/TREC_10.txt', 'r') as f_test:
    test = pd.DataFrame(f_test.readlines(), columns = ['Question'])

# Every raw line is "<label> <question text>", where the label itself is
# "<coarse>:<fine>" (e.g. "DESC:manner"). The same parsing is applied to both
# frames; column order ends up Question, QType, QType-Coarse, QType-Fine.
for frame in (train, test):
    # QType must be extracted before Question is overwritten with the text part.
    frame['QType'] = frame.Question.apply(lambda x: x.split(' ', 1)[0])
    # The trailing newline from readlines() is intentionally left on the text.
    frame['Question'] = frame.Question.apply(lambda x: x.split(' ', 1)[1])
    frame['QType-Coarse'] = frame.QType.apply(lambda x: x.split(':')[0])
    frame['QType-Fine'] = frame.QType.apply(lambda x: x.split(':')[1])

In [3]:
# Display the first rows to check the parsed Question / QType columns.
train.head()

Unnamed: 0,Question,QType,QType-Coarse,QType-Fine
0,How did serfdom develop in and then leave Russ...,DESC:manner,DESC,manner
1,What films featured the character Popeye Doyle...,ENTY:cremat,ENTY,cremat
2,How can I find a list of celebrities ' real na...,DESC:manner,DESC,manner
3,What fowl grabs the spotlight after the Chines...,ENTY:animal,ENTY,animal
4,What is the full form of .com ?\n,ABBR:exp,ABBR,exp


In [4]:
from sklearn.preprocessing import LabelEncoder
# Fit one encoder on the coarse labels of train and test together, so both
# frames share the same string -> integer mapping.
coarse_labels = pd.concat([train['QType-Coarse'], test['QType-Coarse']], ignore_index=True).values
le = LabelEncoder().fit(coarse_labels)

# Overwrite the string labels with their integer codes in both frames.
for frame in (train, test):
    frame['QType-Coarse'] = le.transform(frame['QType-Coarse'].values)
train.head()

Unnamed: 0,Question,QType,QType-Coarse,QType-Fine
0,How did serfdom develop in and then leave Russ...,DESC:manner,1,manner
1,What films featured the character Popeye Doyle...,ENTY:cremat,2,cremat
2,How can I find a list of celebrities ' real na...,DESC:manner,1,manner
3,What fowl grabs the spotlight after the Chines...,ENTY:animal,2,animal
4,What is the full form of .com ?\n,ABBR:exp,0,exp


In [5]:
import re, nltk
import gensim
import codecs
import spacy 
from sklearn.metrics import confusion_matrix, accuracy_score, average_precision_score
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.internals import find_jars_within_path
from nltk.tag import StanfordPOSTagger
from nltk.tag import StanfordNERTagger
from sklearn import linear_model
from sklearn import svm
from sklearn.metrics import fbeta_score, accuracy_score
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mila/g/gampapha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/mila/g/gampapha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# Combined train + test question text, used to survey tokens containing a period.
all_corpus = pd.concat([train.Question, test.Question], ignore_index=True).astype(str)

# Keep every whitespace-separated token that is longer than two characters
# and contains at least one '.'.
dot_words = [
    word
    for row in all_corpus
    for word in row.split()
    if len(word) > 2 and '.' in word
]

In [7]:
from collections import Counter
# Frequency of each period-containing token collected in dot_words.
Counter(dot_words)

Counter({'.com': 1,
         'St.': 17,
         'U.S.': 151,
         'J.R.R.': 2,
         'T.V.': 1,
         'S.O.S.': 1,
         'No.1': 1,
         'Answers.com': 5,
         'U.K.': 2,
         'Mrs.': 6,
         'G.M.T.': 1,
         'www.answers.com': 1,
         'A.G.': 1,
         '.dbf': 1,
         'www.questions.com': 1,
         'KnowPost.com': 1,
         'L.A.': 4,
         'W.B.': 1,
         'D.B.': 1,
         '42.3': 1,
         'Dr.': 4,
         'Rev.': 1,
         'Jan.': 1,
         'Jr.': 4,
         'No.': 3,
         'Mr.': 6,
         'Ms.': 2,
         'Inc.': 3,
         'cwt.': 1,
         'U.S.A.': 3,
         'etc.': 3,
         'question..': 1,
         'D.C.': 6,
         'N.M': 1,
         'e.g.': 2,
         '...the': 1,
         'q.i.d': 1,
         'aol.com': 1,
         'yahoo.com': 1,
         'J.D.': 1,
         'LL.M.': 1,
         'U.S.S.R.': 1,
         'creativity.': 1,
         'a.m.': 1,
         'p.m.': 3,
         '...': 1,
         

In [8]:
def text_clean(corpus, keep_list):
    '''
    Clean every text in ``corpus`` word by word.

    Each text is split on whitespace. A word found in ``keep_list`` is kept
    exactly as written; any other word has every character outside
    ``[a-zA-Z0-9]`` replaced by a single space and is lower-cased. The
    processed words are joined back with single spaces, so a word that
    contained punctuation leaves extra spaces in the result.

    Parameters
    ----------
    corpus : iterable of str
        Texts to clean (e.g. a pandas Series of questions).
    keep_list : container of str
        Words to preserve verbatim (e.g. common abbreviations like "U.S.").

    Returns
    -------
    pandas.Series
        One cleaned string per input text, in input order, with object dtype
        and a default 0..n-1 index. (The earlier implementation grew the
        Series with ``Series.append`` inside the loop, which is quadratic,
        is unavailable in pandas >= 2.0, and left every row with index 0.)
    '''
    non_alnum = re.compile('[^a-zA-Z0-9]')
    cleaned_rows = []
    for row in corpus:
        words = []
        for word in row.split():
            if word in keep_list:
                words.append(word)
            else:
                words.append(non_alnum.sub(' ', word).lower())
        cleaned_rows.append(' '.join(words))
    # Build the Series once; an explicit dtype keeps the empty-corpus case
    # free of the "default dtype for empty Series" warning.
    return pd.Series(cleaned_rows, dtype=object)

In [13]:
# Rebuild the combined train + test corpus and normalise it, keeping the most
# frequent dotted abbreviations untouched.
all_corpus = pd.Series(train.Question.tolist() + test.Question.tolist()).astype(str)
common_dot_words = ['U.S.', 'St.', 'Mr.', 'Mrs.', 'D.C.']   #including most frequent dot words
all_corpus = text_clean(all_corpus,common_dot_words)

# English stopwords are dropped, except the wh-words: every sample is a
# question, so these words are kept as features.
wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']  # all of them are questions, so including these
stop = set(stopwords.words('english'))
for word in wh_words:
    stop.remove(word)

# Filter stopwords and re-join each question in a single pass.
all_corpus = [
    ' '.join(tok for tok in question.split() if tok not in stop)
    for question in all_corpus
]

# The first train.shape[0] entries came from train, the rest from test.
n_train = train.shape[0]
train_corpus = all_corpus[:n_train]
test_corpus = all_corpus[n_train:]



  cleaned_corpus = pd.Series()


# Linguistic Feature Extraction

In [10]:
# Load the spaCy "en_core_web_sm" pipeline once; `nlp` is then called on each
# question to obtain lemmas, tags, dependencies, shapes and entities.
nlp = spacy.load("en_core_web_sm")

In [14]:
def extract_linguistic_strings(corpus):
    """Run the spaCy pipeline ``nlp`` on each text and serialise its annotations.

    For every text in ``corpus`` the per-token lemma, fine-grained tag,
    dependency label and shape are each joined into one space-separated
    string; entity labels from ``doc.ents`` are joined the same way.

    Returns:
        Tuple ``(ner, lemma, tag, dep, shape)`` of five lists, each holding one
        string per input text, in input order.
    """
    ner_rows, lemma_rows, tag_rows, dep_rows, shape_rows = [], [], [], [], []
    for text in corpus:
        parsed = nlp(text)
        lemma_rows.append(" ".join(tok.lemma_ for tok in parsed))
        tag_rows.append(" ".join(tok.tag_ for tok in parsed))
        dep_rows.append(" ".join(tok.dep_ for tok in parsed))
        shape_rows.append(" ".join(tok.shape_ for tok in parsed))
        ner_rows.append(" ".join(ent.label_ for ent in parsed.ents))
    return ner_rows, lemma_rows, tag_rows, dep_rows, shape_rows


all_ner, all_lemma, all_tag, all_dep, all_shape = extract_linguistic_strings(train_corpus)

# One unigram+bigram bag-of-words vectorizer per annotation type, fitted on the
# training strings only; the fitted vectorizers are reused for the test set.
# NOTE(review): CountVectorizer's defaults lowercase the input and keep only
# tokens of two or more word characters, so punctuation tags and one-character
# lemmas/shapes never become features -- confirm this is intended.
count_vec_ner = CountVectorizer(ngram_range=(1, 2))
ner_ft = count_vec_ner.fit(all_ner).transform(all_ner)
count_vec_lemma = CountVectorizer(ngram_range=(1, 2))
lemma_ft = count_vec_lemma.fit(all_lemma).transform(all_lemma)
count_vec_tag = CountVectorizer(ngram_range=(1, 2))
tag_ft = count_vec_tag.fit(all_tag).transform(all_tag)
count_vec_dep = CountVectorizer(ngram_range=(1, 2))
dep_ft = count_vec_dep.fit(all_dep).transform(all_dep)
count_vec_shape = CountVectorizer(ngram_range=(1, 2))
shape_ft = count_vec_shape.fit(all_shape).transform(all_shape)




In [15]:
# Stack the five sparse feature blocks column-wise and convert to CSR in one step.
x_all_ft_train = hstack([ner_ft, lemma_ft, tag_ft, dep_ft, shape_ft]).tocsr()

In [16]:
# Serialise the spaCy annotations of every test question into one
# space-separated string per annotation type.
all_test_ner, all_test_lemma, all_test_tag, all_test_dep, all_test_shape = [], [], [], [], []
for question in test_corpus:
    parsed = nlp(question)
    all_test_lemma.append(" ".join(tok.lemma_ for tok in parsed))
    all_test_tag.append(" ".join(tok.tag_ for tok in parsed))
    all_test_dep.append(" ".join(tok.dep_ for tok in parsed))
    all_test_shape.append(" ".join(tok.shape_ for tok in parsed))
    # Entity labels come from the document's entity spans, not from tokens.
    all_test_ner.append(" ".join(ent.label_ for ent in parsed.ents))

In [17]:
# Project the test strings onto the vocabularies learned from the training set
# (transform only -- the vectorizers are not refitted).
ner_test_ft, lemma_test_ft, tag_test_ft, dep_test_ft, shape_test_ft = (
    vectorizer.transform(texts)
    for vectorizer, texts in [
        (count_vec_ner, all_test_ner),
        (count_vec_lemma, all_test_lemma),
        (count_vec_tag, all_test_tag),
        (count_vec_dep, all_test_dep),
        (count_vec_shape, all_test_shape),
    ]
)

In [18]:
# Blocks are stacked in the order ner, lemma, tag, dep, shape.
test_feature_blocks = [ner_test_ft, lemma_test_ft, tag_test_ft, dep_test_ft, shape_test_ft]
x_all_ft_test = hstack(test_feature_blocks).tocsr()

# Model Training
## Linear SVC

In [19]:
from sklearn.metrics import f1_score, classification_report

from sklearn.svm import LinearSVC


# Fit a LinearSVC with default settings on the stacked training features and
# evaluate it on the test questions.
model = LinearSVC()

# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
model.fit(x_all_ft_train, train['QType-Coarse'].values)
t1 = time.perf_counter()
y_pred = model.predict(x_all_ft_test)
t2 = time.perf_counter()
time_train = t1-t0
time_predict = t2-t1


print("Results for LinearSVC")
print("Training time: %fs; Prediction time: %fs" % (time_train, time_predict))
# Rows of the report are the integer codes assigned to the coarse labels.
print(classification_report(test['QType-Coarse'].values, y_pred))

Results for LinearSVC
Training time: 1.393968s; Prediction time: 0.004478s
              precision    recall  f1-score   support

           0       1.00      0.78      0.88         9
           1       0.81      0.96      0.88       138
           2       0.86      0.71      0.78        94
           3       0.87      0.95      0.91        65
           4       0.89      0.86      0.87        81
           5       0.98      0.87      0.92       113

    accuracy                           0.87       500
   macro avg       0.90      0.86      0.87       500
weighted avg       0.88      0.87      0.87       500





## XGBoost

In [23]:
from xgboost import XGBClassifier


# Fit an XGBClassifier with default settings on the stacked training features
# and evaluate it on the test questions.
model = XGBClassifier()

# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
model.fit(x_all_ft_train, train['QType-Coarse'].values)
t1 = time.perf_counter()
y_pred = model.predict(x_all_ft_test)
t2 = time.perf_counter()
time_train = t1-t0
time_predict = t2-t1


print("Results for xgboost")
print("Training time: %fs; Prediction time: %fs" % (time_train, time_predict))
# Rows of the report are the integer codes assigned to the coarse labels.
print(classification_report(test['QType-Coarse'].values, y_pred))

Results for xgboost
Training time: 5.512071s; Prediction time: 0.013048s
              precision    recall  f1-score   support

           0       1.00      0.78      0.88         9
           1       0.81      0.99      0.89       138
           2       0.76      0.72      0.74        94
           3       0.86      0.88      0.87        65
           4       0.86      0.80      0.83        81
           5       0.98      0.81      0.89       113

    accuracy                           0.85       500
   macro avg       0.88      0.83      0.85       500
weighted avg       0.86      0.85      0.85       500



## Support Vector Classifier (non-linear kernel)

In [24]:
from sklearn.svm import SVC


# Fit an SVC with default settings (the section's non-linear-kernel baseline) on
# the stacked training features and evaluate it on the test questions.
model = SVC()

# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
model.fit(x_all_ft_train, train['QType-Coarse'].values)
t1 = time.perf_counter()
y_pred = model.predict(x_all_ft_test)
t2 = time.perf_counter()
time_train = t1-t0
time_predict = t2-t1


print("Results for SVC")
print("Training time: %fs; Prediction time: %fs" % (time_train, time_predict))
# Rows of the report are the integer codes assigned to the coarse labels.
print(classification_report(test['QType-Coarse'].values, y_pred))


Results for SVC
Training time: 6.073678s; Prediction time: 0.387457s
              precision    recall  f1-score   support

           0       1.00      0.22      0.36         9
           1       0.69      0.98      0.81       138
           2       0.56      0.70      0.63        94
           3       0.91      0.78      0.84        65
           4       0.83      0.60      0.70        81
           5       1.00      0.62      0.77       113

    accuracy                           0.75       500
   macro avg       0.83      0.65      0.68       500
weighted avg       0.79      0.75      0.74       500

