# Get Answer-Type index

In [1]:
# f = open('.\Data\ATD_train.txt')
import platform




def getAnswerTypeIndex(filename):
    """Build a two-way lookup between answer-type names and their integer ids.

    Every non-blank line of ``filename`` is split on single spaces; the first
    field is converted with ``int`` and used as the id, the second field is
    used as the type name.

    Returns a dict that maps each type name to its int id and each int id
    back to its type name.

    Raises ValueError if a first field is not an integer and IndexError if a
    non-blank line has no second field.
    """
    # The ``with`` block closes the file even when a line fails to parse.
    with open(filename) as f:
        # splitlines() accepts '\n' and '\r\n' alike, so the result no longer
        # depends on the platform or on how the file was saved.
        lines = f.read().splitlines()
    answer_types = {}
    for line in lines:
        if not line.strip():
            # e.g. the empty string left behind by a trailing newline
            continue
        fields = line.split(' ')
        index, name = int(fields[0]), fields[1]
        answer_types[name] = index
        answer_types[index] = name
    return answer_types

# Module-level lookup (type name -> id and id -> type name) read from
# 'AnswerType.txt' in the current working directory.
answer_types = getAnswerTypeIndex('AnswerType.txt')

# Get training questions and their types

In [2]:
import re
import nltk
def clean_str(string):
    """Normalise a raw sentence into space-separated tokens.

    Steps, in order:
      1. every character other than ASCII letters, digits, ``( ) , !`` and
         the apostrophe is replaced by a space (this also removes ``?``);
      2. the contractions ``'s 've n't 're 'd 'll`` are split off with a
         leading space;
      3. ``,`` ``!`` ``(`` ``)`` are padded with spaces on both sides;
      4. runs of whitespace are collapsed and the result is stripped.

    Returns the cleaned string.
    """
    string = re.sub(r"[^A-Za-z0-9(),!\']", " ", string)
    # Order matters: each rule runs on the output of the previous one.
    for contraction in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(contraction, " " + contraction)
    # Pad with plain spaces. The previous replacement templates " \( " and
    # " \) " were passed through by ``re.sub`` unchanged and therefore put a
    # literal backslash in front of every parenthesis.
    for mark in (",", "!", "(", ")"):
        string = string.replace(mark, " " + mark + " ")
    # No rule for "?" is needed: step 1 has already replaced it with a space.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

def getPOSTag(sentences):
    """Return the POS tags of ``sentences`` as a single string.

    The text is tokenised with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; each tag is followed by one space, so a non-empty
    result ends with a trailing space.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentences))
    return "".join(pair[1] + " " for pair in tagged_tokens)
def token(s):
    """Split ``s`` on runs of whitespace and return the pieces as a list."""
    pieces = s.split()
    return pieces

In [3]:
# `newline` is kept as a module-level name because other cells may reference it.
newline = '\n' if platform.system() == 'Windows' else '\r\n'

questions = []          # cleaned question text, one entry per training line
questions_POSTag = []   # POS-tag sequence of each (uncleaned) question
questions_type = []     # answer_types lookup result for each line's leading label

# Read everything up front so the handle is closed before the tagging loop runs.
with open('./Data/ATD_train.txt') as f:
    # splitlines() copes with both '\n' and '\r\n' endings; the replace() call
    # strips a UTF-8 byte-order mark when the file is read as a byte string.
    lines = f.read().replace('\xef\xbb\xbf','').splitlines()

for line in lines:
    if not line.strip():
        # A blank line has no "<type> <question>" pair and line.index(' ')
        # would raise ValueError on an empty string.
        continue
    sep = line.index(' ')
    qt = line[:sep]
    # NOTE(review): the -1 drops the final character of every line,
    # presumably the trailing '?' - confirm against the data file.
    q = line[sep + 1 : -1]
    questions.append(clean_str(q))
    questions_POSTag.append(getPOSTag(q))
    questions_type.append(answer_types[qt])
# Parenthesised form prints the same text on Python 2 and Python 3.
print(questions_POSTag[0])

WP NN VBD DT NN IN DT JJ NN IN DT NNP 


In [4]:
import pickle
def save_obj(obj, name ):
    """Pickle ``obj`` to the file ``name + '.pkl'`` with the highest protocol."""
    target_path = name + '.pkl'
    with open(target_path, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    """Unpickle and return the object stored in the file ``name + '.pkl'``."""
    source_path = name + '.pkl'
    with open(source_path, 'rb') as source:
        loaded = pickle.load(source)
    return loaded

# Building Vocabulary for Unigram and UniPOS

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a unigram vocabulary on the cleaned training questions.
cv_Word = CountVectorizer(ngram_range = (1,1))
arr_features = cv_Word.fit_transform(questions).toarray()

# Fit a vocabulary over POS tags; tags are case-sensitive and already
# space-separated, hence lowercase=False and the whitespace tokenizer.
cv_Pos = CountVectorizer(lowercase = False,tokenizer = token, ngram_range = (1,1))
array_Pos = cv_Pos.fit_transform(questions_POSTag).toarray()

# Save Vocabulary and POS
# `with` closes (and therefore flushes) each file as soon as it is written.
with open('Unigram_Vocabulary.pkl', 'wb') as vocab_file:
    pickle.dump(cv_Word.vocabulary_, vocab_file, pickle.HIGHEST_PROTOCOL)
with open('UniPOS_List.pkl','wb') as pos_file:
    pickle.dump(cv_Pos.vocabulary_, pos_file, pickle.HIGHEST_PROTOCOL)

# Feature Extraction

In [6]:
import numpy as np
def feature_extraction(question,CV_Unigram,CV_UniPOS):
    """Turn one question or a sequence of questions into a count-feature matrix.

    Parameters
    ----------
    question : a single string, or a sequence of strings.
    CV_Unigram : vectorizer whose ``transform`` is applied to the question text.
    CV_UniPOS : vectorizer whose ``transform`` is applied to the POS-tag
        sequences produced by ``getPOSTag``.

    Returns
    -------
    numpy array with one row per question: the unigram counts followed by
    the POS-tag counts, stacked horizontally. A single string yields one row.
    """
    # A lone string is wrapped in a list so both input kinds share one code
    # path. The former string branch passed the bare string to transform()
    # and combined the two blocks with ``+=`` instead of stacking them side
    # by side. ``type(u"")`` is ``unicode`` on Python 2 and ``str`` on Python 3.
    if isinstance(question, (str, type(u""))):
        question = [question]
    question_POSTag = [getPOSTag(q) for q in question]
    word_counts = np.array(CV_Unigram.transform(question).toarray())
    pos_counts = np.array(CV_UniPOS.transform(question_POSTag).toarray())
    return np.array(np.hstack((word_counts, pos_counts)))

In [7]:
# Reload the saved vocabularies; `with` closes each file once it has been read.
# NOTE: pickle.load executes whatever the file encodes - only load files
# written by this notebook.
with open('Unigram_Vocabulary.pkl','rb') as vocab_file:
    word_voca = pickle.load(vocab_file)
with open('UniPOS_List.pkl','rb') as pos_file:
    pos_list = pickle.load(pos_file)

# Vectorizers with a fixed vocabulary, so feature columns match the saved files.
CV_Unigram = CountVectorizer(vocabulary = word_voca, ngram_range = (1,1))
CV_UniPOS = CountVectorizer(vocabulary = pos_list,lowercase = False,tokenizer = token,ngram_range = (1,1))

train_question = feature_extraction(questions,CV_Unigram,CV_UniPOS)
train_label = questions_type

# Training 

In [8]:
import sklearn.svm as SVM
# Fit a linear SVM on the training feature matrix and its labels.
model = SVM.LinearSVC()
model.fit(train_question,train_label)

#Saving model
# `with` guarantees the pickle is flushed and the handle closed before the
# file is read back in a later cell.
with open('ATD_Model.pkl','wb') as model_file:
    pickle.dump(model, model_file)

# Testing

In [9]:
from sklearn.metrics import accuracy_score
test_questions = []        # cleaned question text, one entry per test line
test_questions_type = []   # answer_types lookup result for each line's leading label

# Read the whole test file and close it straight away. The name `f` is kept
# so that a later `f.close()` on the already-closed file stays a harmless no-op.
with open('./Data/ATD_test.txt') as f:
    # splitlines() copes with both '\n' and '\r\n' endings; the replace() call
    # strips a UTF-8 byte-order mark when the file is read as a byte string.
    lines = f.read().replace('\xef\xbb\xbf','').splitlines()

for line in lines:
    if not line.strip():
        # A blank line has no "<type> <question>" pair and line.index(' ')
        # would raise ValueError on an empty string.
        continue
    sep = line.index(' ')
    qt = line[:sep]
    # NOTE(review): the -1 drops the final character of every line,
    # presumably the trailing '?' - confirm against the data file.
    q = line[sep + 1 : -1]
    test_questions.append(clean_str(q))
    test_questions_type.append(answer_types[qt])

test_question = feature_extraction(test_questions,CV_Unigram,CV_UniPOS)
test_label = test_questions_type

# Reload the persisted model rather than reusing `model`, and close the file after reading.
with open('ATD_Model.pkl','rb') as model_file:
    model2 = pickle.load(model_file)
accuracy_score(test_label,model2.predict(test_question))

0.9137254901960784

In [10]:
from sklearn.metrics import confusion_matrix
# Rows are true labels and columns are predicted labels.
a = confusion_matrix(test_label, model2.predict(test_question))
# Parenthesised form prints the same text on Python 2 and Python 3.
print(a)

[[55  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1 75  0  0  0  3  0  1  1  0  0  0  0  0  0  0  0]
 [ 0  1  5  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  1  1  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  3  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 47  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  8  0  0  0  0  0  0  0  0  0  0]
 [ 2  0  2  0  0  0  0 10  0  0  0  2  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  2  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 10  0  0  0  0  0  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  5  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  4  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  2  0  0  0]
 [ 0  3  0  0  0  0  0  0  0  0  0  0  0  0  2  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0]
 [ 2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2]]


In [11]:
# Close the file object currently bound to `f` (last assigned in the testing cell).
f.close()