In [1]:
import os, sys, itertools

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, make_scorer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_validate

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from nltk import pos_tag
from nltk.tokenize import TweetTokenizer, word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer, SnowballStemmer

from autocorrect import spell

In [2]:
import xml.etree.ElementTree as ET

class XMLParser:
    """Parser assumes the first level of xml tags are to be transformed
    to rows in a Pandas dataframe. For each of the first-level tags it takes
    all of their subtags and attributes, and puts them as columns to
    the current row in the dataframe."""

    def __init__(self, xml_data):
        self.root = ET.XML(xml_data)

    def parse_root(self, root):
        """Return a list of dictionaries from the text
         and attributes of the children under this XML root."""
        return [self.parse_element(child) for child in iter(root)]

    def parse_element(self, element, parsed=None):
        """ Collect {key:attribute} and {tag:text} from the XML
         element and all its children into a single dictionary of strings."""

        if parsed is None:
            parsed = dict()

        if element.tag == "RelComment":
            return parsed

        for key in element.keys():
            if key not in parsed:
                parsed[key] = element.attrib.get(key)
            else:
                raise ValueError('Duplicate attribute {0} = {1}, prev = {2}' \
                                 .format(key,
                                         element.attrib.get(key),
                                         parsed[key]))

        for child in iter(element):
            if child.tag == "RelQSubject" or child.tag == "RelQBody":
                parsed[child.tag] = child.text
            else:
                self.parse_element(child, parsed)

        return parsed

    def to_df(self):
        """ Initiate the root XML, parse it, and return a dataframe"""
        structure_data = self.parse_root(self.root)
        return pd.DataFrame(structure_data)


In [99]:
def df_from_xml_file(filename):
    with open(filename, 'r') as content_file:
        content = content_file.read()

    xml = XMLParser(content)
    xml_df = xml.to_df()
    return xml_df

def make_vocab(docs, tokenizer):
    vocab = set()
    for doc in docs:
        tokenized_doc = tokenizer.tokenize(doc)
        vocab.update(tokenized_doc)
    return list(vocab)


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          fig_size=(10, 7)):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    #print(cm)

    plt.figure(figsize=fig_size)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()


def make_data(unstuctured_df, tmap):
    df = pd.DataFrame({
        'id': unstuctured_df.THREAD_SEQUENCE,
        'subject': unstuctured_df.RelQSubject,
        'question': unstuctured_df.RelQBody,
        'type': unstuctured_df.RELQ_FACT_LABEL,
    })
    df.question = df.question.fillna("")
    df['question_and_subj'] = df.subject + " "  + df.question
    
    if tmap == None:
        return df.question, None, df

    y = df.type.transform(lambda z: tmap[z])
    return df.question_and_subj, y, df

def save_submission(filename, predict_df):
    """Save predictions from given dataframe as expected from the grader"""
    with open(filename, 'w') as f:
        for i in range(predict_df.shape[0]):
            f.write("{}\t{}\n".format(predict_df.id[i], predict_df.pred[i]))

In [105]:
import re
from autocorrect import spell

tokenizer = TweetTokenizer()
stemmer = SnowballStemmer('english')

class Word:
    def __init__(self, word, tag):
        self.word = word
        self.tag = tag
    
    def __hash__(self):
        return hash((self.word, self.tag))
    
    def __lt__(self, other):
        return self.word < other.word

    def __eq__(self, other):
        return self.word == other.word and self.tag == other.tag


len_stops = set(stopwords.words('english'))
stops = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

# def tweet_tokenize(doc):
#     return np.array([stemmer.stem(word) for word in tokenizer.tokenize(doc)])

# def pos_tag_tokenize(doc):
#     sentances = [word_tokenize(sent) for sent in sent_tokenize(doc)]
#     tagged = pos_tag([stemmer.stem(word) for sent in sentances for word in sent])
#     return np.array([ Word(word, tag) for word, tag in tagged])

def transform(word):
    no_num = re.sub(r'[0-9,.]*[0-9]', '__NUMBER__', word)
    no_url = re.sub(r'https?://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '__URL__', no_num)
    punct = re.sub(r'([,.!?;:-])+', '\1', no_url)
    return stemmer.stem(punct)

# def transform_no_stem(word):
#     no_num = re.sub(r'[0-9,.]*[0-9]', '__NUMBER__', word)
#     no_url = re.sub(r'https?://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '__URL__', no_num)
#     punct = re.sub(r'([,.!?;:-])+', '\1', no_url)
#     return punct

# def transform_lemmatize(word):
#     no_num = re.sub(r'[0-9,.]*[0-9]', '__NUMBER__', word)
#     no_url = re.sub(r'https?://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '__URL__', no_num)
#     punct = re.sub(r'([,.!?;:-])+', '\1', no_url)
#     return lemmatizer.lemmatize(punct)

def tokenize(doc):
    sentances = [TweetTokenizer().tokenize(sent) for sent in sent_tokenize(doc)]
    return np.array([transform(word.lower()) for sent in sentances for word in sent])

In [106]:
question_train_filename = 'questions_train.xml'
types_map = { 'Opinion': 0, 'Factual': 1, 'Socializing': 2}

question_train_df = df_from_xml_file(question_train_filename)
x, y, df = make_data(question_train_df, types_map)

In [107]:
# pd.set_option('max_colwidth', 150)
pd.set_option('display.max_colwidth', 0)


In [108]:
clf = Pipeline([
    ('vectorizer',  TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 2))),
    ('classifier',  SVC(C=1, kernel='linear')),
])

In [109]:
cross_validate(clf, x, y, cv=5, scoring='accuracy', return_train_score=True)

{'fit_time': array([1.83399677, 1.80956006, 1.93018913, 2.08542609, 2.04704475]),
 'score_time': array([0.41421986, 0.44018888, 0.39433599, 0.48281193, 0.4853754 ]),
 'test_score': array([0.64444444, 0.6875    , 0.63839286, 0.60538117, 0.72522523]),
 'train_score': array([0.99552072, 0.99105145, 0.98993289, 0.99329609, 0.9921875 ])}

In [112]:
import numpy as np
import pandas as pd
import csv
# from tqdm import tqdm

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM, Dropout
from keras.models import Model
from keras.regularizers import l1, l2
from keras.initializers import Constant, RandomNormal, RandomUniform
from keras.metrics import categorical_accuracy
from keras.optimizers import Adam

Using TensorFlow backend.


In [110]:
MAX_NUM_WORDS=3000
EMBEDDING_DIM=100
MAX_SEQUENCE_LENGTH=120

In [115]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(x)
sequences = tokenizer.texts_to_sequences(x)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 6145 unique tokens.


In [116]:
data

array([[   0,    0,    0, ..., 1222,    4,   11],
       [   0,    0,    0, ..., 1976,   20, 2950],
       [   0,    0,    0, ..., 1225,  411,  131],
       ...,
       [   0,    0,    0, ...,    9,  681,  483],
       [   0,    0,    0, ..., 1526,  124,   12],
       [   0,    0,    0, ...,    9,    3,  871]], dtype=int32)

In [120]:
tokenizer.word_index

{'i': 1,
 'to': 2,
 'the': 3,
 'in': 4,
 'a': 5,
 'and': 6,
 'is': 7,
 'for': 8,
 'of': 9,
 'my': 10,
 'qatar': 11,
 'you': 12,
 'it': 13,
 'can': 14,
 'have': 15,
 'me': 16,
 'doha': 17,
 'this': 18,
 'are': 19,
 'any': 20,
 'what': 21,
 'that': 22,
 'on': 23,
 'or': 24,
 'with': 25,
 'do': 26,
 'but': 27,
 'know': 28,
 'there': 29,
 'from': 30,
 'be': 31,
 'if': 32,
 'anyone': 33,
 'not': 34,
 'am': 35,
 'thanks': 36,
 'please': 37,
 'how': 38,
 'will': 39,
 'here': 40,
 'visa': 41,
 'where': 42,
 'so': 43,
 'get': 44,
 'all': 45,
 'one': 46,
 'good': 47,
 'as': 48,
 'just': 49,
 'we': 50,
 'about': 51,
 'they': 52,
 'your': 53,
 'at': 54,
 'would': 55,
 'hi': 56,
 'some': 57,
 'like': 58,
 'best': 59,
 'help': 60,
 'was': 61,
 'need': 62,
 'an': 63,
 'does': 64,
 'which': 65,
 'school': 66,
 'want': 67,
 'car': 68,
 "i'm": 69,
 'out': 70,
 'now': 71,
 'who': 72,
 'go': 73,
 'been': 74,
 'time': 75,
 'new': 76,
 'company': 77,
 '2': 78,
 'no': 79,
 'family': 80,
 'people': 81,
 'find

In [117]:
labels = to_categorical(y)

print('Labels:', labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Labels: [[0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]]
Shape of data tensor: (1118, 120)
Shape of label tensor: (1118, 3)


In [119]:
print('Indexing word vectors.')

embeddings_index = {}
with open('./glove.twitter.27B.100d.txt') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 1193515 word vectors.


In [131]:
embeddings_index['in']

array([-0.091552,  0.55193 ,  0.6166  ,  0.52463 ,  0.58652 ,  0.20113 ,
       -0.55163 , -0.30853 ,  0.24079 ,  0.059981,  0.27608 ,  0.56519 ,
       -5.5562  , -0.036084, -0.17879 , -0.90797 , -0.13528 ,  0.55302 ,
       -1.5412  , -0.27153 ,  0.045949, -0.27613 , -0.031604,  0.29257 ,
        0.17782 , -0.45742 , -0.17257 , -0.25651 , -0.7679  ,  0.31588 ,
       -0.36951 ,  0.57035 , -0.15254 ,  0.42258 ,  0.81202 , -0.15159 ,
        0.39439 ,  0.41327 , -0.19144 , -0.45827 , -1.5744  , -0.25215 ,
       -0.62427 ,  0.11973 ,  0.35804 , -0.037619, -0.07418 ,  0.38664 ,
       -0.41619 ,  0.10846 , -0.066371, -0.62441 , -0.16464 ,  0.28154 ,
       -0.47376 , -0.54616 , -0.1153  , -1.0872  , -0.32708 ,  0.17912 ,
       -0.81835 ,  0.45268 , -0.71417 , -0.2947  ,  0.036828, -0.32437 ,
        0.22164 , -0.46105 ,  0.24221 , -0.18038 , -0.073568,  0.07334 ,
        0.011495, -0.050368, -0.010352,  0.39953 , -1.0918  ,  0.074331,
        0.69542 , -0.28939 ,  1.7249  , -0.46104 , 

In [125]:
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [130]:
embedding_matrix[4]

array([-0.091552  ,  0.55193001,  0.61659998,  0.52463001,  0.58652002,
        0.20113   , -0.55163002, -0.30853   ,  0.24078999,  0.059981  ,
        0.27608001,  0.56519002, -5.55620003, -0.036084  , -0.17879   ,
       -0.90797001, -0.13528   ,  0.55302   , -1.54120004, -0.27153   ,
        0.045949  , -0.27612999, -0.031604  ,  0.29256999,  0.17782   ,
       -0.45741999, -0.17257001, -0.25650999, -0.76789999,  0.31588   ,
       -0.36950999,  0.57034999, -0.15254   ,  0.42258   ,  0.81202   ,
       -0.15159   ,  0.39438999,  0.41327   , -0.19144   , -0.45827001,
       -1.57439995, -0.25215   , -0.62427002,  0.11973   ,  0.35804   ,
       -0.037619  , -0.07418   ,  0.38664001, -0.41619   ,  0.10846   ,
       -0.066371  , -0.62440997, -0.16463999,  0.28154001, -0.47376001,
       -0.54615998, -0.1153    , -1.08720005, -0.32708001,  0.17912   ,
       -0.81835002,  0.45267999, -0.71416998, -0.2947    ,  0.036828  ,
       -0.32437   ,  0.22164001, -0.46105   ,  0.24221   , -0.18

In [164]:
means = np.array()
for question in data:
    token_embeddings = []
    for token in question:
        if (embedding_matrix[token] == np.zeros(100,)).all() == False:
#             print(embedding_matrix[token])
            token_embeddings.append(embedding_matrix[token])
    means.append(np.mean(token_embeddings, axis=0))

TypeError: Required argument 'object' (pos 1) not found

In [170]:
x = np.array(means)

In [172]:
x

array([[-0.21315445, -0.00506486, -0.07779886, ...,  0.01757114,
         0.18729528,  0.08796043],
       [ 0.17079838,  0.21765135, -0.04017672, ..., -0.06864561,
         0.19852009,  0.00193353],
       [ 0.04366518,  0.15912814,  0.10475395, ..., -0.07309307,
         0.17919158, -0.02207311],
       ...,
       [ 0.25561467,  0.14070878, -0.04334215, ...,  0.06570112,
         0.30671034,  0.22353886],
       [ 0.07406755,  0.00770749,  0.14408441, ..., -0.13346674,
         0.29477677,  0.07380002],
       [ 0.09447041,  0.03711973,  0.05677721, ..., -0.14254261,
         0.26877032,  0.13957107]])

In [175]:
y

0       1
1       0
2       0
3       0
4       0
5       2
6       0
7       1
8       0
9       1
10      1
11      1
12      1
13      1
14      0
15      1
16      0
17      1
18      1
19      0
20      1
21      0
22      2
23      0
24      0
25      0
26      0
27      2
28      2
29      0
       ..
1088    0
1089    0
1090    0
1091    0
1092    0
1093    0
1094    1
1095    0
1096    0
1097    0
1098    0
1099    2
1100    2
1101    2
1102    0
1103    2
1104    1
1105    2
1106    0
1107    1
1108    0
1109    2
1110    2
1111    2
1112    2
1113    2
1114    2
1115    2
1116    0
1117    0
Name: type, Length: 1118, dtype: int64

In [176]:
x.shape, y.shape

((1118, 100), (1118,))

In [200]:
svc = SVC(C=100, kernel='linear')

In [201]:
cross_validate(svc, x, y, cv=5, scoring='accuracy', return_train_score=True)

{'fit_time': array([0.62037921, 0.83154464, 1.16482401, 0.60248017, 1.2147069 ]),
 'score_time': array([0.0121038 , 0.01271915, 0.01145983, 0.01214004, 0.01291704]),
 'test_score': array([0.60888889, 0.62053571, 0.59375   , 0.50672646, 0.57657658]),
 'train_score': array([0.74804031, 0.75615213, 0.76957494, 0.77541899, 0.75669643])}

:(

In [194]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators = 100, max_depth = 50)

In [195]:
cross_validate(random_forest, x, y, cv=5, scoring='accuracy', return_train_score=True)

{'fit_time': array([0.51194501, 0.49273705, 0.48470807, 0.49082994, 0.46658206]),
 'score_time': array([0.009027  , 0.01057601, 0.01010299, 0.00852013, 0.00801897]),
 'test_score': array([0.64      , 0.61160714, 0.58035714, 0.53811659, 0.63513514]),
 'train_score': array([1., 1., 1., 1., 1.])}