## Import Libraries

In [73]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

## Dataset Preparation

In [121]:
# load the dataset
data = open('data/corpus').read()
labels, texts = [], []
for i, line in enumerate(data.split("\n")):
    content = line.split()
    labels.append(content[0])
    texts.append(content[1:])

# create a dataframe using texts and lables
trainDF = pandas.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels

In [122]:
trainDF = trainDF.astype('str') 

In [123]:
trainDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
text     10000 non-null object
label    10000 non-null object
dtypes: object(2)
memory usage: 156.3+ KB


In [124]:
trainDF.head()

Unnamed: 0,text,label
0,"['Stuning', 'even', 'for', 'the', 'non-gamer:'...",__label__2
1,"['The', 'best', 'soundtrack', 'ever', 'to', 'a...",__label__2
2,"['Amazing!:', 'This', 'soundtrack', 'is', 'my'...",__label__2
3,"['Excellent', 'Soundtrack:', 'I', 'truly', 'li...",__label__2
4,"['Remember,', 'Pull', 'Your', 'Jaw', 'Off', 'T...",__label__2


In [125]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'], random_state=1708)

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

## Feature Engineering

#### 1. Count Vectors

In [126]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
        vocabulary=None)

In [127]:
# transform the training and validation data using count vectorizer object
xtrain_count = count_vect.transform(train_x)
xvalid_count = count_vect.transform(valid_x)

#### 2. TF-IDF Vectors

In [128]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf = tfidf_vect.transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)

In [129]:
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x)

In [130]:
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(valid_x) 

#### 3. Word Embeddings

In [39]:
# load the pre-trained word-embedding vectors 
embeddings_index = {}
for i, line in enumerate(open('data/wiki-news-300d-1M.vec')):
    values = line.split()
    embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(trainDF['text'])
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=70)

# create token-embedding mapping
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

FileNotFoundError: [Errno 2] No such file or directory: 'data/wiki-news-300d-1M.vec'

#### 4. Text / NLP based features

In [40]:
trainDF['char_count'] = trainDF['text'].apply(len)
trainDF['word_count'] = trainDF['text'].apply(lambda x: len(x.split()))
trainDF['word_density'] = trainDF['char_count'] / (trainDF['word_count']+1)
trainDF['punctuation_count'] = trainDF['text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation)))
trainDF['title_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
trainDF['upper_case_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [41]:
trainDF.head()

Unnamed: 0,text,label,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count
0,"['Stuning', 'even', 'for', 'the', 'non-gamer:'...",__label__2,667,80,8.234568,252,10,3
1,"['The', 'best', 'soundtrack', 'ever', 'to', 'a...",__label__2,801,97,8.173469,306,7,3
2,"['Amazing!:', 'This', 'soundtrack', 'is', 'my'...",__label__2,1148,129,8.830769,428,24,4
3,"['Excellent', 'Soundtrack:', 'I', 'truly', 'li...",__label__2,1098,118,9.226891,388,52,4
4,"['Remember,', 'Pull', 'Your', 'Jaw', 'Off', 'T...",__label__2,743,87,8.443182,284,30,0


In [42]:
# implement PoS (part of speech) tagging

pos_family = {
    'noun' : ['NN', 'NNS', 'NNP', 'NNPS'],
    'pron' : ['PRP', 'PRPS$', 'WP', 'WP$'],
    'verb' : ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
    'adj' : ['JJ', 'JJR', 'JJS'],
    'adv' : ['RB', 'RBR', 'RBS', 'WRB']
}

# function to check and get the PoS tag count of a words in a given sentences
def check_pos_tag(x, flag):
    count = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                count += 1
    
    except:
        pass
    
    return count

In [44]:
trainDF['noun_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'noun'))
trainDF['verb_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'verb'))
trainDF['adj_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adj'))
trainDF['adv_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adv'))
trainDF['pron_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'pron'))

In [45]:
trainDF.head()

Unnamed: 0,text,label,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,noun_count,verb_count,adj_count,adv_count,pron_count
0,"['Stuning', 'even', 'for', 'the', 'non-gamer:'...",__label__2,667,80,8.234568,252,10,3,47,4,2,1,1
1,"['The', 'best', 'soundtrack', 'ever', 'to', 'a...",__label__2,801,97,8.173469,306,7,3,66,9,3,2,6
2,"['Amazing!:', 'This', 'soundtrack', 'is', 'my'...",__label__2,1148,129,8.830769,428,24,4,80,7,2,7,4
3,"['Excellent', 'Soundtrack:', 'I', 'truly', 'li...",__label__2,1098,118,9.226891,388,52,4,65,6,8,0,2
4,"['Remember,', 'Pull', 'Your', 'Jaw', 'Off', 'T...",__label__2,743,87,8.443182,284,30,0,57,5,6,0,3


#### 5. Topic Models

In [51]:
# train a LDA model
lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
X_topics = lda_model.fit_transform(xtrain_count)
topic_word = lda_model.components_
vocab = count_vect.get_feature_names()

In [52]:
# view the topic models
n_top_words = 10
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = numpypy.array(vocab)[numpy.argsortopic_wordopic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))

## Machine Learning Model Building

In [53]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
        
    return metrics.accuracy_score(predictions, valid_y)

#### 1. Naive Bayes (NB)

In [131]:
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF-IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, WordLevel TF-IDF Vectors: ", accuracy)

# Naive Bayes on Ngram Level TF-IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("NB, N-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF-IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("NB, CharLevel Vectors: ", accuracy)

NB, Count Vectors:  0.8348
NB, WordLevel TF-IDF:  0.8468
NB, N-Gram Vectors:  0.844
NB, CharLevel Vectors:  0.812


#### 2 Linear Classifier (LC)

In [132]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print("LR, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF-IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print("LR, WordLevel TF-IDF Vectors: ", accuracy)

# Naive Bayes on Ngram Level TF-IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("LR, N-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF-IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("LR, CharLevel Vectors: ", accuracy)

LR, Count Vectors:  0.868
LR, WordLevel TF-IDF:  0.8784
LR, N-Gram Vectors:  0.846
LR, CharLevel Vectors:  0.824


#### 3. Support Vector Machine (SVM)

In [138]:
# SVM on Count Vectors
accuracy = train_model(svm.SVC(), xtrain_count, train_y, xvalid_count)
print("SVM, Count Vectors: ", accuracy)

# SVM on Word Level TF-IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf, train_y, xvalid_tfidf)
print("SVM, WordLevel TF-IDF Vectors: ", accuracy)

# SVM on Ngram Level TF-IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("SVM, N-Gram Vectors: ", accuracy)

# SVM on Character Level TF-IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("SVM, CharLevel Vectors: ", accuracy)

SVM, Count Vectors:  0.6944
SVM, WordLevel TF-IDF:  0.5132
SVM, N-Gram Vectors:  0.5132
SVM, CharLevel Vectors:  0.5132


#### 4. Bagging Model (RF)

In [139]:
# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print("RF, Count Vectors: ", accuracy)

# RF on Word Level TF-IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print("RF, WordLevel TF-IDF Vectors: ", accuracy)

# RF on Ngram Level TF-IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("RF, N-Gram Vectors: ", accuracy)

# RF on Character Level TF-IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("RF, CharLevel Vectors: ", accuracy)

RF, Count Vectors:  0.7476
RF, WordLevel TF-IDF:  0.7672
RF, N-Gram Vectors:  0.7592
RF, CharLevel Vectors:  0.69


#### 5. Boosting Model (XGB)

In [140]:
# Extreme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print("XGB, Count Vectors: ", accuracy)

# Extereme Gradient Boosting on Word Level TF-IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print("XGB, WordLevel TF-IDF Vectors: ", accuracy)

# Extereme Gradient Boosting on Ngram Level TF-IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("XGB, N-Gram Vectors: ", accuracy)

# Extereme Gradient Boosting on Character Level TF-IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram_chars.tocsc(), train_y, xvalid_tfidf_ngram_chars.tocsc())
print("XGB, CharLevel Vectors: ", accuracy)

  if diff:


XGB, Count Vectors:  0.788


  if diff:


XGB, WordLevel TF-IDF:  0.7916


  if diff:


XGB, N-Gram Vectors:  0.7404
XGB, CharLevel Vectors:  0.8108


  if diff:


## Neural Network Model Learning

#### 1. Shallow Neural Networks

In [145]:
def create_model_architecture(input_size):
    # create input layer 
    input_layer = layers.Input((input_size, ), sparse=True)
    
    # create hidden layer
    hidden_layer = layers.Dense(100, activation="relu")(input_layer)
    
    # create output layer
    output_layer = layers.Dense(1, activation="sigmoid")(hidden_layer)

    classifier = models.Model(inputs = input_layer, outputs = output_layer)
    classifier.compile(optimizer = optimizers.Adam(), loss = 'binary_crossentropy', ep)
    
    return classifier

In [160]:
# Shallow Neural Network on Count Vectors
snn_classifier = create_model_architecture(xtrain_count.shape[1])
accuracy = train_model(snn_classifier, xtrain_count, train_y, xvalid_count, is_neural_net=True)
print("SNN, Count Vectors: ", accuracy)

# Shallow Neural Network on Word Level TF-IDF Vectors
snn_classifier = create_model_architecture(xtrain_tfidf.shape[1])
accuracy = train_model(snn_classifier, xtrain_tfidf, train_y, xvalid_tfidf, is_neural_net=True)
print("SNN, Word Level TF IDF Vectors",  accuracy)

# Shallow Neural Network on Ngram Level TF-IDF Vectors
snn_classifier = create_model_architecture(xtrain_tfidf_ngram.shape[1])
accuracy = train_model(snn_classifier, xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, is_neural_net=True)
print("SNN, Ngram Level TF IDF Vectors",  accuracy)

# Shallow Neural Network on Character Level TF-IDF Vectors
snn_classifier = create_model_architecture(xtrain_tfidf_ngram_chars.shape[1])
accuracy = train_model(snn_classifier, xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars, is_neural_net=True)
print("SNN, CharLevel Vectors: ", accuracy)

Epoch 1/1
SNN, Count Vectors:  0.5132
Epoch 1/1
SNN, Word Level TF IDF Vectors 0.5132
Epoch 1/1
SNN, Ngram Level TF IDF Vectors 0.5132
Epoch 1/1
SNN, CharLevel Vectors:  0.5132


#### 2. Deep Neural Networks

##### Convolutional Neural Networks

In [161]:
def create_cnn():
    pass

##### Recurrent Neural Network – LSTM

In [162]:
def create_rnn_lstm():
    pass

##### Recurrent Neural Network – GRU

In [163]:
def create_rnn_gru():
    pass

##### Bidirectional RNN

In [164]:
def create_bidirectional_rnn():
    pass

##### Recurrent Convolutional Neural Network

In [165]:
def create_rcnn():
    pass

<hr/>