In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sys, io, os, errno, fileinput, csv
import collections as cl
from os import path

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn import metrics
# Input corpus (tab-separated; later cells read its 'Content' and 'Category'
# columns) and the output path for the final metrics table.
trainset_file = "./datasets/train_set.csv"
EvaluationMetric_10fold_csv = "./datasets/EvaluationMetric_10fold.csv"
train_df = pd.read_csv(trainset_file, sep='\t')
# train_df = train_df.sample(frac=1) # SHUFFLE

from sklearn.model_selection import cross_val_score

# Whole-corpus term counts, then their tf-idf re-weighting.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_df['Content'])

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Each evaluated model appends one [accuracy, precision, recall, f1] row here.
EvaluationMetric_10fold = []

__Training - Testing Functions__

In [5]:
def ten_fold_cross_validation(model, model_name, data=None, n_folds=10):
    """Estimate accuracy with contiguous (unshuffled) k-fold cross validation.

    The rows of ``data`` are cut into ``n_folds`` consecutive slices of
    ``len(data) // n_folds`` rows; the last slice also takes the remainder so
    every row is validated exactly once.  For each slice the model is refit on
    all other rows and scored on the slice.

    Args:
        model: estimator/pipeline exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: label inserted into the returned message.
        data: DataFrame with 'Content' and 'Category' columns.  Defaults to
            the notebook-level ``train_df``.
        n_folds: number of folds (default 10).

    Returns:
        str: '<10fold> Accuracy Score for <model_name> is <mean accuracy>'.

    Raises:
        ValueError: if ``n_folds`` is < 1 or there are fewer rows than folds.
    """
    if data is None:
        data = train_df

    n_rows = len(data)
    if n_folds < 1 or n_rows < n_folds:
        raise ValueError(
            'need at least one row per fold (rows=%d, folds=%d)' % (n_rows, n_folds))

    fold_size = n_rows // n_folds
    total_accuracy = 0.0
    for fold in range(n_folds):
        from_ = fold * fold_size
        # The last fold runs to the end of the data instead of a hard-coded
        # row count, so the function works for any dataset size.
        to_ = n_rows if fold == n_folds - 1 else from_ + fold_size

        validation = data.iloc[from_:to_]
        train_set = data.drop(data.index[from_:to_])

        model.fit(train_set['Content'], train_set['Category'])
        predicted = model.predict(validation['Content'])
        total_accuracy += np.mean(predicted == validation['Category'])

    return '<10fold> Accuracy Score for ' + model_name + ' is ' + str(total_accuracy / n_folds)

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

def ten_fold_cross_validation_metrics(model, model_name, data=None, n_folds=10):
    """Contiguous k-fold cross validation reporting four averaged metrics.

    The rows of ``data`` are cut into ``n_folds`` consecutive slices of
    ``len(data) // n_folds`` rows (the last slice also takes the remainder).
    For each slice the model is refit on the remaining rows and scored on the
    slice; accuracy and macro-averaged precision/recall/F1 are averaged over
    the folds.  A progress line is printed after every fold.

    Args:
        model: estimator/pipeline exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: accepted for symmetry with the other helpers; not used.
        data: DataFrame with 'Content' and 'Category' columns.  Defaults to
            the notebook-level ``train_df``.
        n_folds: number of folds (default 10).

    Returns:
        list: a one-element list ``[[accuracy, recall, precision, f1]]``
        (note: recall comes before precision).

    Raises:
        ValueError: if ``n_folds`` is < 1 or there are fewer rows than folds.
    """
    if data is None:
        data = train_df

    n_rows = len(data)
    if n_folds < 1 or n_rows < n_folds:
        raise ValueError(
            'need at least one row per fold (rows=%d, folds=%d)' % (n_rows, n_folds))

    fold_size = n_rows // n_folds

    acc = 0.0
    precision = 0.0
    recall = 0.0
    f1_sc = 0.0

    for i in range(n_folds):
        from_ = i * fold_size
        # The last fold runs to the end of the data instead of a hard-coded
        # row count, so the function works for any dataset size.
        to_ = n_rows if i == n_folds - 1 else from_ + fold_size

        validation = data.iloc[from_:to_]
        test_X = validation['Content']
        test_Y = validation['Category']
        train_set = data.drop(data.index[from_:to_])

        model.fit(train_set['Content'], train_set['Category'])
        predicted = model.predict(test_X)

        precision += precision_score(test_Y, predicted, average='macro')
        recall += recall_score(test_Y, predicted, average='macro')
        acc += np.mean(predicted == test_Y)
        f1_sc += f1_score(test_Y, predicted, average='macro')
        print('10fcv - Iteration',str(i+1))

    return [[acc / n_folds, recall / n_folds, precision / n_folds, f1_sc / n_folds]]



def simple_split(model, model_name, data=None, test_fraction=1/3):
    """Score a model on a single ordered hold-out split.

    The last ``round(len(data) * test_fraction)`` rows are held out for
    testing and the model is fit on all rows before them.  With the default
    fraction a 12266-row frame is split at row 8177, as before.

    Args:
        model: estimator/pipeline exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: label inserted into the returned message.
        data: DataFrame with 'Content' and 'Category' columns.  Defaults to
            the notebook-level ``train_df``.
        test_fraction: share of rows (taken from the end) used for testing.

    Returns:
        str: '<split> Accuracy Score for <model_name> is <accuracy>'.

    Raises:
        ValueError: if the split would leave the train or test part empty.
    """
    if data is None:
        data = train_df

    n_rows = len(data)
    n_test = int(round(n_rows * test_fraction))
    if not 0 < n_test < n_rows:
        raise ValueError(
            'split leaves an empty train or test set (rows=%d, test rows=%d)' % (n_rows, n_test))
    split_at = n_rows - n_test

    train_part = data.iloc[:split_at]
    test_part = data.iloc[split_at:]

    model.fit(train_part['Content'], train_part['Category'])
    predicted = model.predict(test_part['Content'])
    accuracy = np.mean(predicted == test_part['Category'])
    return '<split> Accuracy Score for ' + model_name + ' is ' + str(accuracy)


__Linear SVM__ with __Bag of Words__ features (raw term counts; the Tf-Idf step is commented out in the pipeline below).

In [6]:
from sklearn import svm
from sklearn.feature_extraction.text import HashingVectorizer

""" 
    kernel: linear
    misclassification penalty factor C: 1.0
    Multiple Class Approach decision function: ovo (one versus one ) || ovr (one versus rest)
    1) linear kernel
    2) Radial Basis Function (RBF) kernel: set gamma value (e.g. 2^-4)
"""

# Raw term counts fed straight into a linear-kernel SVC; the hashing and
# tf-idf steps remain disabled.
svm_bow = Pipeline(steps=[
    # ('hash_vect', HashingVectorizer()),
    # ('tfidf', TfidfTransformer()),
    ('vect', CountVectorizer(analyzer='word', lowercase=True)),
    ('svm', svm.SVC(kernel='linear', C=1.0, decision_function_shape='ovr')),
])

# print(simple_split(svm_bow, 'Linear SVM with BoW'))

# The helper returns [[accuracy, recall, precision, f1]]; the shared table
# stores each row as [accuracy, precision, recall, f1].
svm_bow_res = ten_fold_cross_validation_metrics(svm_bow, 'Linear SVM with BoW')
EvaluationMetric_10fold.append([svm_bow_res[0][pos] for pos in (0, 2, 1, 3)])
print(
    "Linear SVM with BoW:",
    "\n Accuracy: ", svm_bow_res[0][0],
    "\n Recall: ", svm_bow_res[0][1],
    "\n Precision: ", svm_bow_res[0][2],
    "\n F1-score: ", svm_bow_res[0][3],
)

10fcv - Iteration 1
10fcv - Iteration 2
10fcv - Iteration 3
10fcv - Iteration 4
10fcv - Iteration 5
10fcv - Iteration 6
10fcv - Iteration 7
10fcv - Iteration 8
10fcv - Iteration 9
10fcv - Iteration 10
Linear SVM with BoW: 
 Accuracy:  0.9520665610898073 
 Recall:  0.948442896140975 
 Precision:  0.9498583181168814 
 F1-score:  0.9489486170952942


__Random Forest__ with __Bag of Words__ features (raw term counts; the pipeline below has no Tf-Idf step).

In [7]:
from sklearn.ensemble import RandomForestClassifier

""" 
    number of decision trees: 240
    max_depth in tree: 30
"""
# Raw term counts fed into a 240-tree random forest (depth capped at 30).
random_forest_bow = Pipeline(steps=[
    ('vect', CountVectorizer(analyzer='word', lowercase=True)),
    ('random_forest', RandomForestClassifier(n_estimators=240, max_depth=30, random_state=0)),
])

# print(ten_fold_cross_validation(text_random_forest, 'Random Forest with BoW'))

# The helper returns [[accuracy, recall, precision, f1]]; the shared table
# stores each row as [accuracy, precision, recall, f1].
random_forest_bow_res = ten_fold_cross_validation_metrics(random_forest_bow, 'Random Forest with BoW')
EvaluationMetric_10fold.append([random_forest_bow_res[0][pos] for pos in (0, 2, 1, 3)])

print(
    "Random Forest with BoW:",
    "\n Accuracy: ", random_forest_bow_res[0][0],
    "\n Recall: ", random_forest_bow_res[0][1],
    "\n Precision: ", random_forest_bow_res[0][2],
    "\n F1-score: ", random_forest_bow_res[0][3],
)

10fcv - Iteration 1
10fcv - Iteration 2
10fcv - Iteration 3
10fcv - Iteration 4
10fcv - Iteration 5
10fcv - Iteration 6
10fcv - Iteration 7
10fcv - Iteration 8
10fcv - Iteration 9
10fcv - Iteration 10
Random Forest with BoW: 
 Accuracy:  0.9439095570009111 
 Recall:  0.9319728706772252 
 Precision:  0.9439764026613858 
 F1-score:  0.9365171085309854


__Compute SVD Output Vector Size for Variance > 90%__

In [8]:
from sklearn.decomposition import TruncatedSVD


def find_svd_feature_size(var_ratio, threshold=0.9):
    """Count the leading components needed to reach a cumulative variance.

    Walks ``var_ratio`` in order, summing the explained-variance ratios, and
    stops at the first component whose running total is >= ``threshold``.

    Args:
        var_ratio: iterable of per-component explained-variance ratios, in
            the order the components should be kept.
        threshold: cumulative variance to reach (default 0.9, i.e. 90%).

    Returns:
        int: number of components consumed.  If the threshold is never
        reached this is the total number of components (0 for empty input).
    """
    total_variance = 0.0
    n_components = 0

    for n_components, explained_variance in enumerate(var_ratio, start=1):
        total_variance += explained_variance
        if total_variance >= threshold:
            break

    return n_components

"""
Start by using SVD on the term vector (size 85747), with n_components 2000 (too many),
in order to find the n_components value in order to have variance >= 90%

"""


# Disabled: fit a 2000-component SVD on the count matrix and derive the
# component count from its explained-variance ratios.
# tsvd = TruncatedSVD(n_components=2000)
# X_tsvd = tsvd.fit(X_train_counts)

# tsvd_var_ratios = tsvd.explained_variance_ratio_ # List of explained variances
# component_num = find_svd_feature_size(tsvd_var_ratios)

# Hard-coded stand-in used by every SVD pipeline below.
# NOTE(review): presumably the value an earlier run of the disabled code produced — confirm.
component_num = 593 # TEMP

__Linear SVM__ using __SVD__

In [9]:
# Term counts -> truncated SVD (component_num dimensions) -> linear-kernel SVC.
svm_svd = Pipeline(steps=[
    ('vect', CountVectorizer(analyzer='word')),
    ('svd', TruncatedSVD(n_components=component_num)),
    ('svm', svm.SVC(kernel='linear', C=1.0, decision_function_shape='ovr')),
])

# print(simple_split(svm_svd, 'Linear SVM with SVD'))
svm_svd_res = ten_fold_cross_validation_metrics(svm_svd, 'Linear SVM with SVD')

# Stored as [accuracy, precision, recall, f1]; the helper returns recall first.
EvaluationMetric_10fold.append([svm_svd_res[0][pos] for pos in (0, 2, 1, 3)])

print(
    "Linear SVM with SVD:",
    "\n Accuracy: ", svm_svd_res[0][0],
    "\n Recall: ", svm_svd_res[0][1],
    "\n Precision: ", svm_svd_res[0][2],
    "\n F1-score: ", svm_svd_res[0][3],
)

10fcv - Iteration 1
10fcv - Iteration 2
10fcv - Iteration 3
10fcv - Iteration 4
10fcv - Iteration 5
10fcv - Iteration 6
10fcv - Iteration 7
10fcv - Iteration 8
10fcv - Iteration 9
10fcv - Iteration 10
Linear SVM with SVD: 
 Accuracy:  0.9429311614160717 
 Recall:  0.9376118460559901 
 Precision:  0.9408134414645571 
 F1-score:  0.9390072441882031


__Random Forest__ using __SVD__

In [10]:
from sklearn.ensemble import RandomForestClassifier


# Term counts -> truncated SVD (component_num dimensions) -> random forest.
random_forest_svd = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('svd', TruncatedSVD(n_components=component_num)),
    ('random_forest', RandomForestClassifier(n_estimators=240, max_depth=30, random_state=0)),
])

# print(simple_split(random_forest_bow, 'Random Forest with SVD'))

random_forest_svd_res = ten_fold_cross_validation_metrics(random_forest_svd, 'Random Forest with SVD')

# Stored as [accuracy, precision, recall, f1]; the helper returns recall first.
EvaluationMetric_10fold.append([random_forest_svd_res[0][pos] for pos in (0, 2, 1, 3)])
print(
    "Random Forest with SVD:",
    "\n Accuracy: ", random_forest_svd_res[0][0],
    "\n Recall: ", random_forest_svd_res[0][1],
    "\n Precision: ", random_forest_svd_res[0][2],
    "\n F1-score: ", random_forest_svd_res[0][3],
)

10fcv - Iteration 1
10fcv - Iteration 2
10fcv - Iteration 3
10fcv - Iteration 4
10fcv - Iteration 5
10fcv - Iteration 6
10fcv - Iteration 7
10fcv - Iteration 8
10fcv - Iteration 9
10fcv - Iteration 10
Random Forest with SVD: 
 Accuracy:  0.8663811412893793 
 Recall:  0.8389792489473311 
 Precision:  0.8795860019576922 
 F1-score:  0.8490384492737124


__W2V (Doc -> Sentence -> Word) TESTING__

In [12]:
import nltk.data
import re
# The tokenizer below is loaded from NLTK's 'punkt' resource; un-comment the
# download line once if that resource is not installed locally.
# nltk.download('punkt')
# English Punkt model, used by doc_to_sentences() to split documents into sentences.
tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
from gensim.models import Word2Vec
# from gensim.models import word2vec
from gensim.models import Phrases


def sentence_to_wordlist(sentence, remove_stopwords=False):
    """Turn one sentence into a list of lower-cased tokens.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is deleted, the remainder is lower-cased and
    split on whitespace.  ``remove_stopwords`` is accepted but not used.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', sentence)
    return without_punctuation.lower().split()


def doc_to_sentences(doc, tokenizer, remove_stopwords=False):
    """Split a document into sentences, each returned as a list of words.

    Args:
        doc: document text.  Anything that is not a ``str`` (e.g. a missing
            value) yields an empty list instead of printing 'nope' and
            returning ``None``, so callers can always extend a list with the
            result.
        tokenizer: object whose ``tokenize(text)`` returns the sentences of
            ``text`` as strings.
        remove_stopwords: accepted for interface compatibility; not used.

    Returns:
        list[list[str]]: one word list per non-empty sentence.
    """
    if not isinstance(doc, str):
        return []

    sentences = []
    for raw_sentence in tokenizer.tokenize(doc.strip()):
        # Skip empty sentences the tokenizer may emit.
        if raw_sentence:
            sentences.append(sentence_to_wordlist(raw_sentence))
    return sentences


doc_list = train_df['Content'].tolist()
sentences = []
skipped_docs = 0  # documents that could not be split into sentences

for doc in doc_list:
    # Need to first change "./." to "." so that sentences parse correctly
    # oped = doc.replace("/.", '')
    try:
        doc_sentences = doc_to_sentences(doc, tokenizer)
    except Exception:
        # Best effort: one unparsable document must not abort the whole run,
        # but it is counted instead of being silently dropped.
        doc_sentences = None

    if doc_sentences is None:
        skipped_docs += 1
        continue
    sentences.extend(doc_sentences)


print("There are " + str(len(sentences)) + " sentences in our corpus.")
if skipped_docs:
    print("Skipped " + str(skipped_docs) + " documents that could not be parsed.")

# Word2Vec hyper-parameters for the sentence-level model.
num_features = 300      # dimensionality of each word vector
min_word_count = 50     # words seen fewer times than this are dropped
num_workers = 4         # parallel training threads
context = 6             # context window size
downsampling = 1e-3     # down-sampling setting for frequent words

w2v_model = Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# Plain dict from vocabulary entry to its vector.  The same name `w2v` is also
# assigned in the "W2V Functions: Vectorizer" cell; whichever cell ran last wins.
w2v = dict(zip(w2v_model.wv.index2word, w2v_model.wv.syn0))

# Sanity check of the embedding: nearest neighbours of one word.
w2v_model.most_similar('youtube',  topn=10)

# w2v_model['google']

There are 379003 sentences in our corpus.




[('videos', 0.7882595062255859),
 ('channels', 0.7623605728149414),
 ('spotify', 0.7340512871742249),
 ('facebook', 0.7319403886795044),
 ('video', 0.7314403057098389),
 ('minecraft', 0.7295858263969421),
 ('content', 0.7102934718132019),
 ('ads', 0.7091004848480225),
 ('twitch', 0.7027051448822021),
 ('adverts', 0.6964372992515564)]

__W2V Functions: Vectorizer__

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from gensim.models import Word2Vec
import nltk

class MeanEmbeddingVectorizer(object):
    """Represent each document as the mean of its known word vectors.

    Args:
        word2vec: mapping from token to a 1-D vector; all vectors are
            expected to share one length.

    Raises:
        ValueError: if ``word2vec`` is empty.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Dimensionality comes from a vector (a value of the mapping).  The
        # previous len(next(iter(word2vec.items()))) measured a (key, value)
        # pair and was therefore always 2.
        first_vector = next(iter(word2vec.values()), None)
        if first_vector is None:
            raise ValueError('word2vec must contain at least one word vector')
        self.dim = len(first_vector)

    def fit(self, X, y=None):
        """Nothing to learn; present so the class can be a pipeline step."""
        return self

    def transform(self, X):
        """Return an array with one row per element of ``X``.

        Each element is iterated token by token; tokens missing from the
        mapping are ignored.  An element with no known token becomes a zero
        vector of length ``self.dim``.
        """
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

class TfidfEmbeddingVectorizer(object):
    """Represent each document as the mean of its idf-weighted word vectors.

    Args:
        word2vec: mapping from token to a 1-D vector; all vectors are
            expected to share one length.

    Raises:
        ValueError: if ``word2vec`` is empty.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        # Dimensionality comes from a vector (a value of the mapping).  The
        # previous len(next(iter(word2vec.items()))) measured a (key, value)
        # pair and was therefore always 2.
        first_vector = next(iter(word2vec.values()), None)
        if first_vector is None:
            raise ValueError('word2vec must contain at least one word vector')
        self.dim = len(first_vector)

    def fit(self, X, y=None):
        """Learn one idf weight per token from the documents in ``X``."""
        # The identity analyzer makes TfidfVectorizer use each element of X
        # as an already-tokenized sequence.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = cl.defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return an array with one row per element of ``X``.

        Each known token contributes its vector multiplied by its idf weight;
        the row is the plain mean of those products.  An element with no
        known token becomes a zero vector of length ``self.dim``.
        """
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])
    
# Word2Vec hyper-parameters for the embedding consumed by the W2V pipelines.
# NOTE(review): the trailing numbers in the notes below (300 / 300 / 18) look
# like previously tried values — confirm.
num_features = 400      # Word vector dimensionality     300
min_word_count = 140  # Minimum word count            300
num_workers = 4        # Number of threads to run in parallel
context = 10            # Context window size                       18
downsampling = 1e-3    # Downsample setting for frequent words

# Disabled tokenization experiments:
# train_df['tokenized_sents'] = train_df.apply(lambda row: nltk.word_tokenize(row['Content']), axis=1)
# print('Dataframe Content Tokenized')
# train_df['tokenized_content'] = train_df.apply(lambda row: re.sub(r'[^\w\s]','', row['Content']).lower(), axis=1)

# NOTE(review): gensim documents the first argument of Word2Vec as an iterable
# of token lists.  train_df['Content'] is the column that other cells pass to
# CountVectorizer as raw text, so if it holds plain strings each document would
# be iterated character by character here — confirm, and tokenize first if so.
model = Word2Vec(train_df['Content'] , 
                 workers=num_workers,size=num_features, min_count = min_word_count, 
                window = context, sample = downsampling)

# Plain dict from vocabulary entry to its vector.  The same name `w2v` is also
# assigned in the "W2V TESTING" cell; whichever cell ran last wins.
w2v = dict(zip(model.wv.index2word, model.wv.syn0))



__Linear SVM__ using __W2V__

In [12]:
# idf-weighted mean word vectors -> linear-kernel SVC.
w2v_svm = Pipeline(steps=[
    # ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
    ('svm', svm.SVC(kernel='linear', C=1.0, decision_function_shape='ovr')),
])
    # ("extra trees", ExtraTreesClassifier(n_estimators=200))])

# train_df['tokenized_content'] = train_df.apply(lambda row: re.sub(r'[^\w\s]','', row['Content'].lower()), axis=1)
# print(simple_split(w2v_svm, 'SVM with W2V'))
w2v_svm_res = ten_fold_cross_validation_metrics(w2v_svm, 'SVM with W2V and TF-idf')
# Stored as [accuracy, precision, recall, f1]; the helper returns recall first.
EvaluationMetric_10fold.append([w2v_svm_res[0][pos] for pos in (0, 2, 1, 3)])

print(
    "SVM with W2V and TF-idf:",
    "\n Accuracy: ", w2v_svm_res[0][0],
    "\n Recall: ", w2v_svm_res[0][1],
    "\n Precision: ", w2v_svm_res[0][2],
    "\n F1-score: ", w2v_svm_res[0][3],
)

10fcv - Iteration 1
10fcv - Iteration 2
10fcv - Iteration 3
10fcv - Iteration 4
10fcv - Iteration 5
10fcv - Iteration 6
10fcv - Iteration 7
10fcv - Iteration 8
10fcv - Iteration 9
10fcv - Iteration 10
SVM with W2V and TF-idf: 
 Accuracy:  0.7149016969979449 
 Recall:  0.6748310471970421 
 Precision:  0.7318328257148756 
 F1-score:  0.6763408779272104


__Random Forest__ using __W2V__

In [13]:
# Plain (unweighted) mean word vectors -> random forest.  The labels below no
# longer say "and TF-idf": this pipeline uses MeanEmbeddingVectorizer, not the
# idf-weighted vectorizer.
w2v_rf = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
    ('random_forest', RandomForestClassifier(n_estimators=240, max_depth=30, random_state=0)),])

#print(simple_split(w2v_rf, 'Random Forest with W2V'))
w2v_rf_res = ten_fold_cross_validation_metrics(w2v_rf,'Random Forest with W2V')
# Stored as [accuracy, precision, recall, f1]; the helper returns recall first.
EvaluationMetric_10fold.append([w2v_rf_res[0][0], w2v_rf_res[0][2], w2v_rf_res[0][1], w2v_rf_res[0][3]])

print("Random Forest with W2V:", "\n Accuracy: ", w2v_rf_res[0][0], "\n Recall: ", w2v_rf_res[0][1], "\n Precision: ", w2v_rf_res[0][2], "\n F1-score: ", w2v_rf_res[0][3])

10fcv - Iteration 1
10fcv - Iteration 2
10fcv - Iteration 3
10fcv - Iteration 4
10fcv - Iteration 5
10fcv - Iteration 6
10fcv - Iteration 7
10fcv - Iteration 8
10fcv - Iteration 9
10fcv - Iteration 10
Random Forest with W2V and TF-idf: 
 Accuracy:  0.6989226923158407 
 Recall:  0.6639647747872309 
 Precision:  0.7086530108451397 
 F1-score:  0.6685059562774113


__Beat the Benchmark__

In [14]:
"""
Linear SVM (scikit-learn SVC with a linear kernel) with Bag of Words, English stop-word removal and Tf-Idf transformation on features.
"""
# train_df['tokenized_content'] = train_df.apply(lambda row: row['Content'].lower(), axis=1)


# Stop-word-filtered counts -> tf-idf -> linear-kernel SVC with C=0.92.
text_clf_svm = Pipeline(steps=[
    ('vect', CountVectorizer(analyzer='word', stop_words='english', lowercase=True)),
    ('tfidf', TfidfTransformer()),
    ('svm', svm.SVC(kernel='linear', C=0.92, decision_function_shape='ovr')),
])


text_clf_svm_res = ten_fold_cross_validation_metrics(text_clf_svm, 'Beat the BenchMark')
# Stored as [accuracy, precision, recall, f1]; the helper returns recall first.
EvaluationMetric_10fold.append([text_clf_svm_res[0][pos] for pos in (0, 2, 1, 3)])

print(
    "Beat the BenchMark:",
    "\n Accuracy: ", text_clf_svm_res[0][0],
    "\n Recall: ", text_clf_svm_res[0][1],
    "\n Precision: ", text_clf_svm_res[0][2],
    "\n F1-score: ", text_clf_svm_res[0][3],
)

# res = ten_fold_cross_validation(text_clf_svm, 'Beat The BenchMark')
# simple_split(text_clf_svm, 'Beat The BenchMark')
# print(res) 0.9704084128148691

10fcv - Iteration 1
10fcv - Iteration 2
10fcv - Iteration 3
10fcv - Iteration 4
10fcv - Iteration 5
10fcv - Iteration 6
10fcv - Iteration 7
10fcv - Iteration 8
10fcv - Iteration 9
10fcv - Iteration 10
Beat the BenchMark: 
 Accuracy:  0.9698380330925191 
 Recall:  0.9683197665099137 
 Precision:  0.9678446084648099 
 F1-score:  0.9679768824057122


In [16]:
"""APPEND RESULTS OF THE REQUESTED MODELS TO CSV"""


# Column order must match the order in which the model cells appended their
# rows to EvaluationMetric_10fold.
csv_model_columns = ['SVM (BOW)', 'Random Forest (BOW)', 'SVM (SVD)', 'Random Forest (SVD)',
                     'SVM (W2V)', 'Random Forest (W2V)', 'My method']
# Each stored row is [accuracy, precision, recall, f1]; one CSV line per measure.
csv_measure_names = ['Accuracy', 'Precision', 'Recall', 'F-Measure']

# mode='w' truncates an existing file, so no separate os.remove() is needed.
# newline='' lets the csv module control line endings (otherwise blank rows
# can appear on platforms that translate '\n').
with open(EvaluationMetric_10fold_csv, mode='w', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=['Statistic Measure'] + csv_model_columns, delimiter='\t')
    writer.writeheader()
    for measure_pos, measure_name in enumerate(csv_measure_names):
        row = {'Statistic Measure': measure_name}
        for model_pos, column in enumerate(csv_model_columns):
            row[column] = EvaluationMetric_10fold[model_pos][measure_pos]
        writer.writerow(row)


__Other experimental ways__

__Naive Bayes__ with __BoW__

In [6]:
from sklearn.naive_bayes import MultinomialNB

# Stop-word-filtered counts -> tf-idf -> multinomial NB with uniform class priors.
nb = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('naive_bayes', MultinomialNB(fit_prior=False)),
])

nb_split_report = simple_split(nb, 'Naive Bayes with BoW')
print(nb_split_report)

<split> Accuracy Score for Naive Bayes with BoW is 0.9430178527757398


In [21]:
from sklearn.neighbors import KNeighborsClassifier

"""
K Nearest Neighbors

number of neighbors: 13
"""
# Stop-word-filtered counts -> tf-idf -> 13-nearest-neighbours classifier.
knn = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('knn', KNeighborsClassifier(n_neighbors=13)),
])

knn_split_report = simple_split(knn, 'KNN with BoW')
print(knn_split_report)

<split> Accuracy Score for KNN with BoW is 0.9505991685008559


In [25]:
from sklearn.neural_network import MLPClassifier

"""
MLP Classifier

with the Adam stochastic gradient-based optimizer (solver='adam')
RELU Activation function

alpha: L2 penalty

"""

lr_ = 0.0003  # initial learning rate
l2_ = 0.0001  # L2 penalty (alpha)

# Stop-word-filtered counts -> tf-idf -> three hidden layers of 50 ReLU units.
mlp = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('mlp', MLPClassifier(
        hidden_layer_sizes=(50, 50, 50),
        activation='relu',
        solver='adam',
        alpha=l2_,
        batch_size='auto',
        learning_rate='constant',
        learning_rate_init=lr_,
        max_iter=150,
    )),
])

mlp_split_report = simple_split(mlp, 'Neural Network - MLP with BoW')
print(mlp_split_report)

<split> Accuracy Score for Neural Network - MLP with BoW is 0.9662509170946442


__Quadratic Discriminant Analysis using SVD__

In [26]:
"""
Needs SVD because Matrix is too sparse
"""

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Stop-word-filtered counts -> tf-idf -> truncated SVD -> QDA.
qda = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('svd', TruncatedSVD(n_components=component_num)),
    ('qda', QuadraticDiscriminantAnalysis()),
])

qda_split_report = simple_split(qda, 'Quadratic Discriminant Analysis with BoW')
print(qda_split_report)

<split> Accuracy Score for Quadratic Discriminant Analysis with BoW is 0.9557348985081927


__Gaussian Naive Bayes__ using __SVD__ on Tf-Idf features

In [28]:
from sklearn.naive_bayes import GaussianNB

# Stop-word-filtered counts -> tf-idf -> truncated SVD -> Gaussian NB.
# Rebinds the name `nb` that an earlier cell used for the multinomial pipeline.
nb = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('svd', TruncatedSVD(n_components=component_num)),
    ('g_naive_bayes', GaussianNB()),
])

gaussian_nb_split_report = simple_split(nb, 'Gaussian Naive Bayes')
print(gaussian_nb_split_report)

<split> Accuracy Score for Gaussian Naive Bayes is 0.8625580826607973


In [24]:
from sklearn.ensemble import AdaBoostClassifier

# Stop-word-filtered counts -> tf-idf -> AdaBoost with 240 estimators.
abc = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('abc', AdaBoostClassifier(n_estimators=240)),
])

abc_split_report = simple_split(abc, 'AdaBoostClassifier with BoW')
print(abc_split_report)

<split> Accuracy Score for AdaBoostClassifier with BoW is 0.8725849841036928


In [8]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english", ignore_stopwords=True)
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Wrap the parent analyzer so each emitted token goes through ``stemmer``."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer
    
    
# Stemmed, stop-word-filtered counts -> tf-idf -> multinomial NB with uniform
# class priors.
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect),
                     ('tfidf', TfidfTransformer()),
                    ('mnb', MultinomialNB(fit_prior=False)),])

# The label names the stemming step so this result can be told apart from the
# unstemmed 'Naive Bayes with BoW' run, which previously printed the same text.
print(simple_split(text_mnb_stemmed, 'Stemmed Naive Bayes with BoW'))

<split> Accuracy Score for Naive Bayes with BoW is 0.9684519442406456


__SVM with Stochastic Gradient Descent__

In [21]:
# Stop-word-filtered counts -> tf-idf -> linear SVM trained with SGD (hinge loss, L2 penalty).
_clf_svm = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-4,
        max_iter=120,
        tol=1e-3,
        random_state=42,
        n_iter_no_change=40,
    )),
])

# print(simple_split(_clf_svm, 'Test'))
# The helper returns [[accuracy, recall, precision, f1]].
aaa = ten_fold_cross_validation_metrics(_clf_svm, 'SVM with Gradient Decent')
print(
    "SVM with Gradient Decent:",
    "\n Accuracy: ", aaa[0][0],
    "\n Recall: ", aaa[0][1],
    "\n Precision: ", aaa[0][2],
    "\n F1-score: ", aaa[0][3],
)


10fcv - Iteration 1
10fcv - Iteration 2
10fcv - Iteration 3
10fcv - Iteration 4
10fcv - Iteration 5
10fcv - Iteration 6
10fcv - Iteration 7
10fcv - Iteration 8
10fcv - Iteration 9
10fcv - Iteration 10
Beat the BenchMark: 
 Accuracy:  0.9700831285354126 
 Recall:  0.9681692044135328 
 Precision:  0.967944849932891 
 F1-score:  0.9679714069688066
