<div>
<img src=https://www.institutedata.com/wp-content/uploads/2019/10/iod_h_tp_primary_c.svg width="300">
</div>

# Lab 9.7: Text Classification
INSTRUCTIONS:
- Run the cells
- Observe and understand the results
- Answer the questions

## Import libraries

In [1]:
## Import Libraries
import numpy as np
import pandas as pd

import string
import spacy

from collections import Counter

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# import warnings
# warnings.filterwarnings('ignore')

## Load data

Sample:

    __label__2 Stuning even for the non-gamer: This sound ...
    __label__2 The best soundtrack ever to anything.: I'm ...
    __label__2 Amazing!: This soundtrack is my favorite m ...
    __label__2 Excellent Soundtrack: I truly like this so ...
    __label__2 Remember, Pull Your Jaw Off The Floor Afte ...
    __label__2 an absolute masterpiece: I am quite sure a ...
    __label__1 Buyer beware: This is a self-published boo ...
    . . .
    
There are only two **labels**:
- `__label__1`
- `__label__2`

In [2]:
## Loading the data

trainDF = pd.read_fwf(
    filepath_or_buffer = 'dat/corpus.txt',
    colspecs = [(9, 10),   # label: get only the numbers 1 or 2
                (11, 9000) # text: makes the it big enought to get to the end of the line
               ], 
    header = 0,
    names = ['label', 'text'],
    lineterminator = '\n'
)

# convert label from [1, 2] to [0, 1]
trainDF['label'] = trainDF['label'] - 1

## Inspect the data

In [3]:
# ANSWER
trainDF

Unnamed: 0,label,text
0,1,The best soundtrack ever to anything.: I'm rea...
1,1,Amazing!: This soundtrack is my favorite music...
2,1,Excellent Soundtrack: I truly like this soundt...
3,1,"Remember, Pull Your Jaw Off The Floor After He..."
4,1,an absolute masterpiece: I am quite sure any o...
...,...,...
9994,1,A revelation of life in small town America in ...
9995,1,Great biography of a very interesting journali...
9996,0,Interesting Subject; Poor Presentation: You'd ...
9997,0,Don't buy: The box looked used and it is obvio...


In [4]:
trainDF.isnull().sum()

label    0
text     0
dtype: int64

In [5]:
trainDF.duplicated().sum()

0

In [6]:
X = trainDF.text
X

0       The best soundtrack ever to anything.: I'm rea...
1       Amazing!: This soundtrack is my favorite music...
2       Excellent Soundtrack: I truly like this soundt...
3       Remember, Pull Your Jaw Off The Floor After He...
4       an absolute masterpiece: I am quite sure any o...
                              ...                        
9994    A revelation of life in small town America in ...
9995    Great biography of a very interesting journali...
9996    Interesting Subject; Poor Presentation: You'd ...
9997    Don't buy: The box looked used and it is obvio...
9998    Beautiful Pen and Fast Delivery.: The pen was ...
Name: text, Length: 9999, dtype: object

In [7]:
y = trainDF.label
y

0       1
1       1
2       1
3       1
4       1
       ..
9994    1
9995    1
9996    0
9997    0
9998    1
Name: label, Length: 9999, dtype: int64

## Split the data into train and test

In [8]:
## ANSWER
## split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Feature Engineering

### Count Vectors as features

In [9]:
# create a count vectorizer object
count_vect = CountVectorizer(token_pattern = r'\w{1,}')

# Learn a vocabulary dictionary of all tokens in the raw documents
count_vect.fit(trainDF['text'])

# Transform documents to document-term matrix.
X_train_count = count_vect.transform(X_train)
X_test_count = count_vect.transform(X_test)

X_train_count_df = pd.DataFrame(X_train_count.toarray().transpose(),
                   index=count_vect.get_feature_names())
X_test_count_df = pd.DataFrame(X_test_count.toarray().transpose(),
                   index=count_vect.get_feature_names())

In [10]:
X_test_count_df.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
discomfiting,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
scales,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
roseamry,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dumfries,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TF-IDF Vectors as features
- Word level
- N-Gram level
- Character level

In [11]:
%%time
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer = 'word',
                             token_pattern = r'\w{1,}',
                             max_features = 5000)
print(tfidf_vect)

tfidf_vect.fit(trainDF['text'])

# Transform documents to document-term matrix.
X_train_tfidf = tfidf_vect.transform(X_train)
X_test_tfidf  = tfidf_vect.transform(X_test)

X_train_tfidf_df = pd.DataFrame(X_train_tfidf.toarray().transpose(),
                   index=tfidf_vect.get_feature_names())
X_test_tfidf_df = pd.DataFrame(X_test_tfidf.toarray().transpose(),
                   index=tfidf_vect.get_feature_names())

TfidfVectorizer(max_features=5000, token_pattern='\\w{1,}')
Wall time: 1.16 s


In [12]:
X_train_tfidf_df.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7989,7990,7991,7992,7993,7994,7995,7996,7997,7998
real,0.0,0.096286,0.0,0.0,0.0,0.0,0.0,0.0,0.113249,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
listened,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
jennifer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gba,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
structure,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
%%time
# ngram level tf-idf
tfidf_vect_ngram = TfidfVectorizer(analyzer = 'word',
                                   token_pattern = r'\w{1,}',
                                   ngram_range = (2, 3),
                                   max_features = 5000)
print(tfidf_vect_ngram)

tfidf_vect_ngram.fit(trainDF['text'])

# Transform documents to document-term matrix.
X_train_tfidf_ngram = tfidf_vect_ngram.transform(X_train)
X_test_tfidf_ngram  = tfidf_vect_ngram.transform(X_test)

X_train_tfidf_ngram_df = pd.DataFrame(X_train_tfidf_ngram.toarray().transpose(),
                       index=tfidf_vect_ngram.get_feature_names())
X_test_tfidf_ngram_df = pd.DataFrame(X_test_tfidf_ngram.toarray().transpose(),
                       index=tfidf_vect_ngram.get_feature_names())

TfidfVectorizer(max_features=5000, ngram_range=(2, 3), token_pattern='\\w{1,}')
Wall time: 4.95 s


In [14]:
X_train_tfidf_ngram_df.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7989,7990,7991,7992,7993,7994,7995,7996,7997,7998
just don,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the late,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s really,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
is horrible,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
on how,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
%%time
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer = 'char',
                                         ngram_range = (2, 3),
                                         max_features = 5000)
print(tfidf_vect_ngram_chars)

tfidf_vect_ngram_chars.fit(trainDF['text'])

# Transform documents to document-term matrix.
X_train_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(X_train)
X_test_tfidf_ngram_chars  = tfidf_vect_ngram_chars.transform(X_test)

X_train_tfidf_ngram_chars_df = pd.DataFrame(X_train_tfidf_ngram_chars.toarray().transpose(),
                           index=tfidf_vect_ngram_chars.get_feature_names())
X_test_tfidf_ngram_chars_df = pd.DataFrame(X_test_tfidf_ngram_chars.toarray().transpose(),
                           index=tfidf_vect_ngram_chars.get_feature_names())

TfidfVectorizer(analyzer='char', max_features=5000, ngram_range=(2, 3))
Wall time: 7.94 s


In [16]:
X_train_tfidf_ngram_chars_df.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7989,7990,7991,7992,7993,7994,7995,7996,7997,7998
dva,0.037858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
av,0.02211,0.017399,0.010946,0.0,0.0,0.019159,0.0,0.016137,0.019316,0.0,...,0.0,0.031975,0.082098,0.011635,0.0,0.021926,0.0,0.021819,0.0,0.012795
"h,",0.025108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024777,0.0,0.0
bas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Text / NLP based features

Create some other features.

Char_Count = Number of Characters in Text

Word Count = Number of Words in Text

Word Density = Average Number of Char in Words

Punctuation Count = Number of Punctuation in Text

Title Word Count = Number of Words in Title

Uppercase Word Count = Number of Upperwords in Text

In [17]:
import regex as re

In [18]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [19]:
string.ascii_uppercase

'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

In [20]:
trainDF['text'][0]

"The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny."

In [21]:
pattern = '\w+'

output = re.findall(pattern, trainDF['text'][0])
print(output)
print(len(output))
char_count_per_word = [len(t) for t in output]
word_density = np.average(char_count_per_word)
print(word_density)
no_punctuation = sum([trainDF['text'][0].count(p) for p in string.punctuation])
print(no_punctuation)
title = trainDF['text'][0].split(':')[0]
print(title)
upper_word_count = sum([trainDF['text'][0].count(u) for u in string.ascii_uppercase])
print(upper_word_count)

['The', 'best', 'soundtrack', 'ever', 'to', 'anything', 'I', 'm', 'reading', 'a', 'lot', 'of', 'reviews', 'saying', 'that', 'this', 'is', 'the', 'best', 'game', 'soundtrack', 'and', 'I', 'figured', 'that', 'I', 'd', 'write', 'a', 'review', 'to', 'disagree', 'a', 'bit', 'This', 'in', 'my', 'opinino', 'is', 'Yasunori', 'Mitsuda', 's', 'ultimate', 'masterpiece', 'The', 'music', 'is', 'timeless', 'and', 'I', 'm', 'been', 'listening', 'to', 'it', 'for', 'years', 'now', 'and', 'its', 'beauty', 'simply', 'refuses', 'to', 'fade', 'The', 'price', 'tag', 'on', 'this', 'is', 'pretty', 'staggering', 'I', 'must', 'say', 'but', 'if', 'you', 'are', 'going', 'to', 'buy', 'any', 'cd', 'for', 'this', 'much', 'money', 'this', 'is', 'the', 'only', 'one', 'that', 'I', 'feel', 'would', 'be', 'worth', 'every', 'penny']
102
3.911764705882353
14
The best soundtrack ever to anything.
12


In [22]:
def char_count(text):
    return len(text)
    
def word_count(text):
    pattern = '\w+'
    output = re.findall(pattern, text)
    return len(output)

def word_density(text):
    pattern = '\w+'
    output = re.findall(pattern, text)
    char_count_per_word = [len(t) for t in output]
    word_density = np.average(char_count_per_word)
    return word_density

def punctuation_count(text):
    no_punctuation = sum([text.count(p) for p in string.punctuation])
    return no_punctuation

def title_word_count(text):
    title = text.split(':')[0]
    return word_count(title)

def upper_word_count(text):
    upper_word_count = sum([text.count(u) for u in string.ascii_uppercase])
    return upper_word_count

In [23]:
%%time
# ANSWER
trainDF['char_count'] = trainDF['text'].apply(char_count)
trainDF['word_count'] = trainDF['text'].apply(word_count)
trainDF['word_density'] = trainDF['text'].apply(word_density)
trainDF['punctuation_count'] = trainDF['text'].apply(punctuation_count)
trainDF['title_word_count'] = trainDF['text'].apply(title_word_count)
trainDF['uppercase_word_count'] = trainDF['text'].apply(upper_word_count)

Wall time: 1.07 s


In [24]:
trainDF.head(5)

Unnamed: 0,label,text,char_count,word_count,word_density,punctuation_count,title_word_count,uppercase_word_count
0,1,The best soundtrack ever to anything.: I'm rea...,509,102,3.911765,14,6,12
1,1,Amazing!: This soundtrack is my favorite music...,760,136,4.352941,40,1,27
2,1,Excellent Soundtrack: I truly like this soundt...,743,122,4.860656,33,2,59
3,1,"Remember, Pull Your Jaw Off The Floor After He...",481,90,4.144444,22,10,31
4,1,an absolute masterpiece: I am quite sure any o...,825,149,4.362416,35,3,21


In [25]:
## load spaCy
nlp = spacy.load('en_core_web_lg')
# nlp = spacy.load("en_core_web_sm")

Part of Speech in **SpaCy**

    POS   DESCRIPTION               EXAMPLES
    ----- ------------------------- ---------------------------------------------
    ADJ   adjective                 big, old, green, incomprehensible, first
    ADP   adposition                in, to, during
    ADV   adverb                    very, tomorrow, down, where, there
    AUX   auxiliary                 is, has (done), will (do), should (do)
    CONJ  conjunction               and, or, but
    CCONJ coordinating conjunction  and, or, but
    DET   determiner                a, an, the
    INTJ  interjection              psst, ouch, bravo, hello
    NOUN  noun                      girl, cat, tree, air, beauty
    NUM   numeral                   1, 2017, one, seventy-seven, IV, MMXIV
    PART  particle                  's, not,
    PRON  pronoun                   I, you, he, she, myself, themselves, somebody
    PROPN proper noun               Mary, John, London, NATO, HBO
    PUNCT punctuation               ., (, ), ?
    SCONJ subordinating conjunction if, while, that
    SYM   symbol                    $, %, §, ©, +, −, ×, ÷, =, :), 😝
    VERB  verb                      run, runs, running, eat, ate, eating
    X     other                     sfpksdpsxmsa
    SPACE space
    
Find out number of Adjective, Adverb, Noun, Numeric, Pronoun, Proposition, Verb.

    Hint:
    1. Convert text to spacy document
    2. Use pos_
    3. Use Counter 

Most of the tags and labels look pretty abstract, and they vary between languages. spacy.explain will show you a short description – for example, spacy.explain("VBZ") returns “verb, 3rd person singular present”.

In [26]:
# Initialise some columns for feature's counts
trainDF['adj_count'] = 0
trainDF['adv_count'] = 0
trainDF['noun_count'] = 0
trainDF['num_count'] = 0
trainDF['pron_count'] = 0
trainDF['propn_count'] = 0
trainDF['verb_count'] = 0

In [27]:
# Rule-base matcher

# import spacy Matcher
from spacy.matcher import Matcher

# create a matcher
matcher = Matcher(nlp.vocab)

In [28]:
# ANSWER
def adj_count(text: str):
    pattern = [{'POS': 'ADJ'}]
    matcher.add('Words', [pattern])
    doc = nlp(text)
    words = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        words.append(span.text)
    return len(words)

def adv_count(text: str):
    pattern = [{'POS': 'ADV'}]
    matcher.add('Words', [pattern])
    doc = nlp(text)
    words = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        words.append(span.text)
    return len(words)

def noun_count(text: str):
    pattern = [{'POS': 'NOUN'}]
    matcher.add('Words', [pattern])
    doc = nlp(text)
    words = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        words.append(span.text)
    return len(words)

def num_count(text: str):
    pattern = [{'POS': 'NUM'}]
    matcher.add('Words', [pattern])
    doc = nlp(text)
    words = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        words.append(span.text)
    return len(words)

def pron_count(text: str):
    pattern = [{'POS': 'PRON'}]
    matcher.add('Words', [pattern])
    doc = nlp(text)
    words = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        words.append(span.text)
    return len(words)

def propn_count(text: str):
    pattern = [{'POS': 'PROPN'}]
    matcher.add('Words', [pattern])
    doc = nlp(text)
    words = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        words.append(span.text)
    return len(words)

def verb_count(text: str):
    pattern = [{'POS': 'VERB'}]
    matcher.add('Words', [pattern])
    doc = nlp(text)
    words = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        words.append(span.text)
    return len(words)

In [29]:
verb_count(trainDF['text'][0])

14

In [None]:
%%time
trainDF['adj_count'] = trainDF['text'].apply(adj_count)
trainDF['adv_count'] = trainDF['text'].apply(adv_count)
trainDF['noun_count'] = trainDF['text'].apply(noun_count)
trainDF['num_count'] = trainDF['text'].apply(num_count)
trainDF['pron_count'] = trainDF['text'].apply(pron_count)
trainDF['propn_count'] = trainDF['text'].apply(propn_count)
trainDF['verb_count'] = trainDF['text'].apply(verb_count)

In [None]:
cols = [
    'char_count', 'word_count', 'word_density',
    'punctuation_count', 'title_word_count',
    'uppercase_word_count', 'adj_count',
    'adv_count', 'noun_count', 'num_count',
    'pron_count', 'propn_count', 'verb_count']

trainDF[cols].sample(5)

### Topic Models as features

In [None]:
%%time
# train a LDA Model
lda_model = LatentDirichletAllocation(n_components = 20, learning_method = 'online', max_iter = 20)

X_topics = lda_model.fit_transform(X_train_count)
topic_word = lda_model.components_ 
vocab = count_vect.get_feature_names()

In [None]:
# view the topic models
n_top_words = 10
topic_summaries = []
print('Group Top Words')
print('-----', '-'*80)
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    top_words = ' '.join(topic_words)
    topic_summaries.append(top_words)
    print('  %3d %s' % (i, top_words))

## Modelling

In [None]:
## helper function

def train_model(classifier, feature_vector_train, label, feature_vector_valid):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    return accuracy_score(predictions, y_test)

In [None]:
# Keep the results in a dataframe
results = pd.DataFrame(columns = ['Count Vectors',
                                  'WordLevel TF-IDF',
                                  'N-Gram Vectors',
                                  'CharLevel Vectors'])

### Naive Bayes Classifier

In [None]:
%%time
# Naive Bayes on Count Vectors
accuracy1 = train_model(MultinomialNB(), X_train_count, y_train, X_test_count)
print('NB, Count Vectors    : %.4f\n' % accuracy1)

In [None]:
%%time
# Naive Bayes on Word Level TF IDF Vectors
accuracy2 = train_model(MultinomialNB(), X_train_tfidf, y_train, X_test_tfidf)
print('NB, WordLevel TF-IDF : %.4f\n' % accuracy2)

In [None]:
%%time
# Naive Bayes on Ngram Level TF IDF Vectors
accuracy3 = train_model(MultinomialNB(), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print('NB, N-Gram Vectors   : %.4f\n' % accuracy3)

In [None]:
%%time
# # Naive Bayes on Character Level TF IDF Vectors
accuracy4 = train_model(MultinomialNB(), X_train_tfidf_ngram_chars, y_train, X_test_tfidf_ngram_chars)
print('NB, CharLevel Vectors: %.4f\n' % accuracy4)

In [None]:
results.loc['Naïve Bayes'] = {
    'Count Vectors': accuracy1,
    'WordLevel TF-IDF': accuracy2,
    'N-Gram Vectors': accuracy3,
    'CharLevel Vectors': accuracy4}

### Linear Classifier

In [None]:
%%time
# Linear Classifier on Count Vectors
accuracy1 = train_model(LogisticRegression(solver = 'lbfgs', max_iter = 350), X_train_count, y_train, X_test_count)
print('LR, Count Vectors    : %.4f\n' % accuracy1)

In [None]:
%%time
# Linear Classifier on Word Level TF IDF Vectors
accuracy2 = train_model(LogisticRegression(solver = 'lbfgs', max_iter = 100), X_train_tfidf, y_train, X_test_tfidf)
print('LR, WordLevel TF-IDF : %.4f\n' % accuracy2)

In [None]:
%%time
# Linear Classifier on Ngram Level TF IDF Vectors
accuracy3 = train_model(LogisticRegression(solver = 'lbfgs', max_iter = 100), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print('LR, N-Gram Vectors   : %.4f\n' % accuracy3)

In [None]:
%%time
# Linear Classifier on Character Level TF IDF Vectors
accuracy4 = train_model(LogisticRegression(solver = 'lbfgs', max_iter = 100), X_train_tfidf_ngram_chars, y_train, X_test_tfidf_ngram_chars)
print('LR, CharLevel Vectors: %.4f\n' % accuracy4)

In [None]:
results.loc['Logistic Regression'] = {
    'Count Vectors': accuracy1,
    'WordLevel TF-IDF': accuracy2,
    'N-Gram Vectors': accuracy3,
    'CharLevel Vectors': accuracy4}

### Support Vector Machine

In [None]:
%%time
# Support Vector Machine on Count Vectors
accuracy1 = train_model(LinearSVC(), X_train_count, y_train, X_test_count)
print('SVM, Count Vectors    : %.4f\n' % accuracy1)

In [None]:
%%time
# Support Vector Machine on Word Level TF IDF Vectors
accuracy2 = train_model(LinearSVC(), X_train_tfidf, y_train, X_test_tfidf)
print('SVM, WordLevel TF-IDF : %.4f\n' % accuracy2)

In [None]:
%%time
# Support Vector Machine on Ngram Level TF IDF Vectors
accuracy3 = train_model(LinearSVC(), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print('SVM, N-Gram Vectors   : %.4f\n' % accuracy3)

In [None]:
%%time
# Support Vector Machine on Character Level TF IDF Vectors
accuracy4 = train_model(LinearSVC(), X_train_tfidf_ngram_chars, y_train, X_test_tfidf_ngram_chars)
print('SVM, CharLevel Vectors: %.4f\n' % accuracy4)

In [None]:
results.loc['Support Vector Machine'] = {
    'Count Vectors': accuracy1,
    'WordLevel TF-IDF': accuracy2,
    'N-Gram Vectors': accuracy3,
    'CharLevel Vectors': accuracy4}

### Bagging Models

In [None]:
%%time
# Bagging (Random Forest) on Count Vectors
accuracy1 = train_model(RandomForestClassifier(n_estimators = 100), X_train_count, y_train, X_test_count)
print('RF, Count Vectors    : %.4f\n' % accuracy1)

In [None]:
%%time
# Bagging (Random Forest) on Word Level TF IDF Vectors
accuracy2 = train_model(RandomForestClassifier(n_estimators = 100), X_train_tfidf, y_train, X_test_tfidf)
print('RF, WordLevel TF-IDF : %.4f\n' % accuracy2)

In [None]:
%%time
# Bagging (Random Forest) on Ngram Level TF IDF Vectors
accuracy3 = train_model(RandomForestClassifier(n_estimators = 100), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print('RF, N-Gram Vectors   : %.4f\n' % accuracy3)

In [None]:
%%time
# Bagging (Random Forest) on Character Level TF IDF Vectors
accuracy4 = train_model(RandomForestClassifier(n_estimators = 100), X_train_tfidf_ngram_chars, y_train, X_test_tfidf_ngram_chars)
print('RF, CharLevel Vectors: %.4f\n' % accuracy4)

In [None]:
results.loc['Random Forest'] = {
    'Count Vectors': accuracy1,
    'WordLevel TF-IDF': accuracy2,
    'N-Gram Vectors': accuracy3,
    'CharLevel Vectors': accuracy4}

### Boosting Models

In [None]:
%%time
# Gradient Boosting on Count Vectors
accuracy1 = train_model(GradientBoostingClassifier(), X_train_count, y_train, X_test_count)
print('GB, Count Vectors    : %.4f\n' % accuracy1)

In [None]:
%%time
# Gradient Boosting on Word Level TF IDF Vectors
accuracy2 = train_model(GradientBoostingClassifier(), X_train_tfidf, y_train, X_test_tfidf)
print('GB, WordLevel TF-IDF : %.4f\n' % accuracy2)

In [None]:
%%time
# Gradient Boosting on Ngram Level TF IDF Vectors
accuracy3 = train_model(GradientBoostingClassifier(), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print('GB, N-Gram Vectors   : %.4f\n' % accuracy3)

In [None]:
%%time
# Gradient Boosting on Character Level TF IDF Vectors
accuracy4 = train_model(GradientBoostingClassifier(), X_train_tfidf_ngram_chars, y_train, X_test_tfidf_ngram_chars)
print('GB, CharLevel Vectors: %.4f\n' % accuracy4)

In [None]:
results.loc['Gradient Boosting'] = {
    'Count Vectors': accuracy1,
    'WordLevel TF-IDF': accuracy2,
    'N-Gram Vectors': accuracy3,
    'CharLevel Vectors': accuracy4}

In [None]:
results

In [None]:
results.style.highlight_max(color = 'lightgreen', axis = 0)



---



---



> > > > > > > > > © 2021 Institute of Data


---



---



