# Supervised Machine Learning - Neutrality Classifier
   
In this notebook, the manual content analysis data is used to train and evaluate a classifier that assesses the neutrality of an article.   
The process includes data preperation, feature selection, and the evaluation and comparison of different types of classifiers.

## Load packages

In [53]:
#import relevant packages
import pandas as pd
from pandas import read_excel
import re
import numpy as np
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import RandomizedSearchCV, ShuffleSplit, GridSearchCV, train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from pprint import pprint
import spacy
#import German language model
import de_core_news_md
#define nlp pipe
nlp = de_core_news_md.load()

## Read data

In [2]:
#read in the manually coded data
mca = read_excel("mca_data.xlsx")

#read in the article data
#first Dataset A
articles = read_excel("sample.xlsx")
#then Dataset B
articles_online = read_excel("sample_online_new.xlsx")
#then Dataset C
articles2 = read_excel("Dataset C.xlsx")
print(len(mca), len(articles), len(articles_online), len(articles2))

578 250 150 98


In [3]:
#add the two print dfs together
articles = articles.append(articles2)
len(articles)

348

In [4]:
articles

Unnamed: 0,ID,Newspaper,Date,Length,Headline,Article,Author
0,2471,Rheinische Post,2020-05-02,291 words,Die Demontage einer Ministerin,Maximilian Plück Armin Laschet brauchte am Don...,"Plück, Maximilian"
1,6803,Der Tagesspiegel,2020-05-19,418 words,Microsoft mahnt Berlin ab Der US-Konzern verla...,Gegenwind für die Berliner Beauftragte für Dat...,
2,779,Die Welt,2020-04-22,595 words,Bolsonaro verliert die Nerven; Brasiliens Staa...,"Jair Bolsonaro sieht müde aus, mehrfach hustet...",Tobias Käufer
3,3110,Stuttgarter Zeitung,2020-04-30,287 words,Anklage im Mordfall Lübcke,Die Bundesanwaltschaft hat Anklage im Fall des...,AFP
4,3266,Stuttgarter Zeitung,2020-04-25,198 words,Maskenpflicht auchin Schulen?; Bundesbildungsm...,Bundesbildungsministerin Anja Karliczek (CDU) ...,dpa
5,738,Die Welt,2020-04-22,931 words,Einsame Spitze; Das Aus der SAP-Chefin Morgan ...,"Doppelspitzen erfüllen nur selten ihren Zweck,...",Klaus Boldt
6,1122,Die Welt,2020-05-04,872 words,"EU-Solidarität, von allen für alle","Statt gemeinsam die Seuche zu bekämpfen, wurde...",Jürgen Rüttgers
7,5298,Süddeutsche Zeitung (inkl. Regionalausgaben),2020-05-14,611 words,Parkplatz-Kneipen erlaubt; Stadt kommt Wirten ...,"Ein Bier, wo bisher das Auto stand? Schnitzel ...",
8,4761,Süddeutsche Zeitung (inkl. Regionalausgaben),2020-05-08,705 words,Bürgerrechte gelten auch im Notstand; Grüne be...,Landkreis - Die Zuwächse der Grünen sind zulet...,IRIS HILBERTH
9,1538,Rheinische Post,2020-04-23,652 words,CDU fordert Rückbau von zwei Umweltspuren; Tes...,Hendrik Gaasterland Düsseldorf Die CDU-Fraktio...,"Gaasterland, Hendrik"


## Pre-processing

In [5]:
#selecting only the relevant columns
mca = mca[["CID", "AID", "NO", "CPA", "BOV", "BOA", "NEU"]]
#drop the first row, because it contains the column names
mca = mca.drop(mca.index[0])
#reset the index
mca = mca.reset_index()
#inspect the data
mca.head()

Unnamed: 0,index,CID,AID,NO,CPA,BOV,BOA,NEU
0,1,3,490,1,3,2,2,2
1,2,3,3414,3,1,1,1,2
2,3,3,6996,4,3,2,2,1
3,4,3,4894,5,3,2,2,1
4,5,3,3110,3,3,2,2,2


In [6]:
#exclude rows with empty values
mca = mca[mca["BOV"].notna()]
mca = mca[mca["AID"].notna()]
mca = mca[mca["CID"].notna()]
mca = mca[mca["NEU"].notna()]
#change the datatypes of the manually coded df
mca["AID"] = mca["AID"].astype(int)
mca["CID"] = mca["CID"].astype(int)
mca["NO"] = mca["NO"].astype(int)
mca["CPA"] = mca["CPA"].astype(int)
mca["BOV"] = mca["BOV"].astype(int)
mca["BOA"] = mca["BOA"].astype(int)
mca["NEU"] = mca["NEU"].astype(int)

In [7]:
articles = articles.rename(columns= {"ID":"AID"})
articles.head(3)

Unnamed: 0,AID,Newspaper,Date,Length,Headline,Article,Author
0,2471,Rheinische Post,2020-05-02,291 words,Die Demontage einer Ministerin,Maximilian Plück Armin Laschet brauchte am Don...,"Plück, Maximilian"
1,6803,Der Tagesspiegel,2020-05-19,418 words,Microsoft mahnt Berlin ab Der US-Konzern verla...,Gegenwind für die Berliner Beauftragte für Dat...,
2,779,Die Welt,2020-04-22,595 words,Bolsonaro verliert die Nerven; Brasiliens Staa...,"Jair Bolsonaro sieht müde aus, mehrfach hustet...",Tobias Käufer


In [8]:
len(articles)

348

## Merging the dataframes

In [9]:
#change the AID column dtype for the article df -> necessary for merging
articles["AID"] = articles["AID"].astype(int)
#merge the mca data with the article data
df = mca.merge(articles, how="left", on="AID")
#check for duplicates
duplicates = df[df.duplicated(["AID"])]
#check the length
len(df)

575

In [10]:
#rename columns
articles_online = articles_online.rename(columns={"ID":"AID"})
#select only the AID (to merge on) and the text
articles_online = articles_online[["AID", "Article", "Newspaper"]]
#merge the dataframe with the online article data
df = df.merge(articles_online, how="left", on="AID")
#inspect the data
df.head(3)

Unnamed: 0,index,CID,AID,NO,CPA,BOV,BOA,NEU,Newspaper_x,Date,Length,Headline,Article_x,Author,Article_y,Newspaper_y
0,1,3,490,1,3,2,2,2,Aachener Zeitung,2020-05-15,43 words,FDP lädt ein zur Wahlversammlung,Simmerath Der FDP Ortsverein Simmerath lädt zu...,,,
1,2,3,3414,3,1,1,1,2,Stuttgarter Zeitung,2020-05-15,576 words,Wer stopft das Steuerloch?,Angesichts der gigantischen Steuer-ausfälle im...,Thorsten Knuf,,
2,3,3,6996,4,3,2,2,1,Der Tagesspiegel,2020-05-10,801 words,Schulbetrieb und Klassenfahrt,"""Kindeswohlgefährdung begünstigt. Experten bes...",,,


In [11]:
#fill empty columns
df["Article_x"]= df["Article_x"].fillna("")
df["Article_y"]= df["Article_y"].fillna("") 
#create a unified text column
df["Article"] = df["Article_x"] + df["Article_y"]

#fill empty columns
df["Newspaper_x"]= df["Newspaper_x"].fillna("")
df["Newspaper_y"]= df["Newspaper_y"].fillna("") 
#create a unified text column
df["Newspaper"] = df["Newspaper_x"] + df["Newspaper_y"]

#inspect df
df.head(3)

Unnamed: 0,index,CID,AID,NO,CPA,BOV,BOA,NEU,Newspaper_x,Date,Length,Headline,Article_x,Author,Article_y,Newspaper_y,Article,Newspaper
0,1,3,490,1,3,2,2,2,Aachener Zeitung,2020-05-15,43 words,FDP lädt ein zur Wahlversammlung,Simmerath Der FDP Ortsverein Simmerath lädt zu...,,,,Simmerath Der FDP Ortsverein Simmerath lädt zu...,Aachener Zeitung
1,2,3,3414,3,1,1,1,2,Stuttgarter Zeitung,2020-05-15,576 words,Wer stopft das Steuerloch?,Angesichts der gigantischen Steuer-ausfälle im...,Thorsten Knuf,,,Angesichts der gigantischen Steuer-ausfälle im...,Stuttgarter Zeitung
2,3,3,6996,4,3,2,2,1,Der Tagesspiegel,2020-05-10,801 words,Schulbetrieb und Klassenfahrt,"""Kindeswohlgefährdung begünstigt. Experten bes...",,,,"""Kindeswohlgefährdung begünstigt. Experten bes...",Der Tagesspiegel


In [12]:
len(df)

575

In [13]:
df["Article"].isnull().sum()

0

## Remove duplicates

In [14]:
#remove duplicates
df.drop_duplicates(subset ="AID", keep = "first", inplace = True) 
len(df)

497

In [15]:
#see if there is missing data
df = df[df["Article"].notna()]
len(df)

497

## Remove codes that don't align with the original data

In [16]:
df = df[df['Newspaper'] != ""]
len(df)

487

## Adapt column values

In [17]:
df["NEU"] = df["NEU"].replace([2, 1], [1, 0])
df["BOV"] = df["BOA"].replace(2, 0)
df["BOA"] = df["BOA"].replace(2, 0)
#inspect data
df.head(3)

Unnamed: 0,index,CID,AID,NO,CPA,BOV,BOA,NEU,Newspaper_x,Date,Length,Headline,Article_x,Author,Article_y,Newspaper_y,Article,Newspaper
0,1,3,490,1,3,0,0,1,Aachener Zeitung,2020-05-15,43 words,FDP lädt ein zur Wahlversammlung,Simmerath Der FDP Ortsverein Simmerath lädt zu...,,,,Simmerath Der FDP Ortsverein Simmerath lädt zu...,Aachener Zeitung
1,2,3,3414,3,1,1,1,1,Stuttgarter Zeitung,2020-05-15,576 words,Wer stopft das Steuerloch?,Angesichts der gigantischen Steuer-ausfälle im...,Thorsten Knuf,,,Angesichts der gigantischen Steuer-ausfälle im...,Stuttgarter Zeitung
2,3,3,6996,4,3,0,0,0,Der Tagesspiegel,2020-05-10,801 words,Schulbetrieb und Klassenfahrt,"""Kindeswohlgefährdung begünstigt. Experten bes...",,,,"""Kindeswohlgefährdung begünstigt. Experten bes...",Der Tagesspiegel


## Text cleaning

In [18]:
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('german')) 
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("german")
from collections import Counter

#function to remove stopwords
def remove_stopwords_and_stem(text):
    #apply SpaCy to tokenise
    doc = nlp(text)
    #create a list of tokens
    tokens = [token.text for token in doc]
    #create an empty list for all words in a text that are not stopwords
    no_stopwords = []
    #loop over the tokens list and append non-stopwords to the no_stopwords list
    for w in tokens: 
        if w not in stop_words: 
            no_stopwords.append(w)
    #stem all words in the list
    stems=""
    for word in no_stopwords:
        stems=stems + stemmer.stem(word) + " "
    #return the final stems as a single string
    return(stems)

In [19]:
#... create a column with the cleaned article text (where stopwords are removed and all words are stemmed)
df["clean text"] = [remove_stopwords_and_stem(text) for text in df["Article"]]

In [20]:
import ftfy
#clean the text with 
def fixtext(string):
    return ftfy.fix_text(string)
df["Article"] = df["Article"].apply(fixtext)

## Splitting the data into test and training set
I reran the following models with both the original and the cleaned text. I indicated which one led to better results for each classifier.

In [21]:
df["NEU"].value_counts()

1    283
0    204
Name: NEU, dtype: int64

In [113]:
#create training and testing dataset 
x_train, x_test, y_train, y_test = train_test_split(df["Article"], df.NEU, test_size=0.2, random_state=1)

# Feature engineering
   
I used three different types of text representations for the classifier training, namely count vectors, TF-IDF vectors, and TF-IDF vectors with n-grams. All classifiers were trained and tested on all three features. 

## TF-IDF Vectors
In the following, Term Frequency-Inverse Document Frequency is applied in order to represent the text data as a vector that can be used as numerical input for a SML algorithm.   
   
I chose the following parameters, which are specified in the code below:
- The text is represented as unigrams and bigrams, meaning not just single words, but also sets of two neighboring words are accounted for -> ngram_range
- Terms that appear in less than 10 documents are ignored -> min_df
- All other terms are included -> max_df
- In total, up to 200 features can be extracted per text -> max_features

In [114]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=200)

#features (=y_train)
features_train_tfidf = tfidf_vect.fit_transform(x_train).toarray()
labels_train_tfidf = y_train
print(features_train_tfidf.shape)

#features (=y_train)
features_test_tfidf = tfidf_vect.fit_transform(x_test).toarray()
labels_test_tfidf = y_test
print(features_test_tfidf.shape)

(389, 200)
(98, 200)


In [115]:
# Parameter selection
ngram_range = (1,2)
min_df = 10
max_df = 1.
max_features = 200

#Defining the TfidfVectorizer with the parameters above
tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None, #already removed
                        lowercase=False, #already done
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)

#features (=x_train)
features_train_tfidf_ngrams = tfidf.fit_transform(x_train).toarray()
#labels (=x_test)
labels_train_tfidf_ngrams = y_train
print(features_train_tfidf_ngrams.shape)

#features (=y_train)
features_test_tfidf_ngrams = tfidf.transform(x_test).toarray()
#labels (=y_test)
labels_test_tfidf_ngrams = y_test
print(features_test_tfidf_ngrams.shape)

(389, 200)
(98, 200)


### Count Vectors
   
transform the training and testing data using count vectorizer object


In [116]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer="word", token_pattern=r"\w{1,}", max_features= 200)

#features (=y_train)
features_train_count = count_vect.fit_transform(x_train).toarray()
labels_train_count = y_train
print(features_train_count.shape)

#features (=y_train)
features_test_count = count_vect.fit_transform(x_test).toarray()
labels_test_count = y_test
print(features_test_count.shape)

(389, 200)
(98, 200)


# SML: Naive Bayes Classifier

*Important:* The naive bayes classifier achieves better results with the cleaned article text than with the original text.

### fit the data

In [54]:
#define the models
nbc_count_gnb = GaussianNB()
nbc_tfidf_gnb = GaussianNB()
nbc_tfidf_ngrams_gnb = GaussianNB()

nbc_count_mnb = MultinomialNB()
nbc_tfidf_mnb = MultinomialNB()
nbc_tfidf_ngrams_mnb = MultinomialNB()

In [55]:
# fit the training dataset on the classifiers
nbc_count_gnb.fit(features_train_count, labels_train_count)
nbc_tfidf_gnb.fit(features_train_tfidf, labels_train_tfidf)
nbc_tfidf_gnb.fit(features_train_tfidf, labels_train_tfidf_ngrams)
nbc_count_mnb.fit(features_train_count, labels_train_count)
nbc_tfidf_mnb.fit(features_train_tfidf, labels_train_tfidf)
nbc_tfidf_mnb.fit(features_train_tfidf, labels_train_tfidf_ngrams)

# predict the labels on validation dataset
predictions_count_gnb = nbc_count_gnb.predict(features_test_count)
predictions_tfidf_gnb = nbc_tfidf_gnb.predict(features_test_tfidf)
predictions_tfidf_ngrams_gnb = nbc_tfidf_gnb.predict(features_test_tfidf_ngrams)
predictions_count_mnb = nbc_count_mnb.predict(features_test_count)
predictions_tfidf_mnb = nbc_tfidf_mnb.predict(features_test_tfidf)
predictions_tfidf_ngrams_mnb = nbc_tfidf_mnb.predict(features_test_tfidf_ngrams)

### assess accuracy

In [56]:
print("classification report - naive bayes classifier - GNB - count") 
print(classification_report(labels_test_count,predictions_count_gnb))
print("")
print("classification report - naive bayes classifier - GNB - tfidf") 
print(classification_report(labels_test_count,predictions_tfidf_gnb))
print("")
print("classification report - naive bayes classifier - GNB - tfidf") 
print(classification_report(labels_test_count,predictions_tfidf_ngrams_gnb))
print("")
print("classification report - naive bayes classifier - MNB _count") 
print(classification_report(labels_test_count,predictions_count_mnb))
print("")
print("classification report - naive bayes classifier - MNB - tfidf") 
print(classification_report(labels_test_count,predictions_tfidf_mnb))
print("")
print("classification report - naive bayes classifier - MNB - tfidf") 
print(classification_report(labels_test_count,predictions_tfidf_ngrams_mnb))

classification report - naive bayes classifier - GNB - count
              precision    recall  f1-score   support

           0       0.56      0.74      0.64        42
           1       0.74      0.57      0.65        56

    accuracy                           0.64        98
   macro avg       0.65      0.65      0.64        98
weighted avg       0.67      0.64      0.64        98


classification report - naive bayes classifier - GNB - tfidf
              precision    recall  f1-score   support

           0       0.42      0.55      0.47        42
           1       0.56      0.43      0.48        56

    accuracy                           0.48        98
   macro avg       0.49      0.49      0.48        98
weighted avg       0.50      0.48      0.48        98


classification report - naive bayes classifier - GNB - tfidf
              precision    recall  f1-score   support

           0       0.52      0.76      0.62        42
           1       0.73      0.48      0.58        5

### save the best model predictions

In [57]:
best_naive_bayes_classifier_results = predictions_count_gnb

# SML: Support Vector Machines
   
*Important:* The support vector machines achieve better results with the complete article text than with the cleaned text.

In [117]:
#create a support vector machine (with random state for reproducibility)
svc_0 =svm.SVC(random_state=8)
#examine parameters
print('Parameters currently in use:\n')
pprint(svc_0.get_params())

Parameters currently in use:

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 8,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}


### Randomized Search Cross Validation   
   
#### Defining the grid

In [118]:
# C
C = [.0001, .001, .01]

# gamma
gamma = [.0001, .001, .01, .1, 1, 10, 100]

# degree
degree = [1, 2, 3, 4, 5]

# kernel
kernel = ['linear', 'rbf', 'poly']

# probability
probability = [True]

# Create the random grid
random_grid = {'C': C,
              'kernel': kernel,
              'gamma': gamma,
              'degree': degree,
              'probability': probability
             }

pprint(random_grid)

{'C': [0.0001, 0.001, 0.01],
 'degree': [1, 2, 3, 4, 5],
 'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
 'kernel': ['linear', 'rbf', 'poly'],
 'probability': [True]}


### Fitting the random search models

In [119]:
# First create the base model to tune
svc = svm.SVC(random_state=8)

# Definition of the random search
random_search = RandomizedSearchCV(estimator=svc,
                                   param_distributions=random_grid,
                                   n_iter=50,
                                   scoring='accuracy',
                                   cv=3, 
                                   verbose=1, 
                                   random_state=8)

random_search_count = random_search
random_search_tfidf = random_search
random_search_tfidf_ngrams = random_search

### Fit random search models and assess their results

In [120]:
# Fit the random search models
random_search_count.fit(features_train_count, labels_train_count)

print("The best hyperparameters from Random Search for count vectors are:")
print(random_search_count.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(random_search_count.best_score_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:   12.3s finished


The best hyperparameters from Random Search for count vectors are:
{'probability': True, 'kernel': 'poly', 'gamma': 10, 'degree': 1, 'C': 0.0001}

The mean accuracy of a model with these hyperparameters is:
0.6915324985092427


In [121]:
random_search_tfidf.fit(features_train_tfidf, labels_train_tfidf)

print("The best hyperparameters from Random Search for tfidf vectors are:")
print(random_search_tfidf.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(random_search_tfidf.best_score_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:   11.5s finished


The best hyperparameters from Random Search for tfidf vectors are:
{'probability': True, 'kernel': 'poly', 'gamma': 10, 'degree': 4, 'C': 0.01}

The mean accuracy of a model with these hyperparameters is:
0.681315841780958


In [122]:
random_search_tfidf_ngrams.fit(features_train_tfidf_ngrams, labels_train_tfidf_ngrams)

print("The best hyperparameters from Random Search for tfidf vectors with ngrams are:")
print(random_search_tfidf_ngrams.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(random_search_tfidf_ngrams.best_score_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:   11.4s finished


The best hyperparameters from Random Search for tfidf vectors with ngrams are:
{'probability': True, 'kernel': 'poly', 'gamma': 10, 'degree': 4, 'C': 0.01}

The mean accuracy of a model with these hyperparameters is:
0.7276088252832439


### Incorporating the results of the random search

In [130]:
# Create the parameter grid based on the results of random search 
C = [.001, .01, .1]
degree = [3, 4, 5]
gamma = [1, 10, 100]
probability = [True]

param_grid = [
  {'C': C, 'kernel':['linear'], 'probability':probability},
  {'C': C, 'kernel':['poly'], 'degree':degree, 'probability':probability},
  {'C': C, 'kernel':['rbf'], 'gamma':gamma, 'probability':probability}
]

# Create a base model
svc = svm.SVC(random_state=8)

# Manually create the splits in CV in order to be able to fix a random_state (GridSearchCV doesn't have that argument)
cv_sets = ShuffleSplit(n_splits = 3, test_size = .2, random_state = 8)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=svc, 
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=cv_sets,
                           verbose=1)

# Fit the grid search to the data
grid_search.fit(features_train_tfidf_ngrams, labels_train_tfidf_ngrams)

Fitting 3 folds for each of 21 candidates, totalling 63 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  63 out of  63 | elapsed:    7.3s finished


GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=8, test_size=0.2, train_size=None),
             estimator=SVC(random_state=8),
             param_grid=[{'C': [0.001, 0.01, 0.1], 'kernel': ['linear'],
                          'probability': [True]},
                         {'C': [0.001, 0.01, 0.1], 'degree': [3, 4, 5],
                          'kernel': ['poly'], 'probability': [True]},
                         {'C': [0.001, 0.01, 0.1], 'gamma': [1, 10, 100],
                          'kernel': ['rbf'], 'probability': [True]}],
             scoring='accuracy', verbose=1)

In [131]:
#print the best hyperparameters for the model
print("The best hyperparameters from Grid Search are:")
print(grid_search.best_params_)
#print the mean accuracy score of the model with the best hyperparameters
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(grid_search.best_score_)

The best hyperparameters from Grid Search are:
{'C': 0.1, 'degree': 4, 'kernel': 'poly', 'probability': True}

The mean accuracy of a model with these hyperparameters is:
0.6752136752136751


In [132]:
#select and save the best model
best_svc = grid_search.best_estimator_
#inspect the model
best_svc

SVC(C=0.1, degree=4, kernel='poly', probability=True, random_state=8)

In [133]:
#fit the model to the training data
best_svc.fit(features_train_count, labels_train_count)

#get predictions
svc_pred = best_svc.predict(features_test_count)

In [135]:
print("classification report - support vector machines - tfidf ngrams") 
print(classification_report(labels_test_tfidf_ngrams,svc_pred))

classification report - support vector machines - tfidf ngrams
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        42
           1       0.57      1.00      0.73        56

    accuracy                           0.57        98
   macro avg       0.29      0.50      0.36        98
weighted avg       0.33      0.57      0.42        98



### Comparing the new model to the default model for count and tfidf_ngram representations

In [149]:
base_model = svm.SVC(random_state = 8)
base_model.fit(features_train_count, labels_train_count)
accuracy_score(labels_test_count, base_model.predict(features_test_count))

0.7551020408163265

In [150]:
best_svc.fit(features_train_count, labels_train_count)
accuracy_score(labels_test_count, best_svc.predict(features_test_count))

0.5714285714285714

In [139]:
base_model = svm.SVC(random_state = 8)
base_model.fit(features_train_tfidf_ngrams, labels_train_tfidf_ngrams)
accuracy_score(labels_test_tfidf_ngrams, base_model.predict(features_test_tfidf_ngrams))

0.7448979591836735

In [143]:
best_svc.fit(features_train_tfidf_ngrams, labels_train_tfidf_ngrams)
accuracy_score(labels_test_tfidf_ngrams, best_svc.predict(features_test_tfidf_ngrams))

0.7142857142857143

### Saving the best model results

In [152]:
best_svm_classifier_results = base_model.predict(features_test_count)

In [153]:
print("classification report - support vector machines - base model - count") 
print(classification_report(labels_test_count,best_svm_classifier_results))

classification report - support vector machines - base model - count
              precision    recall  f1-score   support

           0       0.68      0.81      0.74        42
           1       0.83      0.71      0.77        56

    accuracy                           0.76        98
   macro avg       0.76      0.76      0.75        98
weighted avg       0.77      0.76      0.76        98



# SML: K-nearest neighbour
   
*Important*: The k nearest neighbour classifier performed better on the cleaned text 

## Cross-Validation for Hyperparameter tuning

In [33]:
#defining the algorithm and inspecting its parameters
knnc_0 =KNeighborsClassifier()

print('Parameters currently in use:\n')
pprint(knnc_0.get_params())

Parameters currently in use:

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}


### Grid Search Cross Validation

In [34]:
# Create the parameter grid 
n_neighbors = [int(x) for x in np.linspace(start = 1, stop = 300, num = 100)]

param_grid = {"n_neighbors": n_neighbors}

# Create a base model
knnc = KNeighborsClassifier()

# Manually create the splits in CV in order to be able to fix a random_state (GridSearchCV doesn't have that argument)
cv_sets = ShuffleSplit(n_splits = 3, test_size = .2, random_state = 8)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=knnc, 
                           param_grid=param_grid,
                           scoring="accuracy",
                           cv=cv_sets,
                           verbose=1)

grid_search_count = grid_search
grid_search_tfidf = grid_search
grid_search_tfidf_ngrams = grid_search

In [35]:
grid_search_count.fit(features_train_count, labels_train_count)

print("The best hyperparameters from Grid Search for count vectors are:")
print(grid_search_tfidf_ngrams.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(grid_search_tfidf_ngrams.best_score_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    3.9s finished


The best hyperparameters from Grid Search for count vectors are:
{'n_neighbors': 19}

The mean accuracy of a model with these hyperparameters is:
0.6239316239316239


In [36]:
grid_search_tfidf.fit(features_train_tfidf, labels_train_tfidf)

print("The best hyperparameters from Grid Search for tfidf vectors are:")
print(grid_search_tfidf_ngrams.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(grid_search_tfidf_ngrams.best_score_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    3.7s finished


The best hyperparameters from Grid Search for tfidf vectors are:
{'n_neighbors': 124}

The mean accuracy of a model with these hyperparameters is:
0.7051282051282052


In [37]:
grid_search_tfidf_ngrams.fit(features_train_tfidf_ngrams, labels_train_tfidf_ngrams)

print("The best hyperparameters from Grid Search for tfidf vectors with ngrams are:")
print(grid_search_tfidf_ngrams.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(grid_search_tfidf_ngrams.best_score_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    3.8s finished


The best hyperparameters from Grid Search for tfidf vectors with ngrams are:
{'n_neighbors': 179}

The mean accuracy of a model with these hyperparameters is:
0.7094017094017094


In [38]:
n_neighbors = [170,171,172,173,174,175,176,177,178,178,179,180,181,182,183,184,185,186,187,188]
param_grid = {'n_neighbors': n_neighbors}

knnc = KNeighborsClassifier()
cv_sets = ShuffleSplit(n_splits = 3, test_size = .2, random_state = 8)

grid_search = GridSearchCV(estimator=knnc, 
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=cv_sets,
                           verbose=1)

grid_search.fit(features_train_tfidf_ngrams, labels_train_tfidf_ngrams)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    0.2s finished


GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=8, test_size=0.2, train_size=None),
             estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [170, 171, 172, 173, 174, 175, 176, 177,
                                         178, 178, 179, 180, 181, 182, 183, 184,
                                         185, 186, 187, 188]},
             scoring='accuracy', verbose=1)

In [39]:
#save the best model
best_knnc = grid_search.best_estimator_
#inspect the best model
best_knnc

KNeighborsClassifier(n_neighbors=180)

## Model fit and performance

In [40]:
best_knnc.fit(features_train_tfidf_ngrams, labels_train_tfidf_ngrams)

KNeighborsClassifier(n_neighbors=180)

In [41]:
knnc_pred = best_knnc.predict(features_test_tfidf_ngrams)

In [42]:
print("classification report - K nearest neighbour - 13 neighbours - count vectors") 
print(classification_report(labels_test_count,best_knnc.predict(features_test_tfidf_ngrams)))

classification report - K nearest neighbour - 13 neighbours - count vectors
              precision    recall  f1-score   support

           0       0.70      0.71      0.71        42
           1       0.78      0.77      0.77        56

    accuracy                           0.74        98
   macro avg       0.74      0.74      0.74        98
weighted avg       0.75      0.74      0.75        98



### Compare the new model to the default model

In [43]:
base_model = KNeighborsClassifier()
base_model.fit(features_train_tfidf_ngrams, labels_train_tfidf_ngrams)
accuracy_score(labels_test_tfidf_ngrams, base_model.predict(features_test_tfidf_ngrams))

0.6530612244897959

In [44]:
best_knnc.fit(features_train_tfidf_ngrams, labels_train_tfidf_ngrams)
accuracy_score(labels_test_tfidf_ngrams, best_knnc.predict(features_test_tfidf_ngrams))

0.7448979591836735

### Save the best classification results for later comparison

In [45]:
best_knn_classifier_results = best_knnc.predict(features_test_tfidf_ngrams)

# SML: Stochastic Gradient Descent Classifier
   
*Important:* The SGL classifier performs best with the clean data.

In [47]:
classifier = SGDClassifier(loss="hinge", alpha=0.01, max_iter=200)

In [48]:
classifier.fit(features_train_count, labels_train_count)
count_pred = classifier.predict(features_test_count)
print("classification report - SGD classifier - count vectors") 
print(classification_report(labels_test_count, count_pred))

classification report - SGD classifier - count vectors
              precision    recall  f1-score   support

           0       0.43      0.52      0.47        42
           1       0.57      0.48      0.52        56

    accuracy                           0.50        98
   macro avg       0.50      0.50      0.50        98
weighted avg       0.51      0.50      0.50        98



In [49]:
classifier.fit(features_train_tfidf, labels_train_tfidf)
tfidf_pred = classifier.predict(features_test_tfidf)
print("classification report - SGD classifier - tfidf vectors") 
print(classification_report(labels_test_tfidf, tfidf_pred))

classification report - SGD classifier - tfidf vectors
              precision    recall  f1-score   support

           0       0.50      0.10      0.16        42
           1       0.58      0.93      0.71        56

    accuracy                           0.57        98
   macro avg       0.54      0.51      0.44        98
weighted avg       0.54      0.57      0.48        98



In [50]:
classifier.fit(features_train_tfidf_ngrams, labels_train_tfidf_ngrams)
tfidf_ngrams_pred = classifier.predict(features_test_tfidf_ngrams)
print("classification report - SGD classifier - tfidf vectors with ngrams") 
print(classification_report(labels_test_tfidf_ngrams, tfidf_ngrams_pred))

classification report - SGD classifier - tfidf vectors with ngrams
              precision    recall  f1-score   support

           0       0.80      0.57      0.67        42
           1       0.74      0.89      0.81        56

    accuracy                           0.76        98
   macro avg       0.77      0.73      0.74        98
weighted avg       0.76      0.76      0.75        98



## Final Model comparison

In [154]:
print("classification report - naive bayes classifier - MNB - count") 
print(classification_report(labels_test_count, best_naive_bayes_classifier_results))
print("")
print("classification report - support vector machines - base model - count") 
print(classification_report(labels_test_count,best_svm_classifier_results))
print("")
print("classification report - k nearest neighbour - 180 neighbours - tfidf vectors with ngrams") 
print(classification_report(labels_test_count,best_knn_classifier_results))
print("classification report - SGD classifier - tfidf vectors with ngrams") 
print(classification_report(labels_test_tfidf_ngrams, tfidf_ngrams_pred))

classification report - naive bayes classifier - MNB - count
              precision    recall  f1-score   support

           0       0.56      0.74      0.64        42
           1       0.74      0.57      0.65        56

    accuracy                           0.64        98
   macro avg       0.65      0.65      0.64        98
weighted avg       0.67      0.64      0.64        98


classification report - support vector machines - base model - count
              precision    recall  f1-score   support

           0       0.68      0.81      0.74        42
           1       0.83      0.71      0.77        56

    accuracy                           0.76        98
   macro avg       0.76      0.76      0.75        98
weighted avg       0.77      0.76      0.76        98


classification report - k nearest neighbour - 180 neighbours - tfidf vectors with ngrams
              precision    recall  f1-score   support

           0       0.70      0.71      0.71        42
           1    