## Open Dataset
Import Pandas and import csv

In [1]:
import pandas
import numpy
from joblib import dump, load

dataset = pandas.read_csv('dataset.csv')
#Remove non-important features
dataset = dataset.drop(['Unnamed: 0', 'news_link', 'group_id', 'num_sent', 'article', 'Label_opinion'], axis=1)
dataset.rename(columns = {'sentence':'origin_sentence'}, inplace = True)
dataset.head()

Unnamed: 0,origin_sentence,outlet,topic,type,Label_bias,biased_words4
0,YouTube is making clear there will be no “birt...,usa-today,elections-2020,center,Biased,"['belated', 'birtherism']"
1,The increasingly bitter dispute between Americ...,msnbc,sport,left,Non-biased,['bitter']
2,So while there may be a humanitarian crisis dr...,alternet,immigration,left,Biased,['crisis']
3,A professor who teaches climate change classes...,breitbart,environment,right,Non-biased,['legitimate']
4,"Looking around the United States, there is nev...",federalist,abortion,right,Biased,"['killing', 'never', 'developing', 'humans', '..."


## Remove No agreement values & Convert Label to Binary
1 is Biased, 0 is Non-Biased

In [2]:
dataset = dataset[dataset['Label_bias']!="No agreement"].reset_index()
dataset['label'] = [1 if val=="Biased" else 0 for val in dataset['Label_bias']]
dataset.head()

Unnamed: 0,index,origin_sentence,outlet,topic,type,Label_bias,biased_words4,label
0,0,YouTube is making clear there will be no “birt...,usa-today,elections-2020,center,Biased,"['belated', 'birtherism']",1
1,1,The increasingly bitter dispute between Americ...,msnbc,sport,left,Non-biased,['bitter'],0
2,2,So while there may be a humanitarian crisis dr...,alternet,immigration,left,Biased,['crisis'],1
3,3,A professor who teaches climate change classes...,breitbart,environment,right,Non-biased,['legitimate'],0
4,4,"Looking around the United States, there is nev...",federalist,abortion,right,Biased,"['killing', 'never', 'developing', 'humans', '...",1


## Prepare set with sentence metadata

In [3]:
from sklearn import preprocessing

outlets = preprocessing.LabelEncoder().fit(dataset['outlet'])
topics = preprocessing.LabelEncoder().fit(dataset['topic'])
types = preprocessing.LabelEncoder().fit(dataset['type'])

metaset = pandas.DataFrame(data={'outlet': outlets.transform(dataset['outlet']),
                                 'topic': topics.transform(dataset['topic']),
                                 'type': types.transform(dataset['type']),
                                 'label_bias': dataset['label']})

dump(metaset, 'Joblibs/metaset.joblib')

metaset.head()

Unnamed: 0,outlet,topic,type,label_bias
0,7,2,0,1
1,5,9,1,0
2,0,6,1,1
3,1,3,2,0
4,2,0,2,1


## Text Processing
Prepare for stemming and lemmatizing

In [4]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stopwords = set(stopwords.words('english'))
excluded = ['.', ',', '“', '”', '’', '–', '—', '―', '(', ')', ';', '--', '``', "''"]

## Stemming
Use snowball stemmer and generate a TF-IDF Confusion Matrix 

In [5]:
from nltk.stem import SnowballStemmer

stem_word_matrix = pandas.DataFrame(data=dataset['origin_sentence'])
total_terms = {}
stemmer = SnowballStemmer('english')

#Generate DataFrame With TF Confusion Matrix
row = 0
for sentence in stem_word_matrix['origin_sentence']:
    words = word_tokenize(sentence)
    word_num = 0
    doc_terms = {}
    terms_found = []
    for word in words:
        if word not in stopwords and word not in excluded:
            word_stem = stemmer.stem(word)
            word_num += 1
            word = word_stem.lower()
            if word in doc_terms:
                doc_terms[word] += 1
            else:
                doc_terms[word] = 1
            if word not in terms_found:
                terms_found.append(word)
                if word in total_terms:
                    total_terms[word] += 1
                else:
                    total_terms[word] = 1
    for word in doc_terms.keys():
        stem_word_matrix.at[row, word] = doc_terms[word]/word_num
    row += 1
stem_word_matrix = stem_word_matrix.fillna(0)

#Calculate IDF
for term in total_terms.keys():
    stem_word_matrix[term] *= numpy.log(stem_word_matrix['origin_sentence'].count() / total_terms[term])
    
stem_word_matrix['label_bias'] = dataset['label']

dump(stem_word_matrix, 'Joblibs/stem_word_matrix.joblib')

stem_word_matrix

Unnamed: 0,origin_sentence,youtub,make,clear,birther,platform,year,u.s.,presidenti,elect,...,lord,rob,hesit,hysteria,ratif,rodney,haaland,self-describ,bemoan,label_bias
0,YouTube is making clear there will be no “birt...,0.317691,0.168119,0.238283,0.432156,0.291104,0.159525,0.149087,0.205677,0.191312,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,1
1,The increasingly bitter dispute between Americ...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.093869,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0
2,So while there may be a humanitarian crisis dr...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,1
3,A professor who teaches climate change classes...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0
4,"Looking around the United States, there is nev...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1546,In every case legislators are being swarmed by...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.408148,0.408148,0.408148,0.00000,0.00000,0.000000,0.000000,1
1547,Polls show the transgender ideology is deeply ...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,1
1548,Democrats and Republicans stood and applauded ...,0.000000,0.124262,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.31942,0.31942,0.000000,0.000000,0
1549,"As a self-described Democratic socialist, Sen....",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.667878,0.000000,0


## Lemmatizing
Use WordNetLemmatizer and generate a TF-IDF Confusion Matrix

In [6]:
from nltk.stem import WordNetLemmatizer

lem_word_matrix = pandas.DataFrame(data=dataset['origin_sentence'])
total_terms = {}
lemmatizer = WordNetLemmatizer()

row = 0
for sentence in lem_word_matrix['origin_sentence']:
    words = word_tokenize(sentence)
    word_num = 0
    doc_terms = {}
    terms_found = []
    for word in words:
        if word not in stopwords and word not in excluded:
            word_lem = lemmatizer.lemmatize(word)
            word_num += 1
            word = word_lem.lower()
            if word in doc_terms:
                doc_terms[word] += 1
            else:
                doc_terms[word] = 1
            if word not in terms_found:
                terms_found.append(word)
                if word in total_terms:
                    total_terms[word] += 1
                else:
                    total_terms[word] = 1
    for word in doc_terms.keys():
        lem_word_matrix.at[row, word] = doc_terms[word]/word_num
    row += 1
lem_word_matrix = lem_word_matrix.fillna(0)

#Calculate IDF
for term in total_terms.keys():
        lem_word_matrix[term] *= numpy.log(lem_word_matrix['origin_sentence'].count() / total_terms[term])

lem_word_matrix['label_bias'] = dataset['label']

dump(lem_word_matrix, 'Joblibs/lem_word_matrix.joblib')

lem_word_matrix

Unnamed: 0,origin_sentence,youtube,making,clear,birtherism,platform,year,u.s.,presidential,election,...,happening,polls,rodney,davis,saluted,haaland,preside,self-described,bemoaned,label_bias
0,YouTube is making clear there will be no “birt...,0.317691,0.232086,0.265497,0.432156,0.291104,0.160099,0.149087,0.205677,0.204438,...,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,1
1,The increasingly bitter dispute between Americ...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.093869,0.000000,0.000000,...,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0
2,So while there may be a humanitarian crisis dr...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,1
3,A professor who teaches climate change classes...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0
4,"Looking around the United States, there is nev...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1546,In every case legislators are being swarmed by...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.408148,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,1
1547,Polls show the transgender ideology is deeply ...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.734666,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,1
1548,Democrats and Republicans stood and applauded ...,0.000000,0.171542,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.31942,0.31942,0.31942,0.31942,0.31942,0.000000,0.000000,0
1549,"As a self-described Democratic socialist, Sen....",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.667878,0.000000,0


## Create metrics and split datasets as needed

In [7]:
from sklearn.model_selection import cross_validate

scoring = ['precision_macro', 'recall_macro']

try:
    metaset
    stem_word_matrix
    lem_word_matrix
except NameError:
    metaset = load('Joblibs/metaset.joblib')
    stem_word_matrix = load('Joblibs/stem_word_matrix.joblib')
    lem_word_matrix = load('Joblibs/lem_word_matrix.joblib')

metaset_data = metaset.drop(['label_bias'], axis=1)
metaset_label = metaset['label_bias']
stem_word_matrix_data = stem_word_matrix.drop(['label_bias', 'origin_sentence'], axis=1)
stem_word_matrix_label = stem_word_matrix['label_bias']
lem_word_matrix_data = lem_word_matrix.drop(['label_bias', 'origin_sentence'], axis=1)
lem_word_matrix_label = lem_word_matrix['label_bias']

## Decision Trees

In [8]:
from sklearn.tree import DecisionTreeClassifier

#Parameters
dt = DecisionTreeClassifier(criterion='gini', # gini | entropy
                            splitter='best', # best | random
                            max_depth=5, # int | None
                            max_features='auto') # auto | sqrt | log2 | None 

scores = cross_validate(dt, metaset_data, metaset_label, scoring=scoring, cv=10)
dt_scores = {}
for score in scores:
    summ = 0
    for value in scores[score]:
        summ += value
    dt_scores[score] = summ / 10

dump(dt, 'Joblibs/decision_trees.joblib')
    
dt_scores

{'fit_time': 0.0032447099685668944,
 'score_time': 0.004186248779296875,
 'test_precision_macro': 0.7217190424374867,
 'test_recall_macro': 0.6476015933076844}

## K-Nearest Neighbours

In [9]:
from sklearn.neighbors import KNeighborsClassifier

#Parameters
knns = KNeighborsClassifier(n_neighbors=20, # int
                            weights='uniform', # uniform | distance
                            algorithm='ball_tree', # auto | ball_tree | kd_tree | brute
                            leaf_size=30, # int
                            p=2, # manhattan_dist (1) | euclidean_dist (2) | minkowski_dist(3)
                            n_jobs=2) # threads

scores = cross_validate(knns, metaset_data, metaset_label.values.ravel(), scoring=scoring, cv=10)
knn_scores = {}
for score in scores:
    summ = 0
    for value in scores[score]:
        summ += value
    knn_scores[score] = summ / 10
    
dump(knns, 'Joblibs/k-nearest_neighbors.joblib')

knn_scores

{'fit_time': 0.005458831787109375,
 'score_time': 0.26069231033325196,
 'test_precision_macro': 0.7209633209334845,
 'test_recall_macro': 0.6926802086423959}

## Gradient Boosted Decision Trees

### With Stemmatization

In [10]:
from sklearn.ensemble import GradientBoostingClassifier

#Parameters
gbdt_stem = GradientBoostingClassifier(loss='deviance', # deviance | exponential
                                       learning_rate=0.01, # float
                                       n_estimators=80, # int
                                       subsample=0.4, # float
                                       max_depth=8) # int

scores = cross_validate(gbdt_stem, stem_word_matrix_data, stem_word_matrix_label, scoring=scoring, cv=10)
gbdt_stem_scores = {}
for score in scores:
    summ = 0
    for value in scores[score]:
        summ += value
    gbdt_stem_scores[score] = summ / 10

dump(gbdt_stem, 'Joblibs/gradient_boosted_stem.joblib')
    
gbdt_stem_scores

{'fit_time': 7.846212577819824,
 'score_time': 0.059369778633117674,
 'test_precision_macro': 0.7142249552756057,
 'test_recall_macro': 0.5478499524018254}

### With Lemmatization

In [11]:
from sklearn.ensemble import GradientBoostingClassifier

#Parameters
gbdt_lem = GradientBoostingClassifier(loss='exponential', # deviance | exponential
                                      learning_rate=0.01, # float
                                      n_estimators=80, # int
                                      subsample=0.4, # float
                                      max_depth=2) # int

scores = cross_validate(gbdt_lem, lem_word_matrix_data, lem_word_matrix_label, scoring=scoring, cv=10)
gbdt_lem_scores = {}
for score in scores:
    summ = 0
    for value in scores[score]:
        summ += value
    gbdt_lem_scores[score] = summ / 10

dump(gbdt_lem, 'Joblibs/gradient_boosted_lem.joblib')
    
gbdt_lem_scores

{'fit_time': 3.2826137065887453,
 'score_time': 0.07647171020507812,
 'test_precision_macro': 0.7223479750787789,
 'test_recall_macro': 0.527427629000414}

## Multi-Layer Perceptron

### With Stemmatization

In [12]:
from sklearn.neural_network import MLPClassifier

#Parameters
mlp_stem = MLPClassifier(hidden_layer_sizes=(200, 100, 50), # tuple | int
                         activation='logistic', # identity | logistic | tanh | relu
                         solver='adam', #lbfgs | sgd | adam
                         learning_rate='adaptive', # constant | invscaling | adaptive
                         max_iter=200) # int

scores = cross_validate(mlp_stem, stem_word_matrix_data, stem_word_matrix_label, scoring=scoring, cv=10)
mlp_stem_scores = {}
for score in scores:
    summ = 0
    for value in scores[score]:
        summ += value
    mlp_stem_scores[score] = summ / 10

dump(mlp_stem, 'Joblibs/multi_layer_perceptron_stem.joblib')
    
mlp_stem_scores

{'fit_time': 93.04257156848908,
 'score_time': 0.12363924980163574,
 'test_precision_macro': 0.6995379831207054,
 'test_recall_macro': 0.6701313111333331}

### With Lemmatization

In [13]:
from sklearn.neural_network import MLPClassifier

#Parameters
mlp_lem = MLPClassifier(hidden_layer_sizes=(750, 500, 250, 100), # tuple | int
                        activation='tanh', # identity | logistic | tanh | relu
                        solver='adam', #lbfgs | sgd | adam
                        learning_rate='adaptive', # constant | invscaling | adaptive
                        max_iter=300) # int

scores = cross_validate(mlp_lem, lem_word_matrix_data, lem_word_matrix_label, scoring=scoring, cv=10)
mlp_lem_scores = {}
for score in scores:
    summ = 0
    for value in scores[score]:
        summ += value
    mlp_lem_scores[score] = summ / 10

dump(mlp_lem, 'Joblibs/multi_layer_perceptron_lem.joblib')    

mlp_lem_scores

{'fit_time': 67.97297763824463,
 'score_time': 0.24845488071441652,
 'test_precision_macro': 0.6991661873328471,
 'test_recall_macro': 0.6336320276496759}

## Voting Classifier
### Create a dataset to rule them all

In [14]:
data = {"origin_sentence": dataset["origin_sentence"]}

for column in metaset_data:
    data["meta_"+column] = metaset_data[column]
for column in stem_word_matrix_data:
    data["stem_"+column] = stem_word_matrix_data[column]
print(len(data))
for column in lem_word_matrix_data:
    data["lem_"+column] = lem_word_matrix_data[column]
data['label_bias'] = dataset['label']

big_data = pandas.DataFrame(data=data)

dump(big_data, 'Joblibs/big_data.joblib')

big_data

6302


Unnamed: 0,origin_sentence,meta_outlet,meta_topic,meta_type,stem_youtub,stem_make,stem_clear,stem_birther,stem_platform,stem_year,...,lem_happening,lem_polls,lem_rodney,lem_davis,lem_saluted,lem_haaland,lem_preside,lem_self-described,lem_bemoaned,label_bias
0,YouTube is making clear there will be no “birt...,7,2,0,0.317691,0.168119,0.238283,0.432156,0.291104,0.159525,...,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,1
1,The increasingly bitter dispute between Americ...,5,9,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0
2,So while there may be a humanitarian crisis dr...,0,6,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,1
3,A professor who teaches climate change classes...,1,3,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0
4,"Looking around the United States, there is nev...",2,0,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1546,In every case legislators are being swarmed by...,0,4,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.408148,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,1
1547,Polls show the transgender ideology is deeply ...,1,4,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.734666,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,1
1548,Democrats and Republicans stood and applauded ...,7,4,0,0.000000,0.124262,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.31942,0.31942,0.31942,0.31942,0.31942,0.000000,0.000000,0
1549,"As a self-described Democratic socialist, Sen....",3,8,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.667878,0.000000,0


### Implement voting classifier through pipeline

In [15]:
from sklearn import model_selection
from sklearn.pipeline import make_pipeline
from mlxtend.feature_selection import ColumnSelector
from mlxtend.classifier import EnsembleVoteClassifier

try:
    big_data
    dt
    knns
    gbdt_stem
    gbdt_lem
    mlp_stem
    mlp_lem
except NameError:
    big_data = load('Joblibs/big_data.joblib')
    dt = load('Joblibs/decision_trees.joblib')
    knns = load('Joblibs/k-nearest_neighbors.joblib')
    gbdt_stem = load('Joblibs/gradient_boosted_stem.joblib')
    gbdt_lem = load('Joblibs/gradient_boosted_lem.joblib')
    mlp_stem = load('Joblibs/multi_layer_perceptron_stem.joblib')
    mlp_lem = load('Joblibs/multi_layer_perceptron_lem.joblib')

classifiers = [make_pipeline(ColumnSelector(cols=(1, 3)), dt),
               make_pipeline(ColumnSelector(cols=(1, 3)), knns),
               make_pipeline(ColumnSelector(cols=(4, 6302)), gbdt_stem),
               make_pipeline(ColumnSelector(cols=(4, 6302)), mlp_stem),
               make_pipeline(ColumnSelector(cols=(6302, -1)), gbdt_lem),
               make_pipeline(ColumnSelector(cols=(6302, -1)), mlp_lem)
              ]
voters = EnsembleVoteClassifier(clfs=classifiers, voting='soft', weights=[1.4, 1.4, 1, 0.2, 0.2, 1])

scores = model_selection.cross_val_score(voters,
                                         big_data.drop(['label_bias'], axis=1),
                                         big_data['label_bias'],
                                         cv=10,
                                         scoring='accuracy')

print(f"Accuracy: %0.2f)" % (scores.mean()))

Accuracy: 0.73)


In [17]:
big_data.to_csv('final_dataset.csv', index=False)
lem_word_matrix.to_csv('lem_word.csv', index=False)
stem_word_matrix.to_csv('stem_word.csv', index=False)