# Assessing impartiality
   
In this notebook, I use three classifiers to predict the impartiality indicators neutrality, balance of actors, and balance of viewpoints, before combining them into an overall index. 

## Load packages

In [1]:
#import relevant packages
import pandas as pd
from pandas import read_excel
import joblib
import numpy as np
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import RandomizedSearchCV, ShuffleSplit, GridSearchCV, train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

## Import classifiers

In [2]:
classifier_neu = joblib.load("classifier_neu.pkl")
classifier_boa = joblib.load("classifier_boa.pkl")
classifier_bov = joblib.load("classifier_bov.pkl")

## Read data

In [3]:
#read in the complete cleaned dataset
df = read_excel("complete_data_cleaned.xlsx")
mca = read_excel("mca_cleaned.xlsx")
#inspect data
print(len(df), len(mca))

11491 487


## Vectorise data

In [6]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer="word", 
                             token_pattern=r"\w{1,10}", 
                             min_df = 10, 
                             max_df = 1., 
                             max_features=200)

#features for training
count_vectors_clean = count_vect.fit_transform(df["clean text"]).toarray()
count_vectors_full = count_vect.fit_transform(df["Article"]).toarray()

#features for testing
count_vectors_clean2 = count_vect.fit_transform(mca["clean text"]).toarray()
count_vectors_full2 = count_vect.fit_transform(mca["Article"]).toarray()

#inspect the shape
print(count_vectors_clean.shape)
print(count_vectors_clean2.shape)

(11491, 200)
(487, 200)


In [7]:
# unigrams
tfidf_vect_ug = TfidfVectorizer(analyzer='word', 
                             token_pattern=r'\w{1,}',
                             ngram_range = (1,1),
                             min_df = 10, 
                             max_df = 1., 
                             max_features=200)

# bigrams
tfidf_vect_bg = TfidfVectorizer(analyzer='word', 
                             token_pattern=r'\w{1,}',
                             ngram_range = (2,2),
                             min_df = 10, 
                             max_df = 1., 
                             max_features=5) #otherwise there is an error

# unigrams and bigrams
tfidf_vect_ubg = TfidfVectorizer(analyzer='word', 
                             token_pattern=r'\w{1,}',
                             ngram_range = (1,2),
                             min_df = 10, 
                             max_df = 1., 
                             max_features=200)


#unigrams
tfidf_ug_vectors_clean = tfidf_vect_ug.fit_transform(df["clean text"]).toarray()
tfidf_ug_vectors_clean2 = tfidf_vect_ug.fit_transform(mca["clean text"]).toarray()
tfidf_ug_vectors_full = tfidf_vect_ug.fit_transform(df["Article"]).toarray()
tfidf_ug_vectors_full2 = tfidf_vect_ug.fit_transform(mca["Article"]).toarray()
#bigrams
tfidf_bg_vectors_clean = tfidf_vect_bg.fit_transform(df["clean text"]).toarray()
tfidf_bg_vectors_clean2 = tfidf_vect_bg.fit_transform(mca["clean text"]).toarray()
tfidf_bg_vectors_full = tfidf_vect_bg.fit_transform(df["Article"]).toarray()
tfidf_bg_vectors_full2 = tfidf_vect_bg.fit_transform(mca["Article"]).toarray()
#uni and bigrams
tfidf_ubg_vectors_clean = tfidf_vect_ubg.fit_transform(df["clean text"]).toarray()
tfidf_ubg_vectors_clean2 = tfidf_vect_ubg.fit_transform(mca["clean text"]).toarray()
tfidf_ubg_vectors_full = tfidf_vect_ubg.fit_transform(df["Article"]).toarray()
tfidf_ubg_vectors_full2 = tfidf_vect_ubg.fit_transform(mca["Article"]).toarray()

#inspect the shape
print(tfidf_ug_vectors_clean.shape)
print(tfidf_ug_vectors_clean2.shape)

(11491, 200)
(487, 200)


## Apply the classifiers

In [8]:
results = classifier_neu.predict(count_vectors_clean)
#save the results in a new column
df["neutrality dummy"] = results
#inspect the results
df["neutrality dummy"].value_counts()

0    5779
1    5712
Name: neutrality dummy, dtype: int64

In [9]:
results = classifier_boa.predict(tfidf_ubg_vectors_full)
#save the results in a new column
df["balance of actors dummy"] = results
#inspect the results
df["balance of actors dummy"].value_counts()

0    8410
1    3081
Name: balance of actors dummy, dtype: int64

In [10]:
results = classifier_boa.predict(count_vectors_full)
#save the results in a new column
df["balance of viewpoints dummy"] = results
#inspect the results
df["balance of viewpoints dummy"].value_counts()

1    7144
0    4347
Name: balance of viewpoints dummy, dtype: int64

## Create index

In [11]:
df["impartiality"] = df["balance of viewpoints dummy"] + df["balance of actors dummy"] + df["neutrality dummy"]

## Inspect data

In [12]:
df.groupby("Newspaper").mean()

Unnamed: 0_level_0,Unnamed: 0,ID,Length,words in clean text,reach_dummy,modality_dummy,neutrality dummy,balance of actors dummy,balance of viewpoints dummy,impartiality
Newspaper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Aachener Zeitung,492.615464,493.615464,484.008247,294.691753,0.0,1.0,0.586598,0.286598,0.61134,1.484536
Der Tagesspiegel,10927.010886,10928.010886,574.946345,345.008554,1.0,1.0,0.462675,0.250389,0.64619,1.359253
Die Welt,1407.464501,1408.464501,774.438026,458.566787,1.0,1.0,0.216606,0.268351,0.593261,1.078219
Rheinische Post,3486.029474,3487.029474,377.456,227.909895,0.0,1.0,0.616,0.227789,0.663579,1.507368
Stuttgarter Zeitung,5749.415521,5750.415521,394.241714,236.194826,0.0,1.0,0.586904,0.264349,0.599838,1.451091
Süddeutsche Zeitung (inkl. Regionalausgaben),8341.211828,8342.211828,529.366129,312.950806,1.0,1.0,0.428226,0.295699,0.620699,1.344624
aachener zeitung (www),4215.535714,104215.535714,401.488095,247.35119,0.0,0.0,0.595238,0.345238,0.52381,1.464286
der tagesspiegel (www),4147.094697,104147.094697,574.189394,340.988636,1.0,0.0,0.390152,0.204545,0.537879,1.132576
die welt (www),3980.39548,103980.39548,578.751412,346.711864,1.0,0.0,0.440678,0.293785,0.564972,1.299435
rheinische post (www),4260.479769,104260.479769,338.699422,206.323699,0.0,0.0,0.710983,0.248555,0.653179,1.612717


## Save data

In [14]:
df.to_excel("complete_data_cleaned_with_impartiality.xlsx")