## This script corresponds to section 3.3 - Evaluation of data sets - of the paper "Neural Media Bias Detection Using Distant Supervision With BABE"


To run this script, you need the following files found in the /data directory:
- "raw_labels_MBIC.xlsx"
- "raw_labels_SG1.xlsx"
- "raw_labels_SG2.xlsx"
- "final_labels_MBIC.xlsx"
- "final_labels_SG1.xlsx"
- "final_labels_SG2.xlsx"

In [294]:
import ast
import math
import os
import statistics

import krippendorff
import numpy as np
import pandas as pd
# import statsmodels
# from statsmodels.stats import inter_rater

#### Load raw labels of all subgroups containing all individual annotations for agreement calculations

In [2]:
# Read the three raw annotation workbooks; the paths are resolved against the
# current working directory.
MBIC_raw, SG1_raw, SG2_raw = (
    pd.read_excel(raw_path)
    for raw_path in (
        "data/raw_labels_MBIC.xlsx",
        "data/raw_labels_SG1.xlsx",
        "data/raw_labels_SG2.xlsx",
    )
)

Some annotation dataframes (`df_id`) contain labels from several different annotator ids:

In [118]:
# Distinct annotator ids found under each df_id of the raw SG2 annotations
annotator_ids_by_frame = SG2_raw.groupby('df_id')['annotator_id']
annotator_ids_by_frame.unique()

df_id
1          [3]
2          [9]
3       [2, 1]
4    [7, 6, 5]
5         [11]
Name: annotator_id, dtype: object

In [245]:
# Number of annotation rows recorded for each annotator id
SG2_raw.annotator_id.value_counts()

3     3674
9     3674
11    3666
2     2695
7     1692
5      999
6      983
1      979
Name: annotator_id, dtype: int64

Distribution of all bias and not bias annotations

In [156]:
# Frequency of each binary bias label; missing labels get their own bucket
raw_bias_flags = SG2_raw['Label_bias_0-1']
raw_bias_flags.value_counts(dropna=False)

0.0    9362
1.0    8999
NaN       1
Name: Label_bias_0-1, dtype: int64

In [195]:
# Print the column dtypes and non-null counts of the raw SG2 annotations
SG2_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18362 entries, 0 to 18361
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   text            18361 non-null  object 
 1   news_link       18201 non-null  object 
 2   outlet          18362 non-null  object 
 3   topic           17481 non-null  object 
 4   type            13382 non-null  object 
 5   label_bias      18362 non-null  object 
 6   label_opinion   18362 non-null  object 
 7   biased_words    7814 non-null   object 
 8   annotator_id    18362 non-null  int64  
 9   Label_bias_0-1  18361 non-null  float64
 10  df_id           18362 non-null  int64  
dtypes: float64(1), int64(2), object(8)
memory usage: 1.5+ MB


In [101]:
# Group the raw SG2 annotations by sentence text: once for all rows, once for
# the rows labelled biased (1) and once for the rows labelled not biased (0).
sentence_bias_flag = SG2_raw['Label_bias_0-1']
rows_marked_biased = SG2_raw[sentence_bias_flag == 1]
rows_marked_not_biased = SG2_raw[sentence_bias_flag == 0]

grouped_by_sentence = SG2_raw.groupby('text')
grouped_by_biased_sentence = rows_marked_biased.groupby('text')
grouped_by_not_biased_sentence = rows_marked_not_biased.groupby('text')

In [102]:
# Summary statistics per sentence for each of the three groupings
(
    grouped_by_sentence_describe,
    grouped_by_biased_sentence_describe,
    grouped_by_not_biased_sentence_describe,
) = (
    grouping.describe()
    for grouping in (
        grouped_by_sentence,
        grouped_by_biased_sentence,
        grouped_by_not_biased_sentence,
    )
)

In [243]:
# Sanity check: each describe() table has one row per distinct sentence text
for caption, per_sentence_summary in (
    ('Amount of sentences: ', grouped_by_sentence_describe),
    ('Amount of sentences that were annotated as biased once: ', grouped_by_biased_sentence_describe),
    ('Amount of sentences that were annotated as not biased once: ', grouped_by_not_biased_sentence_describe),
):
    print(caption, len(per_sentence_summary.index))

Amount of sentences:  3699
Amount of sentences that were annotated as biased once:  2944
Amount of sentences that were annotated as not biased once:  2995


Sentences per annotation count

In [257]:
# For every sentence, pair the number of "biased" rows with the number of
# "not biased" rows, then count how often each (biased, not-biased) pattern occurs.
biased_votes = grouped_by_biased_sentence_describe['df_id']['count']
not_biased_votes = grouped_by_not_biased_sentence_describe['df_id']['count']
votes_per_sentence = pd.DataFrame({'biased': biased_votes, 'not-biased': not_biased_votes})

biased_and_not_biased_counts = votes_per_sentence.value_counts(dropna=False)
biased_and_not_biased_counts

biased  not-biased
NaN     5.0           739
5.0     NaN           682
1.0     4.0           617
4.0     1.0           570
3.0     2.0           540
2.0     3.0           491
4.0     NaN            11
1.0     NaN             8
NaN     4.0             7
1.0     1.0             7
        2.0             6
NaN     1.0             6
2.0     1.0             3
        2.0             2
        NaN             2
3.0     1.0             2
1.0     3.0             2
NaN     2.0             2
3.0     NaN             1
NaN     3.0             1
dtype: int64

In [262]:
# Add up the sentence counts of every vote pattern that occurs more than 8 times.
# NOTE(review): the "> 8" cut-off filters on how frequent a (biased, not-biased)
# pattern is, not on whether one side actually has more votes - confirm that this
# is the intended definition of "majority vote".
majority_vote_sentences = biased_and_not_biased_counts.where(biased_and_not_biased_counts > 8).sum()
print("Sentences with majority vote", majority_vote_sentences)

Sentences with majority vote 3650.0


In [318]:
# Per sentence: number of distinct `biased_words` entries that are strings
# (non-string entries such as missing values are not counted).
test = grouped_by_sentence['biased_words'].apply(
    lambda annotations: sum(isinstance(entry, str) for entry in set(annotations))
)
# Select by value with a boolean mask. Series.filter() matches index labels,
# so passing it a boolean Series would return an empty result.
test[test > 4]

numpy.int64

In [92]:
# One row per df_id, one column per sentence text, cells hold the binary bias label
SG2_bias = SG2_raw.pivot(
    values='Label_bias_0-1',
    index='df_id',
    columns='text',
)

In [3]:
#### Preprocess labels: encode the string labels as integers for the agreement calculations.
# Each frame is paired with its (label text, integer code) replacements, applied
# in the listed order. The label spellings are reproduced exactly as they are
# matched against the raw data (presumably including typos present in the SG1
# file - verify against the workbook).
label_encodings = (
    (MBIC_raw, (
        ('Biased', 1),
        ('Non-biased', 0),
        ('Expresses writer’s opinion', 2),
        ('Somewhat factual but also opinionated', 1),
        ('Entirely factual', 0),
    )),
    (SG1_raw, (
        ("Expresses wleter´s opinion", 2),
        ("Expresses writer’s opinion", 2),
        ('Somewhat factional but also opinionated', 1),
        ('Entirely factual', 0),
    )),
    (SG2_raw, (
        ('Expresses writer’s opinion', 2),
        ('Somewhat factual but also opinionated', 1),
        ('Entirely factual', 0),
    )),
)

for raw_frame, replacements in label_encodings:
    for label_text, label_code in replacements:
        raw_frame.replace(to_replace=label_text, value=label_code, inplace=True)

#### Bias agreement

In [4]:
# Rater-by-sentence tables of bias labels (rows: rater key, columns: sentence text)
MBIC_bias = MBIC_raw.pivot(
    values='label_bias',
    index='survey_record_id',
    columns='text',
)
SG1_bias = SG1_raw.pivot(
    values='Label_bias_0-1',
    index='annotator_id',
    columns='text',
)
SG2_bias = SG2_raw.pivot(
    values='Label_bias_0-1',
    index='df_id',
    columns='text',
)

In [5]:
# Krippendorff's alpha of the bias labels, computed and reported per corpus
bias_alpha_jobs = (
    ("Krippendorff's alpha for bias labels in MBIC = {}", MBIC_bias),
    ("Krippendorff's alpha for bias labels in SG1 = {}", SG1_bias),
    ("Krippendorff's alpha for bias labels in SG2 = {}", SG2_bias),
)

bias_alphas = []
for report_template, rater_table in bias_alpha_jobs:
    corpus_alpha = krippendorff.alpha(rater_table)
    print(report_template.format(round(corpus_alpha, 2)))
    bias_alphas.append(corpus_alpha)

bias_alpha_MBIC, bias_alpha_SG1, bias_alpha_SG2 = bias_alphas

Krippendorff's alpha for bias labels in MBIC = 0.21
Krippendorff's alpha for bias labels in SG1 = 0.39
Krippendorff's alpha for bias labels in SG2 = 0.4


#### Opinion agreement

In [6]:
# Rater-by-sentence tables of opinion labels (rows: rater key, columns: sentence text)
MBIC_opin = MBIC_raw.pivot(
    values='label_opinion',
    index='survey_record_id',
    columns='text',
)
SG1_opin = SG1_raw.pivot(
    values='label_opinion',
    index='annotator_id',
    columns='text',
)
SG2_opin = SG2_raw.pivot(
    values='label_opinion',
    index='df_id',
    columns='text',
)

In [7]:
# Krippendorff's alpha of the opinion labels, computed and reported per corpus
opinion_alpha_jobs = (
    ("Krippendorff's alpha for opinion labels in MBIC = {}", MBIC_opin),
    ("Krippendorff's alpha for opinion labels in SG1 = {}", SG1_opin),
    ("Krippendorff's alpha for opinion labels in SG2 = {}", SG2_opin),
)

opinion_alphas = []
for report_template, rater_table in opinion_alpha_jobs:
    corpus_alpha = krippendorff.alpha(rater_table)
    print(report_template.format(round(corpus_alpha, 2)))
    opinion_alphas.append(corpus_alpha)

opin_alpha_MBIC, opin_alpha_SG1, opin_alpha_SG2 = opinion_alphas

Krippendorff's alpha for opinion labels in MBIC = 0.26
Krippendorff's alpha for opinion labels in SG1 = 0.46
Krippendorff's alpha for opinion labels in SG2 = 0.6


#### Load aggregated labels of all subgroups for calculation of descriptive statistics

In [8]:
# Aggregated (final) labels per sentence. The files live in the same data/
# directory as the raw label workbooks loaded at the top of the notebook, so
# the same relative prefix is used here.
MBIC = pd.read_excel("data/final_labels_MBIC.xlsx")
SG1 = pd.read_excel("data/final_labels_SG1.xlsx")
SG2 = pd.read_excel("data/final_labels_SG2.xlsx")

  warn(msg)


#### Number of Biased words per biased sentence

In [9]:
def _as_word_list(cell):
    """Return the content of one `biased_words` cell as a list.

    Strings are parsed with ast.literal_eval (the serialized form read from the
    workbook). Values that are already a list/tuple/set are passed through, so
    re-running this cell after the column has been parsed does not fail.
    """
    if isinstance(cell, (list, tuple, set)):
        return list(cell)
    return list(ast.literal_eval(cell))


# Parse the biased-word lists, count the words per sentence and report the mean
# over the sentences that have at least one biased word.
for corpus_name, corpus in (("MBIC", MBIC), ("SG1", SG1), ("SG2", SG2)):
    corpus["biased_words"] = corpus["biased_words"].apply(_as_word_list)
    corpus['num_biased_words'] = corpus["biased_words"].apply(len)
    sent_with_biased_words = corpus[corpus['num_biased_words'] > 0]
    print(
        '{}: Average number of biased words in the biased sentences:'.format(corpus_name),
        round(sent_with_biased_words.num_biased_words.mean(), 2),
    )

MBIC: Average number of biased words in the biased sentences: 2.4
SG1: Average number of biased words in the biased sentences: 1.95
SG2: Average number of biased words in the biased sentences: 2.11


#### Number of Total biased words

In [10]:
# Total number of whitespace-separated words over all sentences.
# MBIC and SG1 comprise identical sentences, so the SG1 total is reused for MBIC.
sum_words_SG1 = sum(len(sentence.split()) for sentence in SG1['text'])
sum_words_SG2 = sum(len(sentence.split()) for sentence in SG2['text'])

In [11]:
# Total number of words labelled as biased in each corpus
biased_words_sum_MBIC = MBIC['num_biased_words'].sum()
biased_words_sum_SG1 = SG1['num_biased_words'].sum()
biased_words_sum_SG2 = SG2['num_biased_words'].sum()

# (template, biased words, all words); MBIC is reported against the SG1 word total
biased_word_reports = (
    ("{} out of {} words are labeled as biased in MBIC", biased_words_sum_MBIC, sum_words_SG1),
    ("{} out of {} words are labeled as biased in SG1", biased_words_sum_SG1, sum_words_SG1),
    ("{} out of {} words are labeled as biased in SG2", biased_words_sum_SG2, sum_words_SG2),
)
for report_template, biased_total, word_total in biased_word_reports:
    print(report_template.format(biased_total, word_total))

3283 out of 56826 words are labeled as biased in MBIC
1530 out of 56826 words are labeled as biased in SG1
3902 out of 116232 words are labeled as biased in SG2


#### Bias Label Distribution

In [12]:
def _bias_label_distribution(labels_df, display_order=('Biased', 'Non-biased', 'No agreement')):
    """Count sentences per bias label and add their percentage share.

    Parameters
    ----------
    labels_df : DataFrame with at least the columns 'label_bias' and 'text'.
    display_order : labels in the order the rows should appear. Rows are ranked
        by label name rather than by position, so a missing or additional label
        no longer breaks the cell; labels not listed are placed last.

    Returns
    -------
    DataFrame with the columns 'label_bias', 'num_sentences' and 'percentage '
    (the trailing space in the last column name is kept as before).
    """
    counts = (
        labels_df.groupby(['label_bias'])[['text']]
        .count()
        .reset_index()
        .rename(columns={"text": "num_sentences"})
    )
    rank = {label: position for position, label in enumerate(display_order)}
    ordered = (
        counts.assign(sorting=counts['label_bias'].map(rank))
        .sort_values(by=['sorting'])
        .reset_index(drop=True)
    )[['label_bias', 'num_sentences']]
    share = ordered['num_sentences'] / ordered['num_sentences'].sum() * 100
    return ordered.assign(**{'percentage ': share})


bias_obs_MBIC = _bias_label_distribution(MBIC)
bias_obs_SG1 = _bias_label_distribution(SG1)
bias_obs_SG2 = _bias_label_distribution(SG2)

print("MBIC Bias Label Distribution","\n",bias_obs_MBIC)
print("---------------------------------")
print("SG1 Bias Label Distribution","\n",bias_obs_SG1)
print("---------------------------------")
print("SG2 Bias Label Distribution","\n",bias_obs_SG2)

MBIC Bias Label Distribution 
      label_bias  num_sentences  percentage 
0        Biased           1018    59.882353
1    Non-biased            533    31.352941
2  No agreement            149     8.764706
---------------------------------
SG1 Bias Label Distribution 
      label_bias  num_sentences  percentage 
0        Biased            746    43.882353
1    Non-biased            800    47.058824
2  No agreement            154     9.058824
---------------------------------
SG2 Bias Label Distribution 
      label_bias  num_sentences  percentage 
0        Biased           1810    49.265106
1    Non-biased           1863    50.707676
2  No agreement              1     0.027218


#### Opinion Label Distribution

In [13]:
def _opinion_label_distribution(labels_df):
    """Count sentences per opinion label and add their percentage share.

    Rows are ordered Opinionated, Factual, Both, No agreement. The order is
    derived from the label names instead of a positional list, so a missing
    label no longer breaks the cell; unknown labels are placed last. The long
    label texts are then shortened for display.

    Returns a DataFrame with the columns 'label_opinion', 'num_sentences' and
    'percentage ' (the trailing space in the last column name is kept as before).
    """
    display_order = (
        'Expresses writer’s opinion',
        'Entirely factual',
        'Somewhat factual but also opinionated',
        'No agreement',
    )
    short_names = {
        'Entirely factual': 'Factual',
        'Expresses writer’s opinion': 'Opinionated',
        'Somewhat factual but also opinionated': 'Both',
    }
    counts = (
        labels_df.groupby(['label_opinion'])[['text']]
        .count()
        .reset_index()
        .rename(columns={"text": "num_sentences"})
    )
    rank = {label: position for position, label in enumerate(display_order)}
    ordered = (
        counts.assign(sorting=counts['label_opinion'].map(rank))
        .sort_values(by=['sorting'])
        .reset_index(drop=True)
    )[['label_opinion', 'num_sentences']]
    share = ordered['num_sentences'] / ordered['num_sentences'].sum() * 100
    return ordered.assign(
        label_opinion=ordered['label_opinion'].replace(short_names),
        **{'percentage ': share},
    )


opin_obs_MBIC = _opinion_label_distribution(MBIC)
opin_obs_SG1 = _opinion_label_distribution(SG1)
opin_obs_SG2 = _opinion_label_distribution(SG2)

print("MBIC Opinion Label Distribution","\n",opin_obs_MBIC)
print("---------------------------------")
print("SG1 Opinion Label Distribution","\n",opin_obs_SG1)
print("---------------------------------")
print("SG2 Opinion Label Distribution","\n",opin_obs_SG2)

MBIC Opinion Label Distribution 
   label_opinion  num_sentences  percentage 
0   Opinionated            521    30.647059
1       Factual            572    33.647059
2          Both            433    25.470588
3  No agreement            174    10.235294
---------------------------------
SG1 Opinion Label Distribution 
   label_opinion  num_sentences  percentage 
0   Opinionated            425    25.000000
1       Factual            639    37.588235
2          Both            453    26.647059
3  No agreement            183    10.764706
---------------------------------
SG2 Opinion Label Distribution 
   label_opinion  num_sentences  percentage 
0   Opinionated            858    23.353293
1       Factual           1600    43.549265
2          Both           1000    27.218291
3  No agreement            216     5.879151


#### Topic Distribution

In [14]:
# SG1: sentences per topic and their share (in percent, one decimal)
topic_obs_SG1 = (
    SG1.groupby(['topic'])[['text']]
    .count()
    .reset_index()
    .rename(columns={"text": "num_sentences"})
)
topic_obs_SG1['percentage'] = (
    topic_obs_SG1['num_sentences'] / topic_obs_SG1['num_sentences'].sum() * 100
).round(1)

# SG2: same table for the second subgroup
topic_obs_SG2 = (
    SG2.groupby(['topic'])[['text']]
    .count()
    .reset_index()
    .rename(columns={"text": "num_sentences"})
)
topic_obs_SG2['percentage'] = (
    topic_obs_SG2['num_sentences'] / topic_obs_SG2['num_sentences'].sum() * 100
).round(1)

print("SG1 Topic Distribution","\n",topic_obs_SG1)
print("---------------------------------------------------------------------")
print("SG2 Topic Distribution","\n",topic_obs_SG2)

SG1 Topic Distribution 
                                     topic  num_sentences  percentage
0                                abortion            126         7.4
1                             coronavirus            122         7.2
2                          elections-2020            111         6.5
3                             environment            135         7.9
4                                  gender            128         7.5
5                             gun-control            124         7.3
6                             immigration            122         7.2
7   international-politics-and-world-news             88         5.2
8                            middle-class            118         6.9
9                                   sport            126         7.4
10                           student-debt            127         7.5
11                       trump-presidency            120         7.1
12                               vaccines            122         7.2
13       

#### Bias per topic

In [15]:
# SG1: number of sentences per (topic, bias label)
bias_topic_obs_SG1 = (
    SG1.groupby(['topic', 'label_bias'])[['text']]
    .count()
    .reset_index()
    .rename(columns={"text": "num_sentences"})
)

# One row per topic, one column per bias label. A topic/label combination that
# does not occur is NaN after the pivot; treat it as 0 sentences so the totals
# and percentages of that topic stay defined.
bias_topic_SG1 = (
    bias_topic_obs_SG1
    .pivot(index='topic', columns='label_bias', values='num_sentences')
    .fillna(0)
)
bias_topic_SG1['Total_annotations'] = (
    bias_topic_SG1['Biased'] + bias_topic_SG1['Non-biased'] + bias_topic_SG1['No agreement']
)
bias_topic_SG1['perc_biased'] = (bias_topic_SG1['Biased'] / bias_topic_SG1['Total_annotations'] * 100).round(1)
bias_topic_SG1['perc_no_agr'] = (bias_topic_SG1['No agreement'] / bias_topic_SG1['Total_annotations'] * 100).round(1)
bias_topic_SG1['perc_non_biased'] = (bias_topic_SG1['Non-biased'] / bias_topic_SG1['Total_annotations'] * 100).round(1)

print("SG1 Bias per Topic Distribution")
bias_topic_SG1 = bias_topic_SG1[['Biased','No agreement','Non-biased','perc_biased','perc_no_agr','perc_non_biased']]
bias_topic_SG1

SG1 Bias per Topic Distribution


label_bias,Biased,No agreement,Non-biased,perc_biased,perc_no_agr,perc_non_biased
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
abortion,54,15,57,42.9,11.9,45.2
coronavirus,50,10,62,41.0,8.2,50.8
elections-2020,53,5,53,47.7,4.5,47.7
environment,57,16,62,42.2,11.9,45.9
gender,47,12,69,36.7,9.4,53.9
gun-control,50,19,55,40.3,15.3,44.4
immigration,57,7,58,46.7,5.7,47.5
international-politics-and-world-news,29,11,48,33.0,12.5,54.5
middle-class,50,12,56,42.4,10.2,47.5
sport,31,6,89,24.6,4.8,70.6


In [16]:
# SG2: number of sentences per (topic, bias label)
bias_topic_obs_SG2 = (
    SG2.groupby(['topic', 'label_bias'])[['text']]
    .count()
    .reset_index()
    .rename(columns={"text": "num_sentences"})
)

# One row per topic, one column per bias label. A topic/label combination that
# does not occur is NaN after the pivot; treat it as 0 sentences so the totals
# and percentages of that topic stay defined. Only 'Biased' and 'Non-biased'
# are kept, so the percentages are relative to those two labels.
bias_topic_SG2 = (
    bias_topic_obs_SG2
    .pivot(index='topic', columns='label_bias', values='num_sentences')
    .fillna(0)
)
bias_topic_SG2 = bias_topic_SG2[['Biased', "Non-biased"]].copy()
bias_topic_SG2['Total_annotations'] = bias_topic_SG2['Biased'] + bias_topic_SG2['Non-biased']
bias_topic_SG2['perc_biased'] = (bias_topic_SG2['Biased'] / bias_topic_SG2['Total_annotations'] * 100).round(1)
bias_topic_SG2['perc_non_biased'] = (bias_topic_SG2['Non-biased'] / bias_topic_SG2['Total_annotations'] * 100).round(1)

print("SG2 Bias per Topic Distribution")
bias_topic_SG2 = bias_topic_SG2[['Biased','Non-biased','perc_biased','perc_non_biased']]
bias_topic_SG2

SG2 Bias per Topic Distribution


label_bias,Biased,Non-biased,perc_biased,perc_non_biased
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
#metoo,1.0,28.0,3.4,96.6
abortion,64.0,62.0,50.8,49.2
black lives matter,125.0,164.0,43.3,56.7
blm,2.0,145.0,1.4,98.6
coronavirus,62.0,59.0,51.2,48.8
elections-2020,58.0,53.0,52.3,47.7
environment,63.0,72.0,46.7,53.3
gender,58.0,70.0,45.3,54.7
gun control,111.0,103.0,51.9,48.1
gun-control,60.0,60.0,50.0,50.0


####  Opinion per bias

In [17]:
# SG1: number of sentences per (bias label, opinion label)
opin_bias_obs_SG1 = (
    SG1.groupby(['label_bias', 'label_opinion'])[['text']]
    .count()
    .reset_index()
    .rename(columns={"text": "num_sentences"})
)

# Share of each opinion label within its bias label (percent, one decimal).
# The per-bias totals are broadcast back to the rows with a grouped transform,
# which replaces the chained `df['percentage'].loc[i] = ...` assignments
# (SettingWithCopyWarning) and the hard-coded row ranges that assumed exactly
# four rows per bias label.
bias_totals = opin_bias_obs_SG1.groupby('label_bias')['num_sentences'].transform('sum')
opin_bias_obs_SG1['percentage'] = (opin_bias_obs_SG1['num_sentences'] / bias_totals * 100).round(1)

print("SG1 Opinion per Bias Distribution")
opin_bias_obs_SG1

SG1 Opinion per Bias Distribution


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,label_bias,label_opinion,num_sentences,percentage
0,Biased,Entirely factual,28,3.8
1,Biased,Expresses writer’s opinion,329,44.1
2,Biased,No agreement,76,10.2
3,Biased,Somewhat factual but also opinionated,313,42.0
4,No agreement,Entirely factual,47,30.5
5,No agreement,Expresses writer’s opinion,32,20.8
6,No agreement,No agreement,24,15.6
7,No agreement,Somewhat factual but also opinionated,51,33.1
8,Non-biased,Entirely factual,564,70.5
9,Non-biased,Expresses writer’s opinion,64,8.0


In [18]:
# SG2: number of sentences per (bias label, opinion label)
opin_bias_obs_SG2 = (
    SG2.groupby(['label_bias', 'label_opinion'])[['text']]
    .count()
    .reset_index()
    .rename(columns={"text": "num_sentences"})
)

# Leave out the 'No agreement' bias rows by label instead of by a hard-coded
# row position. reset_index() keeps the pre-filter row position in an 'index'
# column, as the table had before.
opin_bias_obs_SG2 = opin_bias_obs_SG2[opin_bias_obs_SG2['label_bias'] != 'No agreement'].reset_index()

# Share of each opinion label within its bias label (percent, one decimal).
# A grouped transform replaces the chained `df['percentage'].loc[i] = ...`
# assignments (SettingWithCopyWarning) and the hard-coded row ranges.
bias_totals = opin_bias_obs_SG2.groupby('label_bias')['num_sentences'].transform('sum')
opin_bias_obs_SG2['percentage'] = (opin_bias_obs_SG2['num_sentences'] / bias_totals * 100).round(1)

print("SG2 Opinion per Bias Distribution")
opin_bias_obs_SG2

SG2 Opinion per Bias Distribution


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,index,label_bias,label_opinion,num_sentences,percentage
0,0,Biased,Entirely factual,119,6.6
1,1,Biased,Expresses writer’s opinion,758,41.9
2,2,Biased,No agreement,120,6.6
3,3,Biased,Somewhat factual but also opinionated,813,44.9
4,5,Non-biased,Entirely factual,1481,79.5
5,6,Non-biased,Expresses writer’s opinion,99,5.3
6,7,Non-biased,No agreement,96,5.2
7,8,Non-biased,Somewhat factual but also opinionated,187,10.0


####  Bias per opinion

In [19]:
# SG1: number of sentences per (opinion label, bias label)
bias_opin_obs_SG1 = (
    SG1.groupby(['label_opinion', 'label_bias'])[['text']]
    .count()
    .reset_index()
    .rename(columns={"text": "num_sentences"})
)

# Share of each bias label within its opinion label (percent, one decimal).
# The per-opinion totals are broadcast back to the rows with a grouped
# transform, which replaces the chained `df['percentage'].loc[i] = ...`
# assignments (SettingWithCopyWarning) and the hard-coded row ranges that
# assumed exactly three rows per opinion label.
opinion_totals = bias_opin_obs_SG1.groupby('label_opinion')['num_sentences'].transform('sum')
bias_opin_obs_SG1['percentage'] = (bias_opin_obs_SG1['num_sentences'] / opinion_totals * 100).round(1)

print("SG1 Bias per Opinion Distribution")
bias_opin_obs_SG1

SG1 Bias per Opinion Distribution


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,label_opinion,label_bias,num_sentences,percentage
0,Entirely factual,Biased,28,4.4
1,Entirely factual,No agreement,47,7.4
2,Entirely factual,Non-biased,564,88.3
3,Expresses writer’s opinion,Biased,329,77.4
4,Expresses writer’s opinion,No agreement,32,7.5
5,Expresses writer’s opinion,Non-biased,64,15.1
6,No agreement,Biased,76,41.5
7,No agreement,No agreement,24,13.1
8,No agreement,Non-biased,83,45.4
9,Somewhat factual but also opinionated,Biased,313,69.1


In [20]:
# SG2: number of sentences per (opinion label, bias label)
bias_opin_obs_SG2 = (
    SG2.groupby(['label_opinion', 'label_bias'])[['text']]
    .count()
    .reset_index()
    .rename(columns={"text": "num_sentences"})
)

# Leave out the 'No agreement' bias rows by label instead of by a hard-coded
# row position. reset_index() keeps the pre-filter row position in an 'index'
# column, as the table had before.
bias_opin_obs_SG2 = bias_opin_obs_SG2[bias_opin_obs_SG2['label_bias'] != 'No agreement'].reset_index()

# Share of each bias label within its opinion label (percent, one decimal).
# A grouped transform replaces the chained `df['percentage'].loc[i] = ...`
# assignments (SettingWithCopyWarning) and the hard-coded row ranges.
opinion_totals = bias_opin_obs_SG2.groupby('label_opinion')['num_sentences'].transform('sum')
bias_opin_obs_SG2['percentage'] = (bias_opin_obs_SG2['num_sentences'] / opinion_totals * 100).round(1)

# The title previously said "SG1" although this table is built from SG2.
print("SG2 Bias per Opinion Distribution")
bias_opin_obs_SG2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


SG1 Bias per Opinion Distribution


Unnamed: 0,index,label_opinion,label_bias,num_sentences,percentage
0,0,Entirely factual,Biased,119,7.4
1,1,Entirely factual,Non-biased,1481,92.6
2,2,Expresses writer’s opinion,Biased,758,88.4
3,4,Expresses writer’s opinion,Non-biased,99,11.6
4,5,No agreement,Biased,120,55.6
5,6,No agreement,Non-biased,96,44.4
6,7,Somewhat factual but also opinionated,Biased,813,81.3
7,8,Somewhat factual but also opinionated,Non-biased,187,18.7


#### Ideology Distribution

In [21]:
# SG1: sentences per outlet ideology ('type') and their share (percent, one decimal)
ideology_obs_SG1 = (
    SG1.groupby(['type'])[['text']]
    .count()
    .reset_index()
    .rename(columns={"text": "num_sentences"})
)
ideology_obs_SG1['percentage'] = (
    ideology_obs_SG1['num_sentences'] / ideology_obs_SG1['num_sentences'].sum() * 100
).round(1)

# SG2: same table for the second subgroup
ideology_obs_SG2 = (
    SG2.groupby(['type'])[['text']]
    .count()
    .reset_index()
    .rename(columns={"text": "num_sentences"})
)
ideology_obs_SG2['percentage'] = (
    ideology_obs_SG2['num_sentences'] / ideology_obs_SG2['num_sentences'].sum() * 100
).round(1)

print("SG1 Ideology Distribution","\n",ideology_obs_SG1)
print("------------------------------------")
print("SG2 Ideology Distribution","\n",ideology_obs_SG2)

SG1 Ideology Distribution 
      type  num_sentences  percentage
0  center            315        18.5
1    left            694        40.8
2   right            691        40.6
------------------------------------
SG2 Ideology Distribution 
      type  num_sentences  percentage
0  center            692        25.9
1    left            989        37.0
2   right            993        37.1


#### Bias per Ideology 

In [22]:
def _bias_share_per_ideology(labels_df):
    """Count sentences per (ideology, bias label) and add the within-ideology share.

    Parameters
    ----------
    labels_df : DataFrame with at least the columns 'type', 'label_bias' and 'text'.

    Returns
    -------
    DataFrame with the columns 'type', 'label_bias', 'num_sentences' and
    'percentage' (percent of the ideology's sentences, rounded to one decimal).

    The ideology totals are broadcast to the rows with a grouped transform, so
    every row is divided by the total of its own 'type' regardless of how many
    bias labels each ideology has. This replaces the chained
    `df['percentage'].loc[i] = ...` assignments (SettingWithCopyWarning) and the
    hard-coded row ranges, which divide by the wrong total whenever the
    ideologies do not all have the assumed number of rows.
    """
    counts = (
        labels_df.groupby(['type', 'label_bias'])[['text']]
        .count()
        .reset_index()
        .rename(columns={"text": "num_sentences"})
    )
    ideology_totals = counts.groupby('type')['num_sentences'].transform('sum')
    counts['percentage'] = (counts['num_sentences'] / ideology_totals * 100).round(1)
    return counts


bias_ideo_obs_SG1 = _bias_share_per_ideology(SG1)
bias_ideo_obs_SG2 = _bias_share_per_ideology(SG2)

print("SG1 Bias per Ideology Distribution","\n",bias_ideo_obs_SG1)
print("------------------------------------")
print("SG2 Bias per Ideology Distribution","\n",bias_ideo_obs_SG2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


SG1 Bias per Ideology Distribution 
      type    label_bias  num_sentences  percentage
0  center        Biased             38        12.1
1  center  No agreement             16         5.1
2  center    Non-biased            261        82.9
3    left        Biased            331        47.7
4    left  No agreement             67         9.7
5    left    Non-biased            296        42.7
6   right        Biased            377        54.6
7   right  No agreement             71        10.3
8   right    Non-biased            243        35.2
------------------------------------
SG2 Bias per Ideology Distribution 
      type    label_bias  num_sentences  percentage
0  center        Biased             99        14.3
1  center    Non-biased            593        85.7
2    left        Biased            618        62.5
3    left    Non-biased            371        37.5
4   right        Biased            597        60.1
5   right  No agreement              1         0.1
6   right    Non-biase

#### Opinion per Ideology

In [23]:
def _opinion_share_per_ideology(labels):
    """Count sentences per (ideology, opinion label) and give each count as a share of its ideology.

    Parameters
    ----------
    labels : pandas.DataFrame
        Frame providing the columns 'type', 'label_opinion' and 'text'.

    Returns
    -------
    pandas.DataFrame
        One row per ('type', 'label_opinion') combination present in `labels`,
        with 'num_sentences' (count of non-null 'text' values) and 'percentage'
        (that count relative to all counted sentences of the same 'type',
        in percent, rounded to one decimal).
    """
    counts = (
        labels.groupby(['type', 'label_opinion'])[['text']].count()
        .reset_index()
        .rename(columns={"text": "num_sentences"})
    )
    # The denominator is looked up per 'type' group instead of by hard-coded row
    # positions, so it stays correct no matter how many labels each ideology has.
    type_totals = counts.groupby('type')['num_sentences'].transform('sum')
    # Whole-column assignment: element-wise chained assignment raises
    # SettingWithCopyWarning and does not write through under pandas copy-on-write.
    counts['percentage'] = (counts['num_sentences'] / type_totals * 100).round(1)
    return counts


#SG1
opin_ideo_obs_SG1 = _opinion_share_per_ideology(SG1)

#SG2
opin_ideo_obs_SG2 = _opinion_share_per_ideology(SG2)

print("SG1 Opinion per Ideology Distribution","\n",opin_ideo_obs_SG1)
print("------------------------------------")
print("SG2 Opinion per Ideology Distribution","\n",opin_ideo_obs_SG2)

SG1 Opinion per Ideology Distribution

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


 
       type                          label_opinion  num_sentences  percentage
0   center                       Entirely factual            241        76.5
1   center             Expresses writer’s opinion              9         2.9
2   center                           No agreement             29         9.2
3   center  Somewhat factual but also opinionated             36        11.4
4     left                       Entirely factual            202        29.1
5     left             Expresses writer’s opinion            211        30.4
6     left                           No agreement             75        10.8
7     left  Somewhat factual but also opinionated            206        29.7
8    right                       Entirely factual            196        28.4
9    right             Expresses writer’s opinion            205        29.7
10   right                           No agreement             79        11.4
11   right  Somewhat factual but also opinionated            211        3