# Hypothesis testing: readability

In this Notebook, the readability data is explored and the related hypotheses are tested.

## Loading packages and data

In [1]:
#import the necessary packages
import pandas as pd
from pandas import read_excel
from scipy import stats
from scipy.stats import mannwhitneyu

In [2]:
#read and inspect the data
df = read_excel("complete_data_cleaned_with_emotionality.xlsx")
df.head(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ID,Newspaper,Date,Length,Category,Author,Headline,Teaser,...,negativity ratio,positivity ratio,emotionality,emotionality ratio,positive words rauh,negative words rauh,emotionality rauh,emotionality ratio rauh,negativity ratio rauh,positivity ratio rauh
0,0,6,100006,sueddeutschet politik (www),2020-05-28T15:34:08,367,,,SZ Espresso: Nachrichten kompakt - die Übersic...,<p>Was heute wichtig war - und was Sie auf SZ....,...,0.026786,0.035714,14,0.0625,16,16,32,0.142857,0.071429,0.071429
1,1,8,100008,sueddeutschet politik (www),2020-05-28T17:01:43,200,,,Kommunalpolitik: Abgeblendet,<p>Bayreuths Stadtrat im Stream</p>,...,0.0,0.019231,2,0.019231,3,3,6,0.057692,0.028846,0.028846
2,2,24,100024,aachener zeitung (www),2020-05-28T03:01:52,512,Politik,,Länder planen Öffnung: Streit über Schulen und...,"<img src=""https://www.aachener-zeitung.de/imgs...",...,0.018868,0.034591,17,0.053459,23,18,41,0.128931,0.056604,0.072327


## Select relevant columns

In [3]:
df = df[["ID", "Newspaper", "Length", "Article", "modality_dummy", "reach_dummy", "clean text", "emotionality ratio", "emotionality ratio rauh", "negativity ratio", "negativity ratio rauh"]]
len(df)

11491

## Descriptives

In [4]:
print("Overall sample - Emotionality")
print("SentiWS")
print(df["emotionality ratio"].mean())
print(df["emotionality ratio"].std())
print("Rauh sentiment dictionary")
print(df["emotionality ratio rauh"].mean())
print(df["emotionality ratio rauh"].std())

Overall sample - Emotionality
SentiWS
0.03816306678421195
0.017348136821103165
Rauh sentiment dictionary
0.11395210477368804
0.033397475657465814


In [5]:
print("Overall sample - Negativity")
print("SentiWS")
print(df["negativity ratio"].mean())
print(df["negativity ratio"].std())
print("Rauh sentiment dictionary")
print(df["negativity ratio rauh"].mean())
print(df["negativity ratio rauh"].std())

Overall sample - Negativity
SentiWS
0.010094252422380436
0.008762600909030705
Rauh sentiment dictionary
0.048102459982067365
0.02346699736023017


In [6]:
print("Senti WS")
print("Print sample - Emotionality")
print(df[df["modality_dummy"] == 1]["emotionality ratio"].mean())
print(df[df["modality_dummy"] == 1]["emotionality ratio"].std())
print("")
print("Online sample - Emotionality")
print(df[df["modality_dummy"] == 0]["emotionality ratio"].mean())
print(df[df["modality_dummy"] == 0]["emotionality ratio"].std())
print("")
print("")
print("")
print("Rauh sentiment dictionary")
print("Print sample - Emotionality")
print(df[df["modality_dummy"] == 1]["emotionality ratio rauh"].mean())
print(df[df["modality_dummy"] == 1]["emotionality ratio rauh"].std())
print("")
print("Online sample - Emotionality")
print(df[df["modality_dummy"] == 0]["emotionality ratio rauh"].mean())
print(df[df["modality_dummy"] == 0]["emotionality ratio rauh"].std())

Senti WS
Print sample - Emotionality
0.038155349337889215
0.017433818785102355

Online sample - Emotionality
0.038238074315215295
0.016499797247167616



Rauh sentiment dictionary
Print sample - Emotionality
0.11338865149003569
0.033440386007394726

Online sample - Emotionality
0.1194284291788896
0.03248734756265601


In [7]:
print("Senti WS")
print("Print sample - Negativity")
print(df[df["modality_dummy"] == 1]["negativity ratio"].mean())
print(df[df["modality_dummy"] == 1]["negativity ratio"].std())
print("")
print("Online sample - Negativity")
print(df[df["modality_dummy"] == 0]["negativity ratio"].mean())
print(df[df["modality_dummy"] == 0]["negativity ratio"].std())
print("")
print("")
print("")
print("Rauh sentiment dictionary")
print("Print sample - Negativity")
print(df[df["modality_dummy"] == 1]["negativity ratio rauh"].mean())
print(df[df["modality_dummy"] == 1]["negativity ratio rauh"].std())
print("")
print("Online sample - Negativity")
print(df[df["modality_dummy"] == 0]["negativity ratio rauh"].mean())
print(df[df["modality_dummy"] == 0]["negativity ratio rauh"].std())

Senti WS
Print sample - Negativity
0.00990631817648763
0.008617096062058438

Online sample - Negativity
0.011920826030549659
0.009887902722988565



Rauh sentiment dictionary
Print sample - Negativity
0.04707601470545181
0.02290121608370302

Online sample - Negativity
0.05807870376663785
0.026391973342627467


In [8]:
print("Senti WS")
print("National sample - Emotionality")
print(df[df["reach_dummy"] == 1]["emotionality ratio"].mean())
print(df[df["reach_dummy"] == 1]["emotionality ratio"].std())
print("")
print("Regional sample - Emotionality")
print(df[df["reach_dummy"] == 0]["emotionality ratio"].mean())
print(df[df["reach_dummy"] == 0]["emotionality ratio"].std())
print("")
print("")
print("")
print("Rauh sentiment dictionary - Emotionality")
print("National sample")
print(df[df["reach_dummy"] == 1]["emotionality ratio rauh"].mean())
print(df[df["reach_dummy"] == 1]["emotionality ratio rauh"].std())
print("")
print("Regional sample - Emotionality")
print(df[df["reach_dummy"] == 0]["emotionality ratio rauh"].mean())
print(df[df["reach_dummy"] == 0]["emotionality ratio rauh"].std())

Senti WS
National sample - Emotionality
0.039286026867002714
0.016467556741805188

Regional sample - Emotionality
0.03672470604299515
0.01831598487004006



Rauh sentiment dictionary - Emotionality
National sample
0.11757758137699889
0.03130640485958161

Regional sample - Emotionality
0.1093083571513851
0.03536260946376782


In [9]:
print("Senti WS")
print("National sample - Negativity")
print(df[df["reach_dummy"] == 1]["negativity ratio"].mean())
print(df[df["reach_dummy"] == 1]["negativity ratio"].std())
print("")
print("Regional sample - Negativity")
print(df[df["reach_dummy"] == 0]["negativity ratio"].mean())
print(df[df["reach_dummy"] == 0]["negativity ratio"].std())
print("")
print("")
print("")
print("Rauh sentiment dictionary")
print("National sample - Negativity")
print(df[df["reach_dummy"] == 1]["negativity ratio rauh"].mean())
print(df[df["reach_dummy"] == 1]["negativity ratio rauh"].std())
print("")
print("Regional sample - Negativity")
print(df[df["reach_dummy"] == 0]["negativity ratio rauh"].mean())
print(df[df["reach_dummy"] == 0]["negativity ratio rauh"].std())

Senti WS
National sample - Negativity
0.010937098375036921
0.008382042200687934

Regional sample - Negativity
0.009014680184886886
0.009114825441488452



Rauh sentiment dictionary
National sample - Negativity
0.05113729103685183
0.022394217951206163

Regional sample - Negativity
0.04421524981999458
0.024226360121676626


## Hypothesis test

In [9]:
print("RQ3a: Reach")
print("SW")
print(mannwhitneyu(df[df["reach_dummy"] == 1]["emotionality ratio"], df[df["reach_dummy"] == 0]["emotionality ratio"]))
print("Rauh")
print(mannwhitneyu(df[df["reach_dummy"] == 1]["emotionality ratio rauh"], df[df["reach_dummy"] == 0]["emotionality ratio rauh"]))

RQ3a: Reach
SW
MannwhitneyuResult(statistic=14687502.0, pvalue=3.2141584315642844e-19)
Rauh
MannwhitneyuResult(statistic=13962818.0, pvalue=6.85376734989358e-39)


In [10]:
print("RQ3b: Modality")
print("SW")
print(mannwhitneyu(df[df["modality_dummy"] == 1]["emotionality ratio"], df[df["modality_dummy"] == 0]["emotionality ratio"]))
print("Rauh")
print(mannwhitneyu(df[df["modality_dummy"] == 1]["emotionality ratio rauh"], df[df["reach_dummy"] == 0]["emotionality ratio rauh"]))

RQ3b: Modality
SW
MannwhitneyuResult(statistic=5569962.5, pvalue=0.44378760133407724)
Rauh
MannwhitneyuResult(statistic=24380756.5, pvalue=3.721401299563576e-13)


## Comparisons

In [11]:
#grouping by outlet
df.groupby("Newspaper").mean()

Unnamed: 0_level_0,ID,Length,modality_dummy,reach_dummy,emotionality ratio,emotionality ratio rauh,negativity ratio,negativity ratio rauh
Newspaper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Aachener Zeitung,493.615464,484.008247,1.0,0.0,0.03637,0.107716,0.009237,0.044877
Der Tagesspiegel,10928.010886,574.946345,1.0,1.0,0.038179,0.116717,0.010418,0.050711
Die Welt,1408.464501,774.438026,1.0,1.0,0.041671,0.125908,0.011864,0.056057
Rheinische Post,3487.029474,377.456,1.0,0.0,0.036143,0.105901,0.007886,0.039766
Stuttgarter Zeitung,5750.415521,394.241714,1.0,0.0,0.037839,0.11409,0.010065,0.047511
Süddeutsche Zeitung (inkl. Regionalausgaben),8342.211828,529.366129,1.0,1.0,0.039217,0.115467,0.010704,0.048909
aachener zeitung (www),104215.535714,401.488095,0.0,0.0,0.03829,0.118968,0.011044,0.056615
der tagesspiegel (www),104147.094697,574.189394,0.0,1.0,0.038643,0.123896,0.01272,0.060853
die welt (www),103980.39548,578.751412,0.0,1.0,0.038298,0.118053,0.011909,0.057265
rheinische post (www),104260.479769,338.699422,0.0,0.0,0.035807,0.116459,0.011159,0.058407


In [12]:
#grouping by outlet
df.groupby("Newspaper").std()

Unnamed: 0_level_0,ID,Length,modality_dummy,reach_dummy,emotionality ratio,emotionality ratio rauh,negativity ratio,negativity ratio rauh
Newspaper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Aachener Zeitung,286.104359,2168.385235,0.0,0.0,0.019341,0.037708,0.009668,0.024386
Der Tagesspiegel,375.730454,360.531429,0.0,0.0,0.015311,0.030361,0.007837,0.020901
Die Welt,240.577279,390.822211,0.0,0.0,0.015253,0.027633,0.007425,0.018575
Rheinische Post,945.658182,240.238762,0.0,0.0,0.018087,0.033791,0.00809,0.021803
Stuttgarter Zeitung,373.245187,246.07036,0.0,0.0,0.018329,0.036166,0.009233,0.024588
Süddeutsche Zeitung (inkl. Regionalausgaben),1120.275724,367.377709,0.0,0.0,0.017123,0.031969,0.008701,0.022815
aachener zeitung (www),2451.087736,197.696919,0.0,0.0,0.01406,0.029576,0.009007,0.025366
der tagesspiegel (www),2213.800853,347.385381,0.0,0.0,0.015022,0.032417,0.008832,0.025486
die welt (www),2289.970877,362.892006,0.0,0.0,0.014534,0.030067,0.007514,0.022622
rheinische post (www),2246.251336,208.061602,0.0,0.0,0.016996,0.034214,0.009573,0.029803


In [13]:
print("AZ")
print(df[(df["Newspaper"]== "Aachener Zeitung") | (df["Newspaper"]== "aachener zeitung (www)")]["emotionality ratio rauh"].mean())
print(df[(df["Newspaper"]== "Aachener Zeitung") | (df["Newspaper"]== "aachener zeitung (www)")]["emotionality ratio rauh"].std())
print("RP")
print(df[(df["Newspaper"]== "Rheinische Post") | (df["Newspaper"]== "rheinische post (www)")]["emotionality ratio rauh"].mean())
print(df[(df["Newspaper"]== "Rheinische Post") | (df["Newspaper"]== "rheinische post (www)")]["emotionality ratio rauh"].std())
print("STZ")
print(df[(df["Newspaper"]== "Stuttgarter Zeitung") | (df["Newspaper"]== "stuttgarter zeitung (www)")]["emotionality ratio rauh"].mean())
print(df[(df["Newspaper"]== "Stuttgarter Zeitung") | (df["Newspaper"]== "stuttgarter zeitung (www)")]["emotionality ratio rauh"].std())
print("Welt")
print(df[(df["Newspaper"]== "Die Welt") | (df["Newspaper"]== "die welt (www)")]["emotionality ratio rauh"].mean())
print(df[(df["Newspaper"]== "Die Welt") | (df["Newspaper"]== "die welt (www)")]["emotionality ratio rauh"].std())
print("TS")
print(df[(df["Newspaper"]== "Der Tagesspiegel") | (df["Newspaper"]== "der tagesspiegel (www)")]["emotionality ratio rauh"].mean())
print(df[(df["Newspaper"]== "Der Tagesspiegel") | (df["Newspaper"]== "der tagesspiegel (www)")]["emotionality ratio rauh"].std())
print("SZ")
print(df[(df["Newspaper"]== "Süddeutsche Zeitung (inkl. Regionalausgaben)") | (df["Newspaper"]== "sueddeutschet politik (www)")]["emotionality ratio rauh"].mean())
print(df[(df["Newspaper"]== "Süddeutsche Zeitung (inkl. Regionalausgaben)") | (df["Newspaper"]== "sueddeutschet politik (www)")]["emotionality ratio rauh"].std())

AZ
0.10937714659534122
0.03682729848487126
RP
0.10661816302209487
0.03391730808093656
STZ
0.114320437221068
0.036240284504076396
Welt
0.12452883456962895
0.02821856468378349
TS
0.1179396946518033
0.030828871617872736
SZ
0.11563454189198727
0.031993640142326536


In [14]:
print("AZ")
print(df[(df["Newspaper"]== "Aachener Zeitung") | (df["Newspaper"]== "aachener zeitung (www)")]["negativity ratio rauh"].mean())
print(df[(df["Newspaper"]== "Aachener Zeitung") | (df["Newspaper"]== "aachener zeitung (www)")]["negativity ratio rauh"].std())
print("RP")
print(df[(df["Newspaper"]== "Rheinische Post") | (df["Newspaper"]== "rheinische post (www)")]["negativity ratio rauh"].mean())
print(df[(df["Newspaper"]== "Rheinische Post") | (df["Newspaper"]== "rheinische post (www)")]["negativity ratio rauh"].std())
print("STZ")
print(df[(df["Newspaper"]== "Stuttgarter Zeitung") | (df["Newspaper"]== "stuttgarter zeitung (www)")]["negativity ratio rauh"].mean())
print(df[(df["Newspaper"]== "Stuttgarter Zeitung") | (df["Newspaper"]== "stuttgarter zeitung (www)")]["negativity ratio rauh"].std())
print("Welt")
print(df[(df["Newspaper"]== "Die Welt") | (df["Newspaper"]== "die welt (www)")]["negativity ratio rauh"].mean())
print(df[(df["Newspaper"]== "Die Welt") | (df["Newspaper"]== "die welt (www)")]["negativity ratio rauh"].std())
print("TS")
print(df[(df["Newspaper"]== "Der Tagesspiegel") | (df["Newspaper"]== "der tagesspiegel (www)")]["negativity ratio rauh"].mean())
print(df[(df["Newspaper"]== "Der Tagesspiegel") | (df["Newspaper"]== "der tagesspiegel (www)")]["negativity ratio rauh"].std())
print("SZ")
print(df[(df["Newspaper"]== "Süddeutsche Zeitung (inkl. Regionalausgaben)") | (df["Newspaper"]== "sueddeutschet politik (www)")]["negativity ratio rauh"].mean())
print(df[(df["Newspaper"]== "Süddeutsche Zeitung (inkl. Regionalausgaben)") | (df["Newspaper"]== "sueddeutschet politik (www)")]["negativity ratio rauh"].std())

AZ
0.04661021979270045
0.024873260468753506
RP
0.04103174590734314
0.022914136495213823
STZ
0.048199045781900346
0.025255626379035157
Welt
0.05626942568433477
0.019340351365159113
TS
0.052438184500854766
0.02207321768378979
SZ
0.04929144338759184
0.02301078827246447
