# Hypothesis testing: readability

In this Notebook, the readability data is explored and the related hypotheses are (partially) tested.

## Loading packages and data

In [1]:
#import the necessary packages
import pandas as pd
from pandas import read_excel
from scipy import stats
from scipy.stats import mannwhitneyu

In [2]:
#read and inspect the data
df = read_excel("complete_data_cleaned_with_actor_diversity.xlsx")
df.head(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ID,Newspaper,Date,Length,Category,Author,Headline,Teaser,...,person dummy,organisation dummy,nea dummy,noa dummy,rea dummy,roa dummy,ea dummy,oa dummy,diversity index,diversity index all actors
0,0,6,100006,sueddeutschet politik (www),2020-05-28T15:34:08,367,,,SZ Espresso: Nachrichten kompakt - die Übersic...,<p>Was heute wichtig war - und was Sie auf SZ....,...,1,1,0,0,0,0,0,0,2,2
1,1,8,100008,sueddeutschet politik (www),2020-05-28T17:01:43,200,,,Kommunalpolitik: Abgeblendet,<p>Bayreuths Stadtrat im Stream</p>,...,1,0,0,0,0,0,0,0,1,1
2,2,24,100024,aachener zeitung (www),2020-05-28T03:01:52,512,Politik,,Länder planen Öffnung: Streit über Schulen und...,"<img src=""https://www.aachener-zeitung.de/imgs...",...,1,1,1,1,0,0,1,1,4,4


In [3]:
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'ID', 'Newspaper', 'Date', 'Length',
       'Category', 'Author', 'Headline', 'Teaser', 'Article', 'Modality',
       'url', 'clean text', 'words in clean text', 'reach_dummy',
       'modality_dummy', 'persons', 'organisations', 'national_elite_actors',
       'national_opposition_actors', 'regional_elite_actors',
       'regional_opposition_actors', 'person dummy', 'organisation dummy',
       'nea dummy', 'noa dummy', 'rea dummy', 'roa dummy', 'ea dummy',
       'oa dummy', 'diversity index', 'diversity index all actors'],
      dtype='object')

## Select relevant columns

In [4]:
df = df[["ID", "Newspaper", "Length", "Article", "modality_dummy", "reach_dummy", "clean text", 
         "persons", "organisations", "national_elite_actors", "national_opposition_actors", "regional_elite_actors", 
         "regional_opposition_actors", "person dummy", "organisation dummy","nea dummy", "noa dummy", "rea dummy",
         "roa dummy", "diversity index", "ea dummy", "oa dummy", "diversity index all actors"]]
len(df)

11491

## Descriptives

In [5]:
print("Overall sample")
print(df["diversity index"].mean())
print(df["diversity index"].std())

Overall sample
2.521016447654686
0.7965151758779676


In [6]:
print("Print sample")
print(df[df["modality_dummy"] == 1]["diversity index"].mean())
print(df[df["modality_dummy"] == 1]["diversity index"].std())
print("")
print("Online sample")
print(df[df["modality_dummy"] == 0]["diversity index"].mean())
print(df[df["modality_dummy"] == 0]["diversity index"].std())

Print sample
2.546693540646895
0.7874540330357805

Online sample
2.271455223880597
0.8401716788295912


In [7]:
print("National sample")
print(df[df["reach_dummy"] == 1]["diversity index"].mean())
print(df[df["reach_dummy"] == 1]["diversity index"].std())
print("")
print("Regional sample")
print(df[df["reach_dummy"] == 0]["diversity index"].mean())
print(df[df["reach_dummy"] == 0]["diversity index"].std())

National sample
2.520688052068805
0.7636430274953114

Regional sample
2.5214370782056372
0.8368124470758407


## Hypothesis test

In [10]:
print("RQ3a: Reach")
print(mannwhitneyu(df[df["reach_dummy"] == 1]["diversity index"], df[df["reach_dummy"] == 0]["diversity index"]))

RQ3a: Reach
MannwhitneyuResult(statistic=16119588.5, pvalue=0.20372728542817847)


In [11]:
print("RQ3b: Moality")
print(mannwhitneyu(df[df["modality_dummy"] == 1]["diversity index"], df[df["modality_dummy"] == 0]["diversity index"]))

RQ3b: Moality
MannwhitneyuResult(statistic=4566643.5, pvalue=1.257397869946061e-26)


## Comparisons

In [12]:
#grouping by outlet
df[["Newspaper", "diversity index","Length"]].groupby("Newspaper").mean()

Unnamed: 0_level_0,diversity index,Length
Newspaper,Unnamed: 1_level_1,Unnamed: 2_level_1
Aachener Zeitung,2.528866,484.008247
Der Tagesspiegel,2.656299,574.946345
Die Welt,2.625752,774.438026
Rheinische Post,2.541053,377.456
Stuttgarter Zeitung,2.598222,394.241714
Süddeutsche Zeitung (inkl. Regionalausgaben),2.482258,529.366129
aachener zeitung (www),2.428571,401.488095
der tagesspiegel (www),2.348485,574.189394
die welt (www),2.531073,578.751412
rheinische post (www),2.248555,338.699422


In [13]:
df[["Newspaper", "diversity index","Length"]].groupby("Newspaper").std()

Unnamed: 0_level_0,diversity index,Length
Newspaper,Unnamed: 1_level_1,Unnamed: 2_level_1
Aachener Zeitung,0.841206,2168.385235
Der Tagesspiegel,0.711268,360.531429
Die Welt,0.787408,390.822211
Rheinische Post,0.829568,240.238762
Stuttgarter Zeitung,0.789963,246.07036
Süddeutsche Zeitung (inkl. Regionalausgaben),0.762955,367.377709
aachener zeitung (www),0.872545,197.696919
der tagesspiegel (www),0.724213,347.385381
die welt (www),0.812121,362.892006
rheinische post (www),0.822334,208.061602


## Means and standard deviations

In [20]:
df[df["modality_dummy"] == 0]["Length"].mean()

460.11660447761193

In [21]:
df[df["modality_dummy"] == 1]["Length"].mean()

499.6452634609847

In [22]:
df[df["reach_dummy"] == 0]["Length"].mean()

399.331282254863

In [23]:
df[df["reach_dummy"] == 1]["Length"].mean()

571.3959398729273

In [24]:
df["Length"].mean()

495.95761900617873

In [25]:
df["Length"].std()

709.936907820832

In [26]:
df[df["modality_dummy"] == 0]["Length"].std()

310.2046257339527

In [27]:
df[df["modality_dummy"] == 1]["Length"].std()

738.806004987597

In [29]:
df[df["reach_dummy"] == 0]["Length"].std()

976.1535881678966

In [28]:
df[df["reach_dummy"] == 1]["Length"].std()

375.0764392797749

In [34]:
print("AZ")
print(df[(df["Newspaper"]== "Aachener Zeitung") | (df["Newspaper"]== "aachener zeitung (www)")]["Length"].mean())
print(df[(df["Newspaper"]== "Aachener Zeitung") | (df["Newspaper"]== "aachener zeitung (www)")]["Length"].std())
print("RP")
print(df[(df["Newspaper"]== "Rheinische Post") | (df["Newspaper"]== "rheinische post (www)")]["Length"].mean())
print(df[(df["Newspaper"]== "Rheinische Post") | (df["Newspaper"]== "rheinische post (www)")]["Length"].std())
print("STZ")
print(df[(df["Newspaper"]== "Stuttgarter Zeitung") | (df["Newspaper"]== "stuttgarter zeitung (www)")]["Length"].mean())
print(df[(df["Newspaper"]== "Stuttgarter Zeitung") | (df["Newspaper"]== "stuttgarter zeitung (www)")]["Length"].std())
print("Welt")
print(df[(df["Newspaper"]== "Die Welt") | (df["Newspaper"]== "die welt (www)")]["Length"].mean())
print(df[(df["Newspaper"]== "Die Welt") | (df["Newspaper"]== "die welt (www)")]["Length"].std())
print("TS")
print(df[(df["Newspaper"]== "Der Tagesspiegel") | (df["Newspaper"]== "der tagesspiegel (www)")]["Length"].mean())
print(df[(df["Newspaper"]== "Der Tagesspiegel") | (df["Newspaper"]== "der tagesspiegel (www)")]["Length"].std())
print("SZ")
print(df[(df["Newspaper"]== "Süddeutsche Zeitung (inkl. Regionalausgaben)") | (df["Newspaper"]== "sueddeutschet politik (www)")]["Length"].mean())
print(df[(df["Newspaper"]== "Süddeutsche Zeitung (inkl. Regionalausgaben)") | (df["Newspaper"]== "sueddeutschet politik (www)")]["Length"].std())

AZ
471.82601054481546
2003.435474521646
RP
374.824568288854
238.3547195936796
STZ
384.49704142011836
248.5880393637322
Welt
740.0763888888889
393.0139000577268
TS
574.8174193548388
358.21639911153187
SZ
526.3810012836971
364.2976978854891


In [None]:
print("AZ")
print(df[(df["Newspaper"]== "Aachener Zeitung") | (df["Newspaper"]== "aachener zeitung (www)")]["diversity index"].mean())
print(df[(df["Newspaper"]== "Aachener Zeitung") | (df["Newspaper"]== "aachener zeitung (www)")]["diversity index"].std())
print("RP")
print(df[(df["Newspaper"]== "Rheinische Post") | (df["Newspaper"]== "rheinische post (www)")]["diversity index"].mean())
print(df[(df["Newspaper"]== "Rheinische Post") | (df["Newspaper"]== "rheinische post (www)")]["diversity index"].std())
print("STZ")
print(df[(df["Newspaper"]== "Stuttgarter Zeitung") | (df["Newspaper"]== "stuttgarter zeitung (www)")]["diversity index"].mean())
print(df[(df["Newspaper"]== "Stuttgarter Zeitung") | (df["Newspaper"]== "stuttgarter zeitung (www)")]["diversity index"].std())
print("Welt")
print(df[(df["Newspaper"]== "Die Welt") | (df["Newspaper"]== "die welt (www)")]["diversity index"].mean())
print(df[(df["Newspaper"]== "Die Welt") | (df["Newspaper"]== "die welt (www)")]["diversity index"].std())
print("TS")
print(df[(df["Newspaper"]== "Der Tagesspiegel") | (df["Newspaper"]== "der tagesspiegel (www)")]["diversity index"].mean())
print(df[(df["Newspaper"]== "Der Tagesspiegel") | (df["Newspaper"]== "der tagesspiegel (www)")]["diversity index"].std())
print("SZ")
print(df[(df["Newspaper"]== "Süddeutsche Zeitung (inkl. Regionalausgaben)") | (df["Newspaper"]== "sueddeutschet politik (www)")]["diversity index"].mean())
print(df[(df["Newspaper"]== "Süddeutsche Zeitung (inkl. Regionalausgaben)") | (df["Newspaper"]== "sueddeutschet politik (www)")]["diversity index"].std())