# Hypothesis testing: readability

In this Notebook, the readability data is explored and the related hypotheses are tested.

## Loading packages and data

In [2]:
#import the necessary packages
import pandas as pd
from pandas import read_excel
from scipy import stats
from scipy.stats import mannwhitneyu

In [3]:
#read and inspect the data
df = read_excel("complete_data_cleaned_with_readability2.xlsx")
df.head(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ID,Newspaper,Date,Length,Category,Author,Headline,Teaser,Article,Modality,url,clean text,words in clean text,reach_dummy,modality_dummy,readabilityscore_text
0,0,6,100006,sueddeutschet politik (www),2020-05-28T15:34:08,367,,,SZ Espresso: Nachrichten kompakt - die Übersic...,<p>Was heute wichtig war - und was Sie auf SZ....,Das Wichtigste zum Coronavirus. Berufstätige M...,online,https://www.sueddeutsche.de/politik/nachrichte...,"das wichtig coronavirus . berufstat mutt vat ,...",224,1,0,44.6
1,1,8,100008,sueddeutschet politik (www),2020-05-28T17:01:43,200,,,Kommunalpolitik: Abgeblendet,<p>Bayreuths Stadtrat im Stream</p>,"Livestream aus dem Stadtrat, das klingt transp...",online,https://www.sueddeutsche.de/bayern/kommunalpol...,"livestream stadtrat , klingt transparent erstr...",104,1,0,65.25
2,2,24,100024,aachener zeitung (www),2020-05-28T03:01:52,512,Politik,,Länder planen Öffnung: Streit über Schulen und...,"<img src=""https://www.aachener-zeitung.de/imgs...",Der Streit über die Wiederöffnung von Schulen ...,online,https://www.aachener-zeitung.de/politik/deutsc...,der streit wiederoffn schul kindergart kris ve...,318,0,0,38.6


## Select relevant columns

In [4]:
df = df[["ID", "Newspaper", "Length", "Article", "modality_dummy", "reach_dummy", "clean text", "readabilityscore_text"]]
len(df)

11491

## Descriptives

In [5]:
print("Overall sample")
print(df["readabilityscore_text"].mean())
print(df["readabilityscore_text"].std())

Overall sample
40.68040901575143
10.668379923377481


In [6]:
print("Print sample")
print(df[df["modality_dummy"] == 1]["readabilityscore_text"].mean())
print(df[df["modality_dummy"] == 1]["readabilityscore_text"].std())
print("")
print("Online sample")
print(df[df["modality_dummy"] == 0]["readabilityscore_text"].mean())
print(df[df["modality_dummy"] == 0]["readabilityscore_text"].std())

Print sample
40.682172953258465
10.725494331840776

Online sample
40.6632649253731
10.101209409464657


In [7]:
print("National sample")
print(df[df["reach_dummy"] == 1]["readabilityscore_text"].mean())
print(df[df["reach_dummy"] == 1]["readabilityscore_text"].std())
print("")
print("Regional sample")
print(df[df["reach_dummy"] == 0]["readabilityscore_text"].mean())
print(df[df["reach_dummy"] == 0]["readabilityscore_text"].std())

National sample
40.69470323880385
10.482157527661116

Regional sample
40.662100039698274
10.903279225630387


## Hypothesis test

In [8]:
print("RQ3a: Reach")
print(mannwhitneyu(df[df["reach_dummy"] == 1]["readabilityscore_text"], df[df["reach_dummy"] == 0]["readabilityscore_text"]))

RQ3a: Reach
MannwhitneyuResult(statistic=16113271.0, pvalue=0.2107452068739734)


In [9]:
print("RQ3b: Moality")
print(mannwhitneyu(df[df["modality_dummy"] == 1]["readabilityscore_text"], df[df["modality_dummy"] == 0]["readabilityscore_text"]))

RQ3b: Moality
MannwhitneyuResult(statistic=5494715.0, pvalue=0.1924394750255477)


## Comparisons

In [10]:
#grouping by outlet
df.groupby("Newspaper").mean()

Unnamed: 0_level_0,ID,Length,modality_dummy,reach_dummy,readabilityscore_text
Newspaper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Aachener Zeitung,493.615464,484.008247,1.0,0.0,38.610464
Der Tagesspiegel,10928.010886,574.946345,1.0,1.0,40.177302
Die Welt,1408.464501,774.438026,1.0,1.0,41.601264
Rheinische Post,3487.029474,377.456,1.0,0.0,41.84648
Stuttgarter Zeitung,5750.415521,394.241714,1.0,0.0,40.650226
Süddeutsche Zeitung (inkl. Regionalausgaben),8342.211828,529.366129,1.0,1.0,40.458879
aachener zeitung (www),104215.535714,401.488095,0.0,0.0,36.289286
der tagesspiegel (www),104147.094697,574.189394,0.0,1.0,43.03428
die welt (www),103980.39548,578.751412,0.0,1.0,39.518588
rheinische post (www),104260.479769,338.699422,0.0,0.0,39.336358


In [16]:
#grouping by outlet
df.groupby("Newspaper").std()

Unnamed: 0_level_0,ID,Length,modality_dummy,reach_dummy,readabilityscore_text
Newspaper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Aachener Zeitung,286.104359,2168.385235,0.0,0.0,11.827011
Der Tagesspiegel,375.730454,360.531429,0.0,0.0,11.630196
Die Welt,240.577279,390.822211,0.0,0.0,9.868298
Rheinische Post,945.658182,240.238762,0.0,0.0,10.817186
Stuttgarter Zeitung,373.245187,246.07036,0.0,0.0,10.447508
Süddeutsche Zeitung (inkl. Regionalausgaben),1120.275724,367.377709,0.0,0.0,10.20062
aachener zeitung (www),2451.087736,197.696919,0.0,0.0,8.294884
der tagesspiegel (www),2213.800853,347.385381,0.0,0.0,9.301713
die welt (www),2289.970877,362.892006,0.0,0.0,10.735843
rheinische post (www),2246.251336,208.061602,0.0,0.0,9.885701


In [12]:
from statsmodels.formula.api import ols

In [13]:
model = ols("readabilityscore_text ~ modality_dummy + reach_dummy + 1", df).fit()
print(model.summary()) 

                              OLS Regression Results                             
Dep. Variable:     readabilityscore_text   R-squared:                       0.000
Model:                               OLS   Adj. R-squared:                 -0.000
Method:                    Least Squares   F-statistic:                   0.01481
Date:                   Thu, 18 Jun 2020   Prob (F-statistic):              0.985
Time:                           20:50:17   Log-Likelihood:                -43507.
No. Observations:                  11491   AIC:                         8.702e+04
Df Residuals:                      11488   BIC:                         8.704e+04
Df Model:                              2                                         
Covariance Type:               nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept     

In [15]:
print("AZ")
print(df[(df["Newspaper"]== "Aachener Zeitung") | (df["Newspaper"]== "aachener zeitung (www)")]["readabilityscore_text"].mean())
print(df[(df["Newspaper"]== "Aachener Zeitung") | (df["Newspaper"]== "aachener zeitung (www)")]["readabilityscore_text"].std())
print("RP")
print(df[(df["Newspaper"]== "Rheinische Post") | (df["Newspaper"]== "rheinische post (www)")]["readabilityscore_text"].mean())
print(df[(df["Newspaper"]== "Rheinische Post") | (df["Newspaper"]== "rheinische post (www)")]["readabilityscore_text"].std())
print("STZ")
print(df[(df["Newspaper"]== "Stuttgarter Zeitung") | (df["Newspaper"]== "stuttgarter zeitung (www)")]["readabilityscore_text"].mean())
print(df[(df["Newspaper"]== "Stuttgarter Zeitung") | (df["Newspaper"]== "stuttgarter zeitung (www)")]["readabilityscore_text"].std())
print("Welt")
print(df[(df["Newspaper"]== "Die Welt") | (df["Newspaper"]== "die welt (www)")]["readabilityscore_text"].mean())
print(df[(df["Newspaper"]== "Die Welt") | (df["Newspaper"]== "die welt (www)")]["readabilityscore_text"].std())
print("TS")
print(df[(df["Newspaper"]== "Der Tagesspiegel") | (df["Newspaper"]== "der tagesspiegel (www)")]["readabilityscore_text"].mean())
print(df[(df["Newspaper"]== "Der Tagesspiegel") | (df["Newspaper"]== "der tagesspiegel (www)")]["readabilityscore_text"].std())
print("SZ")
print(df[(df["Newspaper"]== "Süddeutsche Zeitung (inkl. Regionalausgaben)") | (df["Newspaper"]== "sueddeutschet politik (www)")]["readabilityscore_text"].mean())
print(df[(df["Newspaper"]== "Süddeutsche Zeitung (inkl. Regionalausgaben)") | (df["Newspaper"]== "sueddeutschet politik (www)")]["readabilityscore_text"].std())

AZ
38.26779437609848
11.401520719902733
RP
41.676051805337465
10.773216142574956
STZ
40.766516272189314
10.403894707576457
Welt
41.23555555555558
10.051829107680781
TS
40.66390967741932
11.316049706509911
SZ
40.56698844672657
10.24177508547756
