In [1]:
import ast
import re

import numpy as np
import pandas as pd

Load cognitive distortion schemata (CDS)

In [2]:
# Load the schema list, indexed by the marker text.
_CDS = pd.read_csv("data/list_of_CDS.tsv", sep="\t", index_col="markers")
# Rows without variants become an empty list. The cells are parsed with
# ast.literal_eval (not eval) so the data file can only yield Python literals
# and can never execute code. The result is assigned back to the column instead
# of using fillna(inplace=True) on a column selection, which is chained
# assignment and is not guaranteed to modify the frame.
# NOTE(review): assumes every non-empty "variants" cell is a plain list literal — confirm.
_CDS["variants"] = _CDS["variants"].fillna("[]").apply(ast.literal_eval)

Determine n-gram size for each schema, label schemata that contain first person pronouns, and group CDS per category.

In [3]:
# n-gram size: number of space-separated tokens in each marker (the row label).
_CDS["ngram"] = [len(marker.split(" ")) for marker in _CDS.index]

# True when the marker contains a first-person pronoun as a whole word
# (case-sensitive match on word boundaries).
_CDS["FPP"] = [
    any(
        re.search(r"\b{}\b".format(fpp), marker)
        for fpp in ["I", "me", "my", "myself", "mine"]
    )
    for marker in _CDS.index
]

# Group the schemata by category for the per-category statistics.
per_cat = _CDS.groupby("categories")

Load CDS results per tweet

In [4]:
# Read the two result files (D and R) with identical parsing options:
# tab-separated, first file column used as the row index.
CDS_D, CDS_R = (
    pd.read_csv(path, sep="\t", index_col=[0])
    for path in (
        "results/D_per_tweet_CDS.tsv.gz",
        "results/R_per_tweet_CDS.tsv.gz",
    )
)

Load relative prevalence per category

In [5]:
# Tab-separated file; the first file column becomes the row index.
RP_category = pd.read_csv(
    "bootstrap/relative_prevalence_category.tsv",
    sep="\t",
    index_col=[0],
)

Determine which CDS appear in at least one of the two cohorts.

In [6]:
# Keep the schemata whose column total is positive in at least one of the two
# result frames; the boolean mask is aligned with _CDS by label.
exists = _CDS[CDS_D.sum().gt(0) | CDS_R.sum().gt(0)]

# Initialize Table

In [7]:
# Row order: categories sorted by the median of their RP_category column,
# largest first, followed by a final "Total" row.
order = RP_category.median().sort_values(ascending=False).index
idxvals = np.append(order, "Total")
table = pd.DataFrame(index=pd.Index(data=idxvals, name="CD Category"))

Count schemata per category

In [8]:
# Non-null "variants" entries per category, aligned on the table's category index.
table[r"$N_{CD}$"] = per_cat["variants"].count()
# "Total" is not a group of its own: fill it with the overall number of schemata.
table.loc["Total", r"$N_{CD}$"] = len(_CDS.index)
# Store the counts as integers.
table = table.astype({r"$N_{CD}$": int})

Count schemata that occur in at least one of the two cohorts

In [9]:
# Same count as above, restricted to the schemata kept in `exists`.
table[r"$N_\exists$"] = exists.groupby("categories")["variants"].count()
# Overall number of schemata in `exists` for the "Total" row.
table.loc["Total", r"$N_\exists$"] = len(exists.index)
# Store the counts as integers.
table = table.astype({r"$N_\exists$": int})

Calculate average n-gram size of schemata

In [10]:
# Mean n-gram size per category, rounded to 3 decimals.
# The "ngram" column is selected *before* averaging: calling .mean() on the
# whole frame/groupby also tries to average the non-numeric columns (e.g. the
# list-valued "variants"), which raises TypeError on pandas >= 2.0.
table[r"$\bar{n}$"] = np.around(per_cat["ngram"].mean(), decimals=3)
# Overall mean across all schemata for the "Total" row.
table.loc["Total", r"$\bar{n}$"] = np.around(_CDS["ngram"].mean(), decimals=3)

Calculate percentage of CDS that contain a first person pronoun

In [11]:
# Percentage of schemata per category whose marker contains a first-person
# pronoun, rounded to 1 decimal. Only the boolean "FPP" column is aggregated;
# summing the whole groupby would also "sum" (concatenate) the object columns
# and can fail outright when a column is not summable.
# Categories whose rounded percentage is 0 are shown as "/" in the table.
table[r"$FPP(\%)$"] = np.around(
    100 * per_cat["FPP"].sum() / per_cat["FPP"].count(), decimals=1
).replace(0, "/")
# Overall percentage across all schemata for the "Total" row.
table.loc["Total", r"$FPP(\%)$"] = np.around(
    100 * _CDS["FPP"].sum() / _CDS["FPP"].count(), decimals=1
)

# Table

In [12]:
# Render the assembled summary table as the cell output.
table

Unnamed: 0_level_0,$N_{CD}$,$N_\exists$,$\bar{n}$,$FPP(\%)$
CD Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Personalizing,14,14,2.429,100
Emotional Reasoning,7,7,2.857,42.9
Overgeneralizing,21,21,2.762,9.5
Mental Filtering,14,14,2.786,35.7
Disqualifying the Positive,14,13,2.286,/
Labeling and mislabeling,44,44,2.273,6.8
Dichotomous Reasoning,23,23,1.348,/
Fortune-telling,8,8,3.125,12.5
Magnification and Minimization,8,8,2.0,/
Should statements,5,5,1.4,/


In [13]:
# Write the table as "&"-delimited text (one row per line) for use in LaTeX.
# NOTE(review): to_csv adds no LaTeX row terminators ("\\") — confirm that the
# consumer of this file adds them.
table.to_csv(path_or_buf="figures/TableS1.tex", sep="&")