In [1]:
import pandas as pd

import watermark
%load_ext watermark
%watermark -n -v -m -g -iv

Python implementation: CPython
Python version       : 3.9.0
IPython version      : 7.19.0

Compiler    : Clang 12.0.0 (clang-1200.0.32.27)
OS          : Darwin
Release     : 20.1.0
Machine     : x86_64
Processor   : i386
CPU cores   : 12
Architecture: 64bit

Git hash: 

watermark: 2.1.0
pandas   : 1.1.4



Load user information

In [2]:
# Per-user information for the two cohorts (D and R).
# Both files are tab-separated; the first column becomes the row index.
user_info_D = pd.read_csv(
    "data/D_user_information.tsv",
    sep="\t",
    index_col=[0],
)
user_info_R = pd.read_csv(
    "data/R_user_information.tsv",
    sep="\t",
    index_col=[0],
)

Load demographics information

In [3]:
# Demographics tables for the two cohorts.
# The first two rows of each file form a two-level column header
# (e.g. ("gender", "male")); the first column becomes the row index.
demo_D = pd.read_csv(
    "data/D_demographics.tsv",
    sep="\t",
    index_col=[0],
    header=[0, 1],
)
demo_R = pd.read_csv(
    "data/R_demographics.tsv",
    sep="\t",
    index_col=[0],
    header=[0, 1],
)

Load number of tweets and date ranges

In [4]:
# Per-user tweet statistics produced by an earlier step (read from results/),
# indexed by the "user_id" column.
tweet_count_D = pd.read_csv(
    "results/D_number_of_tweets.tsv",
    sep="\t",
    index_col="user_id",
)
tweet_count_R = pd.read_csv(
    "results/R_number_of_tweets.tsv",
    sep="\t",
    index_col="user_id",
)

Load depression labels

In [5]:
# Diagnosis labels for cohort D; the first column becomes the row index.
# NOTE(review): `labels` is not referenced by any cell visible here -- confirm it is still needed.
labels = pd.read_csv(
    "data/D_diagnosis_labels.tsv",
    sep="\t",
    index_col=[0],
)

Load thresholds from external file

In [6]:
from util import threshold, gender_thr, age_thr

Check demographic group membership based on thresholds

In [7]:
# Boolean masks (one value per account) marking which accounts reach the
# imported cut-off in a given demographics column. Grouped by attribute so the
# D and R versions of each mask sit next to each other.

# Non-organisation accounts.
non_org_D = demo_D[("org", "non-org")] >= threshold
non_org_R = demo_R[("org", "non-org")] >= threshold

# Gender.
male_D = demo_D[("gender", "male")] >= gender_thr
male_R = demo_R[("gender", "male")] >= gender_thr
female_D = demo_D[("gender", "female")] >= gender_thr
female_R = demo_R[("gender", "female")] >= gender_thr

# Age brackets.
under_eightteen_D = demo_D[("age", "<=18")] >= age_thr
under_eightteen_R = demo_R[("age", "<=18")] >= age_thr
nineteen_twentynine_D = demo_D[("age", "19-29")] >= age_thr
nineteen_twentynine_R = demo_R[("age", "19-29")] >= age_thr
thirty_thirtynine_D = demo_D[("age", "30-39")] >= age_thr
thirty_thirtynine_R = demo_R[("age", "30-39")] >= age_thr
over_fourty_D = demo_D[("age", ">=40")] >= age_thr
over_fourty_R = demo_R[("age", ">=40")] >= age_thr

Select groups of individuals based on demographics information

In [8]:
# Index labels of the accounts in each group. Every subgroup is additionally
# restricted to non-organisation accounts, so each one is a subset of all_D / all_R.

# Cohort D
all_D = demo_D.loc[non_org_D].index
men_D = demo_D.loc[male_D & non_org_D].index
women_D = demo_D.loc[female_D & non_org_D].index
teens_D = demo_D.loc[under_eightteen_D & non_org_D].index
twenties_D = demo_D.loc[nineteen_twentynine_D & non_org_D].index
thirties_D = demo_D.loc[thirty_thirtynine_D & non_org_D].index
adults_D = demo_D.loc[over_fourty_D & non_org_D].index

# Cohort R
all_R = demo_R.loc[non_org_R].index
men_R = demo_R.loc[male_R & non_org_R].index
women_R = demo_R.loc[female_R & non_org_R].index
teens_R = demo_R.loc[under_eightteen_R & non_org_R].index
twenties_R = demo_R.loc[nineteen_twentynine_R & non_org_R].index
thirties_R = demo_R.loc[thirty_thirtynine_R & non_org_R].index
adults_R = demo_R.loc[over_fourty_R & non_org_R].index

Count the users in each subcategory; these totals are the denominators for the percentages

In [9]:
# Totals per cohort.
#   users_X  : number of rows in the user-information table
#   filt_X   : sum of the "in_filtered" column
#   gender_X : accounts assigned to either gender group
#   age_X    : accounts assigned to any of the four age brackets
#
# The "in_filtered" column is selected *before* summing. Summing the whole
# frame first (`df.sum()["in_filtered"]`) aggregates every column just to read
# one value, and raises on recent pandas versions if any other column is not
# summable.
users_D = user_info_D.shape[0]
filt_D = user_info_D["in_filtered"].sum()
gender_D = men_D.size + women_D.size
age_D = teens_D.size + twenties_D.size + thirties_D.size + adults_D.size

users_R = user_info_R.shape[0]
filt_R = user_info_R["in_filtered"].sum()
gender_R = men_R.size + women_R.size
age_R = teens_R.size + twenties_R.size + thirties_R.size + adults_R.size

# Initialize Table

In [10]:
# Row labels: one overall row, then an "all" row followed by one row per
# subgroup, first for gender and then for age.
row_labels = [
    ('total', 'individuals'),
    ('gender', 'all'),
    ('gender', 'male'),
    ('gender', 'female'),
    ('age', 'all'),
    ('age', '<=18'),
    ('age', '19-29'),
    ('age', '30-39'),
    ('age', '>=40'),
]
idx = pd.MultiIndex.from_tuples(row_labels)

# Column labels: for each cohort ($D$, $R$) an accounts column and a tweets column.
col = pd.MultiIndex.from_tuples(
    [
        (cohort, measure)
        for cohort in (r"$D$", r"$R$")
        for measure in (r"\# accounts", r"\# tweets")
    ]
)

# Start with every cell set to the empty string; later cells fill in formatted text.
table = pd.DataFrame(data="", index=idx, columns=col)

Fill the totals row

In [11]:
# The totals row holds the number of non-organisation accounts per cohort;
# it is the 100% reference for the gender/age "all" rows.
total_row = ('total', 'individuals')
for cohort, members in ((r"$D$", all_D), (r"$R$", all_R)):
    table.loc[total_row, (cohort, r"\# accounts")] = r"{:,} (100.00%)".format(members.size)

Fill gender rows

In [12]:
# Fill the gender rows for both cohorts from one description per cohort instead
# of one hand-written assignment per cell.
#
# Each entry: (column label, all individuals, number of accounts with a gender
# assignment, tweet-count table, {row label: members of that gender group}).
gender_cohorts = (
    (r"$D$", all_D, gender_D, tweet_count_D, {'male': men_D, 'female': women_D}),
    (r"$R$", all_R, gender_R, tweet_count_R, {'male': men_R, 'female': women_R}),
)

for cohort, everyone, assigned, tweet_counts, groups in gender_cohorts:
    # "all" row: share of individuals that received a gender assignment.
    table.loc[('gender', 'all'), (cohort, r"\# accounts")] = r"{:,} ({:6.2f}%)".format(
        assigned, 100 * assigned / everyone.size
    )

    for group, members in groups.items():
        # Subgroup percentages are relative to the accounts with a gender
        # assignment, not to all individuals.
        table.loc[('gender', group), (cohort, r"\# accounts")] = r"{:,} ({:6.2f}%)".format(
            members.size, 100 * members.size / assigned
        )
        # Tweet totals are whole numbers; ".0f" keeps a float-typed sum from
        # being rendered with a trailing ".0" (e.g. "400,444.0").
        table.loc[('gender', group), (cohort, r"\# tweets")] = r"{:,.0f}".format(
            tweet_counts.loc[members, "count"].sum()
        )

Fill age rows

In [13]:
# Fill the age rows for both cohorts from one description per cohort instead
# of one hand-written assignment per cell.
#
# All column labels are raw strings: "\#" in a normal string literal is an
# invalid escape sequence and triggers a warning on current Python versions.
#
# Each entry: (column label, all individuals, number of accounts with an age
# assignment, tweet-count table, {row label: members of that age bracket}).
age_cohorts = (
    (r"$D$", all_D, age_D, tweet_count_D,
     {'<=18': teens_D, '19-29': twenties_D, '30-39': thirties_D, '>=40': adults_D}),
    (r"$R$", all_R, age_R, tweet_count_R,
     {'<=18': teens_R, '19-29': twenties_R, '30-39': thirties_R, '>=40': adults_R}),
)

for cohort, everyone, assigned, tweet_counts, brackets in age_cohorts:
    # "all" row: share of individuals that received an age assignment.
    table.loc[('age', 'all'), (cohort, r"\# accounts")] = r"{:,} ({:6.2f}%)".format(
        assigned, 100 * assigned / everyone.size
    )

    for bracket, members in brackets.items():
        # Bracket percentages are relative to the accounts with an age
        # assignment, not to all individuals.
        table.loc[('age', bracket), (cohort, r"\# accounts")] = r"{:,} ({:6.2f}%)".format(
            members.size, 100 * members.size / assigned
        )
        # Tweet totals are whole numbers; ".0f" keeps a float-typed sum from
        # being rendered with a trailing ".0" (e.g. "158,595.0").
        table.loc[('age', bracket), (cohort, r"\# tweets")] = r"{:,.0f}".format(
            tweet_counts.loc[members, "count"].sum()
        )

# Table

In [14]:
# Show the completed summary table (bare last expression -> rich notebook display).
table

Unnamed: 0_level_0,Unnamed: 1_level_0,$D$,$D$,$R$,$R$
Unnamed: 0_level_1,Unnamed: 1_level_1,\# accounts,\# tweets,\# accounts,\# tweets
total,individuals,"1,035 (100.00%)",,"7,349 (100.00%)",
gender,all,887 ( 85.70%),,"6,231 ( 84.79%)",
gender,male,268 ( 30.21%),400444.0,"3,313 ( 53.17%)",3403224.0
gender,female,619 ( 69.79%),908850.0,"2,918 ( 46.83%)",2504347.0
age,all,687 ( 66.38%),,"4,934 ( 67.14%)",
age,<=18,152 ( 22.13%),158595.0,"1,200 ( 24.32%)",694398.0
age,19-29,318 ( 46.29%),463811.0,"1,648 ( 33.40%)",1483615.0
age,30-39,135 ( 19.65%),245245.0,845 ( 17.13%),998023.0
age,>=40,82 ( 11.94%),134323.0,"1,241 ( 25.15%)",1401708.0


In [15]:
# Write the table with "&" as the field separator so that the rows can be
# pasted into a LaTeX tabular body.
# NOTE(review): to_csv emits neither LaTeX row terminators ("\\") nor escapes
# for the "%" characters in the cells -- presumably the file is post-processed
# by hand; confirm.
table.to_csv(
    path_or_buf="figures/Table2.tex",
    sep="&",
)