In [None]:
import os
import json
import gzip
import random
import logging

import numpy as np
import pandas as pd

In [None]:
# Record layout used for all log output: timestamp, severity, message.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

# Logger named after the current module, for this notebook's own messages.
logger = logging.getLogger(__name__)

In [None]:
# Switch the working directory so the relative `data/...` paths used by later
# cells resolve.
# NOTE(review): the location is hard-coded to ~/github/masthesis/.
repo_dir = os.path.expanduser('~/github/masthesis/')
os.chdir(repo_dir)

In [None]:
# Single seed value shared by both random number generators.
seed = 2_969_591_811

# Seed NumPy's global generator and the standard-library generator.
np.random.seed(seed)
random.seed(seed)

# Tweets and users

## Load data

In [None]:
# User/show sample table; later cells read its `user_id` and `show_id` columns.
user_show = pd.read_csv(filepath_or_buffer='data/samples/twitter/user-show.csv')

In [None]:
# Distinct `user_id` values present in the processed tweets file.
# Only that one column is needed here, so restrict the read with `usecols`
# instead of loading the whole table, building an index and parsing timestamps.
user_ids = set(
    pd.read_csv('data/twitter/tweets-processed.csv', usecols=['user_id'])['user_id']
    .unique()
    .tolist()
)

In [None]:
follow_graph = pd.read_csv('data/samples/twitter/follow-graph.csv')

# Every user ID that appears in either the source or the target column.
fg_user_ids = set(follow_graph['source_user_id'].tolist()).union(
    follow_graph['target_user_id'].tolist()
)

### 2019-2020

In [None]:
# Processed tweets, indexed by tweet id, with `timestamp` parsed as datetimes.
elite_data = pd.read_csv(
    'data/twitter/tweets-processed.csv',
    index_col='id',
    parse_dates=['timestamp'],
)

# Keep rows whose timestamp falls in either study window; both endpoints of
# each window are inclusive.
in_2019_window = elite_data['timestamp'].between('2019-09-01', '2019-11-01')
in_2020_window = elite_data['timestamp'].between('2020-03-01', '2020-05-01')
elite_data = elite_data[in_2019_window | in_2020_window]

# Drop rows with missing tweet content.
elite_data = elite_data[elite_data['content'].notna()]
# => 2_051_393

### 2021

In [None]:
# 2021 tweets (tab-separated, gzip-compressed file).
tw2021 = pd.read_csv(
    'data/paper-round-3/twitter/social-polls-2021.csv.gz',
    sep='\t',
)

# Restrict to users that also appear in the processed tweets file.
tw2021 = tw2021[tw2021['user_id'].isin(user_ids)]

# Sanity check: every remaining row has its own distinct tweet id.
assert tw2021['id'].nunique() == len(tw2021)

## Count of tweets

In [None]:
# Number of 2019-2020 tweets kept after filtering.
len(elite_data)
# => 2_051_393

In [None]:
# Number of 2021 tweets kept after filtering.
len(tw2021)
# => 551_187

In [None]:
# Total tweets across both datasets.
len(elite_data) + len(tw2021)
# => 2_602_580

## Count of users with tweets

Not all users have tweets, so our final number of users is higher than the counts below.

In [None]:
# Number of distinct users with at least one tweet in the 2019-2020 data.
elite_data['user_id'].nunique()
# NOTE(review): the value previously recorded here (2_051_393) is identical to
# the tweet count recorded for `elite_data.shape[0]`, so it looks copy-pasted
# rather than a real user count. Re-run this cell and record the actual output.

In [None]:
# Number of distinct users with at least one tweet in the 2021 data.
tw2021['user_id'].nunique()
# NOTE(review): the value previously recorded here (551_187) is identical to
# the tweet count recorded for `tw2021.shape[0]`, so it looks copy-pasted
# rather than a real user count. Re-run this cell and record the actual output.

In [None]:
# Distinct users with a tweet in either dataset.
pd.concat([tweets['user_id'] for tweets in (elite_data, tw2021)]).nunique()

## Count of users, period

In [None]:
# Number of distinct user IDs found in the processed tweets file.
len(user_ids)

In [None]:
# Number of distinct user IDs appearing as source or target in the follow graph.
len(fg_user_ids)

In [None]:
# Users present in the tweets file, the follow graph, or both.
len(user_ids.union(fg_user_ids))

## Count of Twitter-matched shows

### With a user ID at all

This includes a couple of shows whose collected users all turned out to have protected accounts. We've ignored them in all subsequent analyses.

In [None]:
# Number of distinct show IDs in the user-show table.
user_show['show_id'].nunique()

### With any Twitter data

In [None]:
# Shows with at least one user found in the tweets file or the follow graph.
# Testing membership in the union of the two ID sets is equivalent to OR-ing
# the two separate membership tests.
user_show.loc[user_show['user_id'].isin(user_ids | fg_user_ids), 'show_id'].nunique()

### With tweets

In [None]:
# Shows with at least one user found in the tweets file.
user_show[user_show['user_id'].isin(user_ids)]['show_id'].nunique()

## Count of Twitter-matched users

In [None]:
# Number of distinct user IDs in the user-show table.
user_show['user_id'].nunique()

In [None]:
# Matched users that also appear in the tweets file.
user_show['user_id'][user_show['user_id'].isin(user_ids)].nunique()

In [None]:
# Matched users that also appear in the follow graph.
user_show['user_id'][user_show['user_id'].isin(fg_user_ids)].nunique()

In [None]:
# Matched users found in the tweets file or the follow graph.
# Membership in the union of the two ID sets is equivalent to OR-ing the two
# separate membership tests.
user_show.loc[user_show['user_id'].isin(user_ids | fg_user_ids), 'user_id'].nunique()