In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [2]:
home_dir = '/g100/home/userexternal/mhabibi0/'
work_dir = '/g100_work/IscrC_mental'

hdata_dir = os.path.join(home_dir, 'Data')
wdata_dir = os.path.join(work_dir, 'data')
uc_dir = os.path.join(wdata_dir, 'user_classification')
model_dir = os.path.join(home_dir, 'Models', 'Age')

In [3]:
# user age data train
path  = os.path.join(uc_dir, 'data_for_models_train.pkl')
df = pd.read_pickle(path)

# Discretize the 'age' column into four classes
age_intervals = [0, 19, 30, 40, 100]
age_labels = [0, 1, 2, 3]
df = df[df['age']<=99]

df['age_class'] = pd.cut(df['age'], bins=age_intervals, labels=age_labels, right=False).astype(int)
df['male'] = df['is_male'].astype(int)

# train valid split
df_train, df_valid = train_test_split(df[['user_id', 'age', 'age_class', 'male', 'masked_bio']], test_size=0.1, random_state=42)

In [4]:
# user age data test
path  = os.path.join(uc_dir, 'data_for_models_test.pkl')
df_test = pd.read_pickle(path)

# Discretize the 'age' column into four classes
age_intervals = [0, 19, 30, 40, 100]
age_labels = [0, 1, 2, 3]
df_test = df_test[df_test['age']<=99]
df_test['age_class'] = pd.cut(df_test['age'], bins=age_intervals, labels=age_labels, right=False).astype(int)

df_test['male'] = df_test['is_male'].astype(int)

In [6]:
# percentages :
obs_train = df_train.shape[0]
obs_valid = df_valid.shape[0]
obs_test = df_test.shape[0]
obs_all = obs_train + obs_valid + obs_test

print(f'N train: {obs_train}')
print(f'N valid: {obs_valid}')
print(f'N test: {obs_test}')
print(f'% training data: {100 * obs_train/obs_all}')
print(f'% valid data: {100 * obs_valid/obs_all}')
print(f'% test data: {100 * obs_test/obs_all}')

N train: 17262
N valid: 1919
N test: 1119
% training data: 85.03448275862068
% valid data: 9.45320197044335
% test data: 5.512315270935961


In [7]:
# share females

print(f"share females train: {np.round(100*(1- df_train['male'].mean()),1)}%")
print(f"share females valid: {np.round(100*(1- df_valid['male'].mean()),1)}%")
print(f"share females test: {np.round(100*(1- df_test['male'].mean()),1)}%")

share females train: 37.2%
share females valid: 37.4%
share females test: 35.7%


In [8]:
# average ages

print(f"average age train: {np.round(df_train['age'].mean(),1)}")
print(f"average age valid: {np.round(df_valid['age'].mean(),1)}")
print(f"average age test: {np.round(df_test['age'].mean(),1)}")

average age train: 42.3
average age valid: 42.7
average age test: 42.3


In [9]:
# with bio 

share_train_with_bio = df_train[df_train['masked_bio']!=''].shape[0] / obs_train
share_valid_with_bio = df_valid[df_valid['masked_bio']!=''].shape[0] / obs_valid
share_test_with_bio = df_test[df_test['masked_bio']!=''].shape[0] / obs_test

print(f"share w bio train: {np.round(share_train_with_bio,2)}")
print(f"share w bio valid: {np.round(share_valid_with_bio ,2)}")
print(f"share w bio test: {np.round(share_test_with_bio,2)}")

share w bio train: 0.47
share w bio valid: 0.45
share w bio test: 0.46


In [10]:
# N tweets by User
path = os.path.join(uc_dir, 'tweets_by_user_id_clean.pkl')
df_twt = pd.read_pickle(path).drop(columns=['text', 'created_at'])

df_twt['RT'] = df_twt['RT'].astype(int)


In [11]:
agg_func= {'tweet_id': 'count', 'RT': 'sum'}
df_utwt = df_twt.groupby('user_id').agg(agg_func).reset_index().\
            rename(columns={'tweet_id': 'n_tweets', 'RT': 'n_rt'})

df_train_twt = df_train.merge(df_utwt, on='user_id')
df_valid_twt = df_valid.merge(df_utwt, on='user_id')
df_test_twt = df_test.merge(df_utwt, on='user_id')

In [12]:
# number of tweets:

n_tweets_train = df_train_twt['n_tweets'].sum()
n_tweets_valid = df_valid_twt['n_tweets'].sum()
n_tweets_test = df_test_twt['n_tweets'].sum()

print(f'n tweets in train: {n_tweets_train}')
print(f'n tweets in valid: {n_tweets_valid}')
print(f'n tweets in test: {n_tweets_test}')

n tweets in train: 24504398
n tweets in valid: 2763834
n tweets in test: 1571178


In [17]:
# share retweets

n_rt_train = df_train_twt['n_rt'].sum()
n_rt_valid = df_valid_twt['n_rt'].sum()
n_rt_test = df_test_twt['n_rt'].sum()

print(f'share RT in train: {np.round(n_tweets_train /n_tweets_train ,3)}')
print(f'share RT in valid: {np.round(n_rt_valid/n_tweets_valid ,3)}')
print(f'share RT in test: {np.round(n_rt_test/n_tweets_test ,3)}')

share RT in train: 0.238
share RT in valid: 0.246
share RT in test: 0.195


In [18]:
# Tweets per user

print(f'Tweet/User train: {np.round(n_rt_train/obs_train  ,1)}')
print(f'Tweet/User valid: {np.round(n_rt_valid/obs_valid  ,1)}')
print(f'Tweet/User test: {np.round(n_rt_test/obs_test  ,1)}')

Tweet/User train: 337.9
Tweet/User valid: 354.7
Tweet/User test: 273.7


In [14]:
# share of users with images
himg_dir = os.path.join(hdata_dir, 'images')
himg_test_dir = os.path.join(himg_dir, 'test')
himg_train_dir = os.path.join(himg_dir, 'train')

images_train_valid = os.listdir(os.path.join(himg_train_dir, 'images_resized'))
images_test = os.listdir(os.path.join(himg_test_dir, 'images_resized'))

images_id_train_valid = [int(f.split('.')[0]) for f in images_train_valid]
images_id_test = [int(f.split('.')[0]) for f in images_test]

df_train['has_image'] = df_train['user_id'].apply(lambda x: 1 if x in images_id_train_valid else 0)
df_valid['has_image'] = df_valid['user_id'].apply(lambda x: 1 if x in images_id_train_valid else 0)
df_test['has_image'] = df_test['user_id'].apply(lambda x: 1 if x in images_id_test else 0)


print(f"share w img train: {np.round(df_train['has_image'].mean(),2)}")
print(f"share w img valid: {np.round(df_valid['has_image'].mean(),2)}")
print(f"share w img test: {np.round(df_test['has_image'].mean(),2)}")

share w img train: 0.8
share w img valid: 0.78
share w img test: 0.99


In [19]:
## location
import sqlite3  as sql    
dbase_path = os.path.join(work_dir, 'data', 'database', 'MENTALISM.db')
connection = sql.connect(dbase_path)
cursor = connection.cursor()

In [36]:
# Query to get all table names
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

# # Print the table names
# for table in tables:
#     print(table[0])
    
df_geo = pd.read_sql('SELECT user_id, foreign_country, region_code, city_id  FROM user_geocoded', connection)

df_geo['foreign'] = pd.to_numeric(df_train_geo['foreign_country'], errors = 'coerce') 
df_geo['city_code'] = pd.to_numeric(df_train_geo['city_id'], errors = 'coerce') 
df_geo = df_geo[['user_id', 'foreign', 'region_code' , 'city_code']]

tweets
user
user_geocoded_old
user_geocoded


In [37]:
df_train_geo = df_train.merge(df_geo, on='user_id')
df_valid_geo = df_valid.merge(df_geo, on='user_id')
df_test_geo = df_test.merge(df_geo, on='user_id')

In [44]:
# percent with location

n_wloc_train = len(df_train_geo[(df_train_geo['region_code'].notna()) | (df_train_geo['foreign'].notna())]) 
n_wloc_valid = len(df_valid_geo[(df_valid_geo['region_code'].notna()) | (df_valid_geo['foreign'].notna())]) 
n_wloc_test = len(df_test_geo[(df_test_geo['region_code'].notna()) | (df_test_geo['foreign'].notna())]) 

print(f"share w. loc train: {np.round(100 * n_wloc_train/ obs_train,1)}%")
print(f"share w. loc valid: {np.round(100 * n_wloc_valid/ obs_valid,1)}%")
print(f"share w. loc test: {np.round(100 * n_wloc_test/ obs_test,1)}%")

share w. loc train: 43.0%
share w. loc valid: 41.1%
share w. loc test: 47.2%
