Notebook was used in finding the best-performing subgroup of users.

In [6]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.model_selection
import sktime.datatypes
from sklearn.linear_model import RidgeClassifierCV
from sklearn.pipeline import make_pipeline

from sktime.datasets import load_arrow_head  # univariate dataset
from sktime.datasets import load_basic_motions  # multivariate dataset
from sktime.transformations.panel.rocket import Rocket
from sktime.forecasting.model_selection import temporal_train_test_split
from sklearn import preprocessing
import tensorflow as tf
import tensorboard

#depression tweets
dt = pd.read_csv("data/depression_tweets.tsv", sep="\t", index_col="tweet_id", parse_dates=[2])
#dt["created_at"] = pd.to_datetime(dt["created_at"])
n_depression = len(dt['user_id'].unique())
print(n_depression)

#sample tweets
st = pd.read_csv("data/sample_tweets.tsv", sep="\t", index_col="tweet_id", parse_dates=[2])
#st["created_at"] = pd.to_datetime(st["created_at"])

#filtered users
fu = pd.read_csv('filtered_user_lists.csv')
fu_tbt = pd.read_csv('filtered_user_lists_only_tpd_tbt.csv')
fu_maxtime = pd.read_csv('filtered_user_lists_only_tpd_maxtime.csv')
#tpd pairs
tpd_pairs = pd.read_csv('pairs_tpd_users.csv')
filtered_users = pd.concat([fu, fu_tbt, fu_maxtime], ignore_index=True)
filtered_users.head()


603


Unnamed: 0.1,Unnamed: 0,min_tpd,max_days_bt,min_days_active,user_list
0,0,1,1,30,"['uR0001', 'uR0002', 'uR0003', 'uR0007', 'uR00..."
1,1,1,1,60,"['uR0001', 'uR0002', 'uR0003', 'uR0007', 'uR00..."
2,2,1,1,90,"['uR0001', 'uR0002', 'uR0003', 'uR0007', 'uR00..."
3,3,1,1,120,"['uR0001', 'uR0002', 'uR0003', 'uR0007', 'uR00..."
4,4,1,1,150,"['uR0001', 'uR0002', 'uR0003', 'uR0007', 'uR00..."


In [7]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from sklearn.model_selection import train_test_split

writer = tf.summary.create_file_writer('log/filtered_sampling_test_new_sampling')
result = []

with writer.as_default():
    for j in tqdm(range(0, 1030)):
        current = filtered_users.iloc[j]
        #print(current)
        current_user_list = eval(current.user_list)

        #get distribution of current filtered list
        avg_dist = {}
        current_tpds = tpd_pairs[tpd_pairs['user id'].isin(set(current_user_list))]
        for avg in current_tpds['tweets per day'].unique():
            tmp_dist = float((current_tpds[current_tpds['tweets per day']==avg].count()/len(current_tpds)).iloc[0])
            avg_dist.update({str(avg): str(tmp_dist)})

        dist_df = pd.DataFrame(avg_dist.items(), columns=['tweets per day', 'percentage'])
        dist_df = dist_df.astype({'tweets per day': 'int32'})
        dist_df = dist_df.astype({'percentage': 'float'})
        dist_df = dist_df.sort_values(by=['tweets per day'])

        #downsample current user list based on distribution
        current_samples = pd.DataFrame(columns=['user id', 'tweets per day'])
        for tpd in current_tpds['tweets per day'].unique():
            tmp_df = current_tpds[current_tpds['tweets per day']==tpd]
            frac = float(dist_df.loc[dist_df['tweets per day']==tpd]['percentage'].iloc[0])
            if int(n_depression*frac)<len(tmp_df):
                sampled_tmp_df = tmp_df.sample(n=int(n_depression*frac), random_state=42)
            else:
                sampled_tmp_df = tmp_df
            current_samples = current_samples._append(sampled_tmp_df, ignore_index=True)

        present_tpd = current_samples['tweets per day'].unique().tolist()
        all_tpd = dist_df['tweets per day'].unique().tolist()
        diff_tpd = list(set(present_tpd).symmetric_difference(set(all_tpd)))
        dist_df = dist_df.sort_values('percentage', ascending=False) #sort percentages here

        #fill up remaining spots with random users from rs cohort not yet present in sampled users starting with most occuring tpd class in filtered user list (all tpd not already in sampled list)
        needed_tpd = []
        counter = n_depression-len(current_samples['user id'].unique())
        if counter > 0:
            for tpd in dist_df['tweets per day'].unique():
                if tpd in diff_tpd and counter > 0:
                    tmp_df = current_tpds[current_tpds['tweets per day']==tpd]
                    #make sure to not sample a user again
                    sampled_tmp_df = tmp_df.sample(n=1, random_state=42)
                    if current_samples['user id'].eq(sampled_tmp_df['user id'].tolist()[0]).any():
                        unique = False
                        stop = len(current_tpds['user id'].unique())
                        while (not unique) or (stop>0):
                            sampled_tmp_df = tmp_df.sample(n=1, random_state=42)
                            if not current_samples['user id'].eq(sampled_tmp_df['user id'].tolist()[0]).any():
                                unique = True
                            stop = stop - 1

                    current_samples = current_samples._append(sampled_tmp_df, ignore_index=True)
                    counter = counter - 1

        current_user_list = list(set(st.user_id.unique()).symmetric_difference(set(current_samples['user id'].unique())))

        avg_dist = {}
        current_tpds = tpd_pairs[tpd_pairs['user id'].isin(set(current_user_list))]
        for avg in current_tpds['tweets per day'].unique():
            tmp_dist = float((current_tpds[current_tpds['tweets per day']==avg].count()/len(current_tpds)).iloc[0])
            avg_dist.update({str(avg): str(tmp_dist)})

        dist_df = pd.DataFrame(avg_dist.items(), columns=['tweets per day', 'percentage'])
        dist_df = dist_df.astype({'tweets per day': 'int32'})
        dist_df = dist_df.astype({'percentage': 'float'})
        dist_df = dist_df.sort_values(by=['tweets per day'])

        for tpd in current_tpds['tweets per day'].unique():
            tmp_df = current_tpds[current_tpds['tweets per day']==tpd]
            frac = float(dist_df.loc[dist_df['tweets per day']==tpd]['percentage'].iloc[0])
            if int(n_depression*frac)<len(tmp_df):
                sampled_tmp_df = tmp_df.sample(n=int(n_depression*frac), random_state=42)
            else:
                sampled_tmp_df = tmp_df
            current_samples = current_samples._append(sampled_tmp_df, ignore_index=True)

        present_tpd = current_samples['tweets per day'].unique().tolist()
        all_tpd = dist_df['tweets per day'].unique().tolist()
        diff_tpd = list(set(present_tpd).symmetric_difference(set(all_tpd)))
        dist_df = dist_df.sort_values('percentage', ascending=False) #sort percentages here

        needed_tpd = []
        counter = n_depression-len(current_samples['user id'].unique())
        while counter > 0:
            for tpd in dist_df['tweets per day'].unique():
                if tpd in diff_tpd and counter > 0:
                    tmp_df = current_tpds[current_tpds['tweets per day']==tpd]
                    #make sure to not sample a user again
                    sampled_tmp_df = tmp_df.sample(n=1, random_state=42)
                    if current_samples['user id'].eq(sampled_tmp_df['user id'].tolist()[0]).any():
                        unique = False
                        stop = len(current_tpds['user id'].unique())
                        while (not unique) or (stop>0):
                            sampled_tmp_df = tmp_df.sample(n=1, random_state=42)
                            if not current_samples['user id'].eq(sampled_tmp_df['user id'].tolist()[0]).any():
                                unique = True
                            stop = stop - 1

                    current_samples = current_samples._append(sampled_tmp_df, ignore_index=True)
                    counter = counter - 1

        #upsample/downsample depression cohort accordingly
        data = {}
        labels = []
        for user in dt.user_id.unique():
            tmp = dt[dt['user_id'] == user]
            tweet_list = tmp['created_at'].map(lambda datetime: int(round(datetime.timestamp()))).to_numpy()
            data.update({'' + user + '': tweet_list})
            labels.append(1)

        for user in current_samples['user id'].unique():
            tmp = st[st['user_id'] == user]
            tweet_list = tmp['created_at'].map(lambda datetime: int(round(datetime.timestamp()))).to_numpy()
            data.update({'' + user + '': tweet_list})
            labels.append(0)

        df_data = pd.DataFrame(dict([(key, pd.Series(value)) for key, value in data.items()]))
        df_labels = pd.DataFrame(labels)

        df_data = df_data.transpose()
        #df_data = df_data.drop(['Unnamed: 0'])
        #df_labels = df_labels.drop(columns=['Unnamed: 0'])
        df_data.index = range(0, len(df_data.index))
        df_data.sort_index()
        df_data = df_data.fillna(0)
        X_train, X_test, y_train, y_test = train_test_split(df_data, df_labels, test_size=0.2)

        classifier_pipe = make_pipeline(StandardScaler(), SVC())


        classifier_pipe.fit(X_train, y_train)

        score = classifier_pipe.score(X_test, y_test.to_numpy()[:,0])
        tf.summary.scalar('score', score, j)

        result.append({'min_tpd': current['min_tpd'], 'max_days_bt': current['max_days_bt'], 'min_days_active': current['min_days_active'], 'user_list': current_user_list, 'result': score})

df_filtered = pd.DataFrame(result)
df_filtered.to_csv('tuning_results_new_split_new_sampling.csv')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [59]:
fs_results1 = pd.read_csv('tuning_results.csv')
fs_results2 = pd.read_csv('tuning_results2.csv')
filtered_results_new_sampling = pd.read_csv('tuning_results_new_split_new_sampling.csv')

fu = pd.read_csv('filtered_user_lists.csv')
fu_tbt = pd.read_csv('filtered_user_lists_only_tpd_tbt.csv')
fu_maxtime = pd.read_csv('filtered_user_lists_only_tpd_maxtime.csv')


filtered_results_new_sampling['result'].sort_values(ascending=False)

948     0.747159
965     0.744382
424     0.736544
1023    0.732394
903     0.731638
          ...   
564     0.567164
541     0.567164
531     0.559701
543     0.552239
582     0.544776
Name: result, Length: 1030, dtype: float64

In [40]:
tmp_list = filtered_results_new_sampling.iloc[948].user_list
filtered_results_new_sampling.iloc[948]

Unnamed: 0                                                       948
min_tpd                                                           10
max_days_bt                                                        1
min_days_active                                                   30
user_list          ['uR5133', 'uR5187', 'uR4528', 'uR5839', 'uR51...
result                                                      0.747159
Name: 948, dtype: object

In [51]:
filtered_users = pd.concat([fu, fu_tbt, fu_maxtime], ignore_index=True)

snip1 = filtered_users[filtered_users['min_tpd']==10]
snip2 = snip1[snip1['max_days_bt']==1]
snip3 = snip2[snip2['min_days_active']==30]
snip3
#len(eval(filtered_users.iloc[948].user_list))

Unnamed: 0.1,Unnamed: 0,min_tpd,max_days_bt,min_days_active,user_list
756,756,10,1,30,"['uR0011', 'uR0014', 'uR0015', 'uR0022', 'uR00..."
948,108,10,1,30,"['uR0011', 'uR0014', 'uR0015', 'uR0022', 'uR00..."


In [56]:
set(eval(filtered_results_new_sampling.iloc[948].user_list)) == set(eval(filtered_users.iloc[756].user_list))

False

In [58]:
list(set(eval(filtered_users.iloc[756].user_list)).symmetric_difference(set(eval(tmp_list))))

['uR4782',
 'uR1750',
 'uR5056',
 'uR0233',
 'uR5187',
 'uR7126',
 'uR4528',
 'uR5839',
 'uR5149',
 'uR6114',
 'uR1220',
 'uR6509',
 'uR0946',
 'uR6608',
 'uR0272',
 'uR6179',
 'uR4362',
 'uR5138',
 'uR7276',
 'uR2323',
 'uR5131',
 'uR1029',
 'uR2706',
 'uR3169',
 'uR0340',
 'uR3341',
 'uR3612',
 'uR1085',
 'uR0634',
 'uR5513',
 'uR1006',
 'uR6164',
 'uR3791',
 'uR5484',
 'uR0542',
 'uR2601',
 'uR2703',
 'uR1929',
 'uR5151',
 'uR2773',
 'uR1331',
 'uR5265',
 'uR1792',
 'uR5724',
 'uR4027',
 'uR0818',
 'uR2457',
 'uR0300',
 'uR3605',
 'uR1383',
 'uR6147',
 'uR0067',
 'uR3557',
 'uR3839',
 'uR2299',
 'uR4497',
 'uR2762',
 'uR0703',
 'uR6640',
 'uR6752',
 'uR3337',
 'uR3590',
 'uR0066',
 'uR5261',
 'uR1951',
 'uR7063',
 'uR1603',
 'uR2536',
 'uR3487',
 'uR0420',
 'uR0727',
 'uR4405',
 'uR4236',
 'uR3645',
 'uR2816',
 'uR6019',
 'uR3750',
 'uR0102',
 'uR1925',
 'uR1572',
 'uR0051',
 'uR1562',
 'uR1413',
 'uR2163',
 'uR0169',
 'uR5962',
 'uR5530',
 'uR1837',
 'uR3498',
 'uR4299',
 'uR6523',