This notebook was used in the filtering process.

In [1]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.model_selection
import sktime.datatypes
from sklearn.linear_model import RidgeClassifierCV
from sklearn.pipeline import make_pipeline

from sktime.datasets import load_arrow_head  # univariate dataset
from sktime.datasets import load_basic_motions  # multivariate dataset
from sktime.transformations.panel.rocket import Rocket
from sktime.forecasting.model_selection import temporal_train_test_split
from sklearn import preprocessing
import tensorflow as tf
import tensorboard

In [2]:
#depression tweets
dt = pd.read_csv("data/depression_tweets.tsv", sep="\t", index_col="tweet_id", parse_dates=[2])
#dt["created_at"] = pd.to_datetime(dt["created_at"])
n_depression = len(dt['user_id'].unique())
print(n_depression)

#sample tweets
st = pd.read_csv("data/sample_tweets.tsv", sep="\t", index_col="tweet_id", parse_dates=[2])
#st["created_at"] = pd.to_datetime(st["created_at"])

#filtered users
fu = pd.read_csv('filtered_user_lists.csv')
fu_tbt = pd.read_csv('filtered_user_lists_only_tpd_tbt.csv')
fu_maxtime = pd.read_csv('filtered_user_lists_only_tpd_maxtime.csv')
#tpd pairs
tpd_pairs = pd.read_csv('pairs_tpd_users.csv')

603


In [3]:
filtered_users = pd.concat([fu, fu_tbt, fu_maxtime], ignore_index=True)
filtered_users.head()

Unnamed: 0.1,Unnamed: 0,min_tpd,max_days_bt,min_days_active,user_list
0,0,1,1,30,"['uR0001', 'uR0002', 'uR0003', 'uR0007', 'uR00..."
1,1,1,1,60,"['uR0001', 'uR0002', 'uR0003', 'uR0007', 'uR00..."
2,2,1,1,90,"['uR0001', 'uR0002', 'uR0003', 'uR0007', 'uR00..."
3,3,1,1,120,"['uR0001', 'uR0002', 'uR0003', 'uR0007', 'uR00..."
4,4,1,1,150,"['uR0001', 'uR0002', 'uR0003', 'uR0007', 'uR00..."


In [6]:
from tqdm import tqdm
from sklearn.model_selection import  train_test_split
writer = tf.summary.create_file_writer('log/filtered_sampling_test')
result = []

with writer.as_default():
    for j in tqdm(range(0, 1030)):
        current = filtered_users.iloc[j]
        current_user_list = eval(current.user_list)

        #get distribution of current filtered list
        avg_dist = {}
        current_tpds = tpd_pairs[tpd_pairs['user id'].isin(set(current_user_list))]
        for avg in current_tpds['tweets per day'].unique():
            tmp_dist = float((current_tpds[current_tpds['tweets per day']==avg].count()/len(current_tpds)).iloc[0])
            avg_dist.update({str(avg): str(tmp_dist)})

        dist_df = pd.DataFrame(avg_dist.items(), columns=['tweets per day', 'percentage'])
        dist_df = dist_df.astype({'tweets per day': 'int32'})
        dist_df = dist_df.astype({'percentage': 'float'})
        dist_df = dist_df.sort_values(by=['tweets per day'])

        #downsample current user list based on distribution
        current_samples = pd.DataFrame(columns=['user id', 'tweets per day'])
        for tpd in current_tpds['tweets per day'].unique():
            tmp_df = current_tpds[current_tpds['tweets per day']==tpd]
            frac = float(dist_df.loc[dist_df['tweets per day']==tpd]['percentage'].iloc[0])
            if int(n_depression*frac)<len(tmp_df):
                sampled_tmp_df = tmp_df.sample(n=int(n_depression*frac), random_state=42)
            else:
                sampled_tmp_df = tmp_df
            current_samples = current_samples._append(sampled_tmp_df, ignore_index=True)

        present_tpd = current_samples['tweets per day'].unique().tolist()
        all_tpd = dist_df['tweets per day'].unique().tolist()
        diff_tpd = list(set(present_tpd).symmetric_difference(set(all_tpd)))
        dist_df = dist_df.sort_values('percentage', ascending=False)

        needed_tpd = []
        counter = n_depression-len(current_samples['user id'].unique())
        if counter > 0:
            for tpd in dist_df['tweets per day'].unique():
                if tpd in diff_tpd and counter > 0:
                    counter = counter - 1
                    tmp_df = current_tpds[current_tpds['tweets per day']==tpd]
                    sampled_tmp_df = tmp_df.sample(n=1, random_state=42)
                    current_samples = current_samples._append(sampled_tmp_df, ignore_index=True)

        #upsample/downsample depression cohort accordingly
        data = {}
        labels = []
        for user in dt.user_id.unique():
            tmp = dt[dt['user_id'] == user]
            tweet_list = tmp['created_at'].map(lambda datetime: int(round(datetime.timestamp()))).to_numpy()
            data.update({''+user+'': tweet_list})
            labels.append(1)

        for user in current_samples['user id'].unique():
            tmp = st[st['user_id'] == user]
            tweet_list = tmp['created_at'].map(lambda datetime: int(round(datetime.timestamp()))).to_numpy()
            data.update({''+user+'': tweet_list})
            labels.append(0)

        df_data = pd.DataFrame(dict([(key, pd.Series(value)) for key, value in data.items()]))
        df_labels = pd.DataFrame(labels)

        df_data = df_data.transpose()
        #df_data = df_data.drop(['Unnamed: 0'])
        #df_labels = df_labels.drop(columns=['Unnamed: 0'])
        df_data.index = range(0, len(df_data.index))
        df_data.sort_index()
        df_data = df_data.fillna(0)
        X_train, X_test, y_train, y_test  = train_test_split(df_data, df_labels, test_size=0.2)

        #run ml and safe result with filters
        rocket = Rocket()
        #rocket.fit(X_train)
        #X_train_transform = rocket.transform(X_train)
        classifier = RidgeClassifierCV(alphas=np.logspace(-3,3,10))
        classifier.fit(X_train, y_train)
        #print(X_test)
        #X_test_transform = rocket.transform(X_test)
        X_test = X_test.replace(-float('inf'), -sys.float_info.min)
        score = classifier.score(X_test, y_test)
        tf.summary.scalar('score', score, j)

        result.append({'min_tpd': current['min_tpd'], 'max_days_bt': current['max_days_bt'], 'min_days_active': current['min_days_active'], 'user_list': current_user_list, 'result': score})

df_filtered = pd.DataFrame(result)
df_filtered.to_csv('tuning_results_new_split.csv')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [3]:
df_filtered = pd.DataFrame(result)
df_filtered.to_csv('tuning_results.csv')

NameError: name 'result' is not defined

In [8]:
df_filtered['result'].sort_values(ascending=False)

771    0.843750
800    0.828125
834    0.820312
791    0.789062
825    0.781250
         ...   
3      0.475207
964    0.471074
847    0.462810
841    0.462810
963    0.458678
Name: result, Length: 1030, dtype: float64

In [11]:
df_filtered.iloc[771]

min_tpd                                                           10
max_days_bt                                                        2
min_days_active                                                  120
user_list          [uR0036, uR0129, uR0134, uR0951, uR1019, uR103...
result                                                       0.84375
Name: 771, dtype: object

In [10]:
df_filtered

Unnamed: 0,min_tpd,max_days_bt,min_days_active,user_list,result
0,1,1,30,"[uR0001, uR0002, uR0003, uR0007, uR0011, uR001...",0.561983
1,1,1,60,"[uR0001, uR0002, uR0003, uR0007, uR0011, uR001...",0.564315
2,1,1,90,"[uR0001, uR0002, uR0003, uR0007, uR0011, uR001...",0.625000
3,1,1,120,"[uR0001, uR0002, uR0003, uR0007, uR0011, uR001...",0.475207
4,1,1,150,"[uR0001, uR0002, uR0003, uR0007, uR0011, uR001...",0.595041
...,...,...,...,...,...
1025,10,3,1,"[uR0010, uR0011, uR0014, uR0015, uR0016, uR002...",0.641667
1026,10,4,1,"[uR0010, uR0011, uR0014, uR0015, uR0016, uR002...",0.658333
1027,10,5,1,"[uR0010, uR0011, uR0014, uR0015, uR0016, uR002...",0.620833
1028,10,6,1,"[uR0010, uR0011, uR0014, uR0015, uR0016, uR002...",0.645833
