This notebook handles the hyperparameter tuning for the supervised learning classifiers and additionally the random forest and ada boost classifier.

In [1]:
import pandas as pd
from sktime.classification.hybrid import HIVECOTEV2
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import tensorflow as tf
from sklearn.metrics import precision_recall_fscore_support
from sklearn.neighbors import KNeighborsClassifier
import time

In [2]:
#depression tweets
dt = pd.read_csv("data/depression_tweets.tsv", sep="\t", index_col="tweet_id", parse_dates=[2])
#dt["created_at"] = pd.to_datetime(dt["created_at"])
n_depression = len(dt['user_id'].unique())
print(n_depression)

#sample tweets
st = pd.read_csv("data/sample_tweets.tsv", sep="\t", index_col="tweet_id", parse_dates=[2])
#st["created_at"] = pd.to_datetime(st["created_at"])

#filtered users
fu = pd.read_csv('filtered_user_lists.csv')
fu_tbt = pd.read_csv('filtered_user_lists_only_tpd_tbt.csv')
fu_maxtime = pd.read_csv('filtered_user_lists_only_tpd_maxtime.csv')
#tpd pairs
tpd_pairs = pd.read_csv('pairs_tpd_users.csv')
filtered_users = pd.concat([fu, fu_tbt, fu_maxtime], ignore_index=True)

603


In [3]:
j = 948
current = filtered_users.iloc[j]
print(current)
current_user_list = eval(current.user_list)

#get distribution of current filtered list
avg_dist = {}
current_tpds = tpd_pairs[tpd_pairs['user id'].isin(set(current_user_list))]
for avg in current_tpds['tweets per day'].unique():
    tmp_dist = float((current_tpds[current_tpds['tweets per day']==avg].count()/len(current_tpds)).iloc[0])
    avg_dist.update({str(avg): str(tmp_dist)})

dist_df = pd.DataFrame(avg_dist.items(), columns=['tweets per day', 'percentage'])
dist_df = dist_df.astype({'tweets per day': 'int32'})
dist_df = dist_df.astype({'percentage': 'float'})
dist_df = dist_df.sort_values(by=['tweets per day'])

#downsample current user list based on distribution
current_samples = pd.DataFrame(columns=['user id', 'tweets per day'])
for tpd in current_tpds['tweets per day'].unique():
    tmp_df = current_tpds[current_tpds['tweets per day']==tpd]
    frac = float(dist_df.loc[dist_df['tweets per day']==tpd]['percentage'].iloc[0])
    if int(n_depression*frac)<len(tmp_df):
        sampled_tmp_df = tmp_df.sample(n=int(n_depression*frac), random_state=42)
    else:
        sampled_tmp_df = tmp_df
    current_samples = current_samples._append(sampled_tmp_df, ignore_index=True)

present_tpd = current_samples['tweets per day'].unique().tolist()
all_tpd = dist_df['tweets per day'].unique().tolist()
diff_tpd = list(set(present_tpd).symmetric_difference(set(all_tpd)))
dist_df = dist_df.sort_values('percentage', ascending=False) #sort percentages here

#fill up remaining spots with random users from rs cohort not yet present in sampled users starting with most occuring tpd class in filtered user list (all tpd not already in sampled list)
needed_tpd = []
counter = n_depression-len(current_samples['user id'].unique())
if counter > 0:
    for tpd in dist_df['tweets per day'].unique():
        if tpd in diff_tpd and counter > 0:
            tmp_df = current_tpds[current_tpds['tweets per day']==tpd]
            #make sure to not sample a user again
            sampled_tmp_df = tmp_df.sample(n=1, random_state=42)
            if current_samples['user id'].eq(sampled_tmp_df['user id'].tolist()[0]).any():
                unique = False
                stop = len(current_tpds['user id'].unique())
                while (not unique) or (stop>0):
                    sampled_tmp_df = tmp_df.sample(n=1, random_state=42)
                    if not current_samples['user id'].eq(sampled_tmp_df['user id'].tolist()[0]).any():
                        unique = True
                    stop = stop - 1

            current_samples = current_samples._append(sampled_tmp_df, ignore_index=True)
            counter = counter - 1
print(counter)
print(len(current_samples['user id'].unique()))
#if still space left, fill again, this time not caring if tpd is already in final list with samples from all users in rs cohort, with disregard to filtered users

current_user_list = list(set(st.user_id.unique()).symmetric_difference(set(current_samples['user id'].unique())))

avg_dist = {}
current_tpds = tpd_pairs[tpd_pairs['user id'].isin(set(current_user_list))]
for avg in current_tpds['tweets per day'].unique():
    tmp_dist = float((current_tpds[current_tpds['tweets per day']==avg].count()/len(current_tpds)).iloc[0])
    avg_dist.update({str(avg): str(tmp_dist)})

dist_df = pd.DataFrame(avg_dist.items(), columns=['tweets per day', 'percentage'])
dist_df = dist_df.astype({'tweets per day': 'int32'})
dist_df = dist_df.astype({'percentage': 'float'})
dist_df = dist_df.sort_values(by=['tweets per day'])

for tpd in current_tpds['tweets per day'].unique():
    tmp_df = current_tpds[current_tpds['tweets per day']==tpd]
    frac = float(dist_df.loc[dist_df['tweets per day']==tpd]['percentage'].iloc[0])
    if int(n_depression*frac)<len(tmp_df):
        sampled_tmp_df = tmp_df.sample(n=int(n_depression*frac), random_state=42)
    else:
        sampled_tmp_df = tmp_df
    current_samples = current_samples._append(sampled_tmp_df, ignore_index=True)

present_tpd = current_samples['tweets per day'].unique().tolist()
all_tpd = dist_df['tweets per day'].unique().tolist()
diff_tpd = list(set(present_tpd).symmetric_difference(set(all_tpd)))
dist_df = dist_df.sort_values('percentage', ascending=False) #sort percentages here

needed_tpd = []
counter = n_depression-len(current_samples['user id'].unique())
while counter > 0:
    for tpd in dist_df['tweets per day'].unique():
        if tpd in diff_tpd and counter > 0:
            tmp_df = current_tpds[current_tpds['tweets per day']==tpd]
            #make sure to not sample a user again
            sampled_tmp_df = tmp_df.sample(n=1, random_state=42)
            if current_samples['user id'].eq(sampled_tmp_df['user id'].tolist()[0]).any():
                unique = False
                stop = len(current_tpds['user id'].unique())
                while (not unique) or (stop>0):
                    sampled_tmp_df = tmp_df.sample(n=1, random_state=42)
                    if not current_samples['user id'].eq(sampled_tmp_df['user id'].tolist()[0]).any():
                        unique = True
                    stop = stop - 1

            current_samples = current_samples._append(sampled_tmp_df, ignore_index=True)
            counter = counter - 1

print(len(current_samples['user id'].unique()))
#assamble data
data = {}
labels = []
for user in dt.user_id.unique():
    tmp = dt[dt['user_id'] == user]
    tweet_list = tmp['created_at'].map(lambda datetime: int(round(datetime.timestamp()))).to_numpy()
    data.update({''+user+'': tweet_list})
    labels.append(1)

for user in current_samples['user id'].unique():
    tmp = st[st['user_id'] == user]
    tweet_list = tmp['created_at'].map(lambda datetime: int(round(datetime.timestamp()))).to_numpy()
    data.update({''+user+'': tweet_list})
    labels.append(0)

df_data = pd.DataFrame(dict([(key, pd.Series(value)) for key, value in data.items()]))
df_labels = pd.DataFrame(labels)

Unnamed: 0                                                       108
min_tpd                                                           10
max_days_bt                                                        1
min_days_active                                                   30
user_list          ['uR0011', 'uR0014', 'uR0015', 'uR0022', 'uR00...
Name: 948, dtype: object
16
587
1156


In [5]:
from sktime.forecasting.model_selection import temporal_train_test_split
from sklearn.model_selection import train_test_split

df_data = df_data.transpose()
df_data.index = range(0, len(df_data.index))
df_data.sort_index()
df_data = df_data.fillna(0)
X_train, X_test, y_train, y_test = train_test_split(df_data, df_labels, test_size=0.2)
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3216,3217,3218,3219,3220,3221,3222,3223,3224,3225
1049,1.549336e+09,1.549336e+09,1.549336e+09,1.549318e+09,1.549318e+09,1.549316e+09,1.549315e+09,1.549249e+09,1.549249e+09,1.549248e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
614,1.549476e+09,1.549396e+09,1.549244e+09,1.549243e+09,1.549154e+09,1.549153e+09,1.549153e+09,1.549147e+09,1.549087e+09,1.549087e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,1.536640e+09,1.536639e+09,1.536632e+09,1.536625e+09,1.536584e+09,1.536521e+09,1.536447e+09,1.536446e+09,1.536432e+09,1.536358e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
847,1.549396e+09,1.549394e+09,1.549394e+09,1.549393e+09,1.549392e+09,1.549389e+09,1.549389e+09,1.549389e+09,1.549382e+09,1.549381e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
430,1.527172e+09,1.527172e+09,1.527161e+09,1.527161e+09,1.527157e+09,1.527071e+09,1.527022e+09,1.527022e+09,1.527020e+09,1.527020e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617,1.549511e+09,1.549510e+09,1.549509e+09,1.549500e+09,1.549500e+09,1.549497e+09,1.549497e+09,1.549496e+09,1.549495e+09,1.549495e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
221,1.536332e+09,1.536290e+09,1.536290e+09,1.536289e+09,1.536289e+09,1.536289e+09,1.536288e+09,1.536125e+09,1.536117e+09,1.536019e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538,1.526404e+09,1.526397e+09,1.526159e+09,1.526159e+09,1.526158e+09,1.526072e+09,1.525808e+09,1.525807e+09,1.525807e+09,1.525807e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1172,1.549519e+09,1.549515e+09,1.549514e+09,1.549514e+09,1.549514e+09,1.549514e+09,1.549514e+09,1.549514e+09,1.549513e+09,1.549512e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
#GNB
from sklearn.naive_bayes import GaussianNB
from tensorboard.plugins.hparams import api as hp
import uuid

HP_VAR_SMOOTHING = hp.HParam('var_smoothing', hp.RealInterval(0.0000000001 ,0.000000002))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_user_group_948/classifier_test_GNB_full_metrics_new_split').as_default():
  hp.hparams_config(
    hparams=[HP_VAR_SMOOTHING],
    metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
             (hp.Metric(METRIC_PRECISION, display_name='precision')),
             (hp.Metric(METRIC_F1, display_name='f1')),
             (hp.Metric(METRIC_SCORE, display_name='score')),
             (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
             (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
  )

var_list = np.arange(0.0000000001,0.0000000021,0.0000000001)



for v in var_list:
            id = uuid.uuid4()
            hparams = {HP_VAR_SMOOTHING: v}
            writer = tf.summary.create_file_writer('final_log_user_group_948/classifier_test_GNB_full_metrics_new_split/'+str(id))
            with writer.as_default():
                classifier_pipe = make_pipeline(StandardScaler(), GaussianNB(var_smoothing=v))

                start = time.perf_counter()
                classifier_pipe.fit(X_train, y_train)
                end = time.perf_counter()
                train_time = end-start

                score = classifier_pipe.score(X_test, y_test.to_numpy()[:,0])

                start = time.perf_counter()
                y_pred = classifier_pipe.predict(X_test)
                end = time.perf_counter()
                test_time = end-start

                precision, recall, f1, _ =precision_recall_fscore_support(y_test.to_numpy()[:,0], y_pred, average='binary', pos_label=1)
                hp.hparams(hparams)

                tf.summary.scalar(METRIC_F1, f1, step=1)
                tf.summary.scalar(METRIC_SCORE, score, step=1)
                tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                tf.summary.scalar(METRIC_RECALL, recall, step=1)
                tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [7]:
#QDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from tensorboard.plugins.hparams import api as hp
import uuid
HP_REG_PARAM = hp.HParam('reg_param', hp.RealInterval(0.0,1.))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_user_group_948/classifier_test_QDA_full_metrics_new_split').as_default():
  hp.hparams_config(
    hparams=[HP_REG_PARAM],
    metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
             (hp.Metric(METRIC_PRECISION, display_name='precision')),
             (hp.Metric(METRIC_F1, display_name='f1')),
             (hp.Metric(METRIC_SCORE, display_name='score')),
             (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
             (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
  )

reg_list = np.arange(0.0, 1.1, 0.1)

for r in reg_list:
            id = uuid.uuid4()
            hparams = {HP_REG_PARAM: r}
            writer = tf.summary.create_file_writer('final_log_user_group_948/classifier_test_QDA_full_metrics_new_split/'+str(id))
            with writer.as_default():
                classifier_pipe = make_pipeline(StandardScaler(), QuadraticDiscriminantAnalysis(reg_param=r))

                start = time.perf_counter()
                classifier_pipe.fit(X_train, y_train)
                end = time.perf_counter()
                train_time = end-start

                score = classifier_pipe.score(X_test, y_test.to_numpy()[:,0])

                start = time.perf_counter()
                y_pred = classifier_pipe.predict(X_test)
                end = time.perf_counter()
                test_time = end-start

                precision, recall, f1, _ =precision_recall_fscore_support(y_test.to_numpy()[:,0], y_pred, average='binary', pos_label=1)
                hp.hparams(hparams)

                tf.summary.scalar(METRIC_F1, f1, step=1)
                tf.summary.scalar(METRIC_SCORE, score, step=1)
                tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                tf.summary.scalar(METRIC_RECALL, recall, step=1)
                tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)




  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [5]:
#SVM
from tensorboard.plugins.hparams import api as hp
import uuid

HP_C = hp.HParam('c_values', hp.RealInterval(0.1,5.))
HP_GAMMA = hp.HParam('gamma', hp.Discrete(['scale', 'auto']))
HP_KERNEL = hp.HParam('kernel', hp.Discrete(['linear', 'poly', 'rbf', 'sigmoid']))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_user_group_948/classifier_test_svm_full_metrics_new_split').as_default():
  hp.hparams_config(
    hparams=[HP_C,
             HP_GAMMA,
             HP_KERNEL],
    metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
             (hp.Metric(METRIC_PRECISION, display_name='precision')),
             (hp.Metric(METRIC_F1, display_name='f1')),
             (hp.Metric(METRIC_SCORE, display_name='score')),
             (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
             (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
  )


#writer = tf.summary.create_file_writer('log/classifier_test_test')

c_list = np.arange(0.1, 5.1, 0.1)
kernel_list = ['linear', 'poly', 'rbf', 'sigmoid']
gamma_list = ['scale', 'auto']


for c in c_list:
    for k in kernel_list:
        for g in gamma_list:
            id = uuid.uuid4()
            hparams = {HP_C: c, HP_KERNEL: k, HP_GAMMA: g}
            writer = tf.summary.create_file_writer('final_log_user_group_948/classifier_test_svm_full_metrics_new_split/'+str(id))
            with writer.as_default():
                classifier_pipe = make_pipeline(StandardScaler(), SVC(C=c, kernel=k, gamma=g))

                start = time.perf_counter()
                classifier_pipe.fit(X_train, y_train)
                end = time.perf_counter()
                train_time = end-start

                score = classifier_pipe.score(X_test, y_test.to_numpy()[:,0])

                start = time.perf_counter()
                y_pred = classifier_pipe.predict(X_test)
                end = time.perf_counter()
                test_time = end-start

                precision, recall, f1, _ =precision_recall_fscore_support(y_test.to_numpy()[:,0], y_pred, average='binary', pos_label=1)
                hp.hparams(hparams)

                tf.summary.scalar(METRIC_F1, f1, step=1)
                tf.summary.scalar(METRIC_SCORE, score, step=1)
                tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                tf.summary.scalar(METRIC_RECALL, recall, step=1)
                tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)



#svm

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, l

In [6]:
#knn
import sklearn
HP_NEIGHBORS = hp.HParam('neighbors', hp.RealInterval(1.,10.))
HP_WEIGHTS = hp.HParam('weights', hp.Discrete(['uniform', 'distance']))
HP_ALGO = hp.HParam('algorithm', hp.Discrete(['auto', 'ball_tree', 'kd_tree', 'brute']))
HP_LEAF = hp.HParam('leaf_size', hp.RealInterval(1.,60.))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_user_group_948/classifier_test_knn_full_metrics_new_split_cosine').as_default():
  hp.hparams_config(
    hparams=[HP_NEIGHBORS,
             HP_WEIGHTS,
             HP_ALGO,
             HP_LEAF],
    metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
             (hp.Metric(METRIC_PRECISION, display_name='precision')),
             (hp.Metric(METRIC_F1, display_name='f1')),
             (hp.Metric(METRIC_SCORE, display_name='score')),
             (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
             (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
  )

n_list = np.arange(1,10,1)
weight_list = ['uniform', 'distance']
alog_list = ['auto', 'ball_tree', 'kd_tree', 'brute']
leaf_list = np.arange(1,60,1)
#p_list = np.arange(1,2,1)


for n in n_list:
    for w in weight_list:
        for a in alog_list:
            for l in leaf_list:
                id = uuid.uuid4()
                hparams = {HP_NEIGHBORS: n, HP_WEIGHTS: w, HP_ALGO: a, HP_LEAF: l}
                writer = tf.summary.create_file_writer('final_log_user_group_948/classifier_test_knn_full_metrics_new_split_cosine/'+str(id))
                if a != 'auto' and 'cosine' not in sklearn.neighbors.VALID_METRICS[a]:
                    continue
                with writer.as_default():
                    classifier_pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=n, weights=w, algorithm=a, leaf_size=l, metric='cosine'))

                    start = time.perf_counter()
                    classifier_pipe.fit(X_train, y_train)
                    end = time.perf_counter()
                    train_time = end-start

                    score = classifier_pipe.score(X_test, y_test.to_numpy()[:,0])

                    start = time.perf_counter()
                    y_pred = classifier_pipe.predict(X_test)
                    end = time.perf_counter()
                    test_time = end-start

                    precision, recall, f1, _ =precision_recall_fscore_support(y_test.to_numpy()[:,0], y_pred, average='binary', pos_label=1)
                    hp.hparams(hparams)
                    tf.summary.scalar(METRIC_F1, f1, step=1)
                    tf.summary.scalar(METRIC_SCORE, score, step=1)
                    tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                    tf.summary.scalar(METRIC_RECALL, recall, step=1)
                    tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                    tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)


#knn


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


In [7]:
#GPC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic, ExpSineSquared, DotProduct

rbf = RBF()
matern = Matern()
rqk = RationalQuadratic()
ess = ExpSineSquared()
dpk = DotProduct()


HP_RESTARTS = hp.HParam('optimizer_restarts', hp.RealInterval(0.,5.))
HP_ITER = hp.HParam('max_iterations', hp.RealInterval(50.,150.))
HP_KERNELS = hp.HParam('kernels', hp.Discrete(['rbf', 'matern', 'rqk', 'dpk']))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_user_group_948/classifier_test_GPC_full_metrics_new_split_kernels').as_default():
  hp.hparams_config(
    hparams=[HP_RESTARTS,
             HP_ITER,
             HP_KERNELS],
    metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
             (hp.Metric(METRIC_PRECISION, display_name='precision')),
             (hp.Metric(METRIC_F1, display_name='f1')),
             (hp.Metric(METRIC_SCORE, display_name='score')),
             (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
             (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
  )

restart_list = np.arange(0,5,1)
iter_list = np.arange(50,150,5)
kernel_list = [rbf, matern, rqk, dpk]
kernel_names = ['rbf', 'matern', 'rqk', 'dpk']

for r in restart_list:
    for j in iter_list:
        for k in range(len(kernel_list)):
            id = uuid.uuid4()
            hparams = {HP_RESTARTS: r, HP_ITER: j, HP_KERNELS: kernel_names[k]}
            writer = tf.summary.create_file_writer('final_log_user_group_948/classifier_test_GPC_full_metrics_new_split_kernels/'+str(id))
            with writer.as_default():
                classifier_pipe = make_pipeline(StandardScaler(), GaussianProcessClassifier(kernel = kernel_list[k], n_restarts_optimizer=r, max_iter_predict=j, random_state=42))

                start = time.perf_counter()
                classifier_pipe.fit(X_train, y_train)
                end = time.perf_counter()
                train_time = end-start

                score = classifier_pipe.score(X_test, y_test.to_numpy()[:,0])

                start = time.perf_counter()
                y_pred = classifier_pipe.predict(X_test)
                end = time.perf_counter()
                test_time = end-start

                precision, recall, f1, _ =precision_recall_fscore_support(y_test.to_numpy()[:,0], y_pred, average='binary', pos_label=1)
                hp.hparams(hparams)
                tf.summary.scalar(METRIC_F1, f1, step=1)
                tf.summary.scalar(METRIC_SCORE, score, step=1)
                tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                tf.summary.scalar(METRIC_RECALL, recall, step=1)
                tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)



#GPC

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)

In [8]:
#DTC
from tensorboard.plugins.hparams import api as hp
import uuid
from sklearn.tree import DecisionTreeClassifier

HP_CRITERION = hp.HParam('criterion', hp.Discrete(['gini', 'entropy', 'log_loss']))
HP_SPLITTER = hp.HParam('splitter', hp.Discrete(['best', 'random']))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_user_group_948/classifier_test_DTC_full_metrics_new_split').as_default():
  hp.hparams_config(
    hparams=[HP_CRITERION,
             HP_SPLITTER],
    metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
             (hp.Metric(METRIC_PRECISION, display_name='precision')),
             (hp.Metric(METRIC_F1, display_name='f1')),
             (hp.Metric(METRIC_SCORE, display_name='score')),
             (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
             (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
  )

criterion_list = ['gini', 'entropy', 'log_loss']
splitter_list = ['best', 'random']
max_features_list = ['auto', 'sqrt', 'log2', 'None']

for c in criterion_list:
    for s in splitter_list:
        id = uuid.uuid4()
        hparams = {HP_CRITERION: c, HP_SPLITTER: s}
        writer = tf.summary.create_file_writer('final_log_user_group_948/classifier_test_DTC_full_metrics_new_split/'+str(id))
        with writer.as_default():
            classifier_pipe = make_pipeline(StandardScaler(), DecisionTreeClassifier(criterion=c, splitter=s, random_state=42))

            start = time.perf_counter()
            classifier_pipe.fit(X_train, y_train)
            end = time.perf_counter()
            train_time = end-start

            score = classifier_pipe.score(X_test, y_test.to_numpy()[:,0])

            start = time.perf_counter()
            y_pred = classifier_pipe.predict(X_test)
            end = time.perf_counter()
            test_time = end-start

            precision, recall, f1, _ =precision_recall_fscore_support(y_test.to_numpy()[:,0], y_pred, average='binary', pos_label=1)
            hp.hparams(hparams)
            tf.summary.scalar(METRIC_F1, f1, step=1)
            tf.summary.scalar(METRIC_SCORE, score, step=1)
            tf.summary.scalar(METRIC_PRECISION, precision, step=1)
            tf.summary.scalar(METRIC_RECALL, recall, step=1)
            tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
            tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)




#DTC

In [9]:
#RFC
from sklearn.ensemble import RandomForestClassifier

HP_ESTIMATORS = hp.HParam('estimators', hp.RealInterval(10.,150.))
HP_RFC_CRITERION = hp.HParam('criterion', hp.Discrete(['gini', 'entropy', 'log_loss']))
HP_RFC_MAX_FEATURES = hp.HParam('max_features', hp.Discrete(['sqrt', 'log2']))
HP_CLASS_WEIGHT = hp.HParam('class_weights', hp.Discrete(['None', 'balanced', 'balanced_subsample']))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_user_group_948/classifier_test_RFC_full_metrics_new_split').as_default():
  hp.hparams_config(
    hparams=[HP_ESTIMATORS,
             HP_RFC_CRITERION,
             HP_RFC_MAX_FEATURES,
             HP_CLASS_WEIGHT],
    metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
             (hp.Metric(METRIC_PRECISION, display_name='precision')),
             (hp.Metric(METRIC_F1, display_name='f1')),
             (hp.Metric(METRIC_SCORE, display_name='score')),
             (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
             (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
  )

estimator_list = np.arange(10,150,5)
criterion_list = ['gini', 'entropy', 'log_loss']
max_features_list = ['sqrt', 'log2']
weight_list = ['balanced', 'balanced_subsample']

for e in estimator_list:
    for c in criterion_list:
        for f in max_features_list:
            for w in weight_list:
                id = uuid.uuid4()
                hparams = {HP_ESTIMATORS: e, HP_RFC_CRITERION: c, HP_RFC_MAX_FEATURES: f, HP_CLASS_WEIGHT: w}
                writer = tf.summary.create_file_writer('final_log_user_group_948/classifier_test_RFC_full_metrics_new_split/'+str(id))
                with writer.as_default():
                    classifier_pipe = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=e, criterion=c, max_features=f, class_weight=w, random_state=42))

                    start = time.perf_counter()
                    classifier_pipe.fit(X_train, y_train)
                    end = time.perf_counter()
                    train_time = end-start

                    score = classifier_pipe.score(X_test, y_test.to_numpy()[:,0])

                    start = time.perf_counter()
                    y_pred = classifier_pipe.predict(X_test)
                    end = time.perf_counter()
                    test_time = end-start

                    precision, recall, f1, _ =precision_recall_fscore_support(y_test.to_numpy()[:,0], y_pred, average='binary', pos_label=1)
                    hp.hparams(hparams)
                    tf.summary.scalar(METRIC_F1, f1, step=1)
                    tf.summary.scalar(METRIC_SCORE, score, step=1)
                    tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                    tf.summary.scalar(METRIC_RECALL, recall, step=1)
                    tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                    tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)



#RFC

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_ste

In [10]:
#GNB
from sklearn.naive_bayes import GaussianNB

classifier_pipe = make_pipeline(StandardScaler(), GaussianNB())

start = time.perf_counter()
classifier_pipe.fit(X_train, y_train)
end = time.perf_counter()
train_time = end-start

score = classifier_pipe.score(X_test, y_test.to_numpy()[:,0])

start = time.perf_counter()
y_pred = classifier_pipe.predict(X_test)
end = time.perf_counter()
test_time = end-start

precision, recall, f1, _ =precision_recall_fscore_support(y_test.to_numpy()[:,0], y_pred, average='binary', pos_label=1)
print(score)
print(precision)
print(recall)
print(f1)
print(train_time)
print(test_time)


#GNB 1:2 ratio

  y = column_or_1d(y, warn=True)


0.5056818181818182
0.42517006802721086
0.9615384615384616
0.5896226415094339
0.19409819995053113
0.06190249999053776


In [11]:
#QDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classifier_pipe = make_pipeline(StandardScaler(), QuadraticDiscriminantAnalysis())

start = time.perf_counter()
classifier_pipe.fit(X_train, y_train)
end = time.perf_counter()
train_time = end-start

score = classifier_pipe.score(X_test, y_test.to_numpy()[:,0])

start = time.perf_counter()
y_pred = classifier_pipe.predict(X_test)
end = time.perf_counter()
test_time = end-start

precision, recall, f1, _ =precision_recall_fscore_support(y_test.to_numpy()[:,0], y_pred, average='binary', pos_label=1)
print(score)
print(precision)
print(recall)
print(f1)
print(train_time)
print(test_time)


#QDA 1:2 ratio

  y = column_or_1d(y, warn=True)


0.4943181818181818
0.4117647058823529
0.8615384615384616
0.5572139303482587
2.660447100060992
0.04880360001698136


In [16]:
#ADB
from tensorboard.plugins.hparams import api as hp
import uuid
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

clf1 = SVC(C = 4.9,gamma='auto', kernel='rbf', random_state=42)
clf2 = DecisionTreeClassifier(criterion='log_loss', splitter='random', max_features='auto', random_state=42)

HP_ADB_N_ESTIMATOR = hp.HParam('n estimator', hp.RealInterval(25.,75.))
HP_ADB_LEARNING_RATE = hp.HParam('learning_rate', hp.RealInterval(1.,5.))
HP_ADB_ALGO = hp.HParam('algorithm', hp.Discrete(['SAMME', 'SAMME.R']))
HP_ADB_ESTIMATOR = hp.HParam('estimator', hp.Discrete(['SVC', 'DTC']))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_user_group_948/classifier_test_ADB_full_metrics_new_split').as_default():
  hp.hparams_config(
    hparams=[HP_ADB_ESTIMATOR,
             HP_ADB_N_ESTIMATOR,
             HP_ADB_LEARNING_RATE,
             HP_ADB_ALGO],
    metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
             (hp.Metric(METRIC_PRECISION, display_name='precision')),
             (hp.Metric(METRIC_F1, display_name='f1')),
             (hp.Metric(METRIC_SCORE, display_name='score')),
             (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
             (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
  )

n_estimator_list = np.arange(25, 75, 5)
learning_rate_list = np.arange(1., 5., 0.2)
algo_list = ['SAMME', 'SAMME.R']
estimator_list = [clf1, clf2]
estimator_name_list = ['SVC', 'DTC']

for c in range(len(estimator_list)):
    for e in n_estimator_list:
        for l in learning_rate_list:
            for a in algo_list:
                if(estimator_name_list[c]=='SVC' and a =='SAMME.R'):
                    continue
                id = uuid.uuid4()
                hparams = {HP_ADB_ESTIMATOR: estimator_name_list[c], HP_ADB_N_ESTIMATOR: e, HP_ADB_LEARNING_RATE: l, HP_ADB_ALGO: a}
                writer = tf.summary.create_file_writer('final_log_user_group_948/classifier_test_ADB_full_metrics_new_split/'+str(id))
                with writer.as_default():
                    classifier_pipe = make_pipeline(StandardScaler(), AdaBoostClassifier(estimator=estimator_list[c] ,n_estimators=e, learning_rate=l, algorithm=a, random_state=42))

                    start = time.perf_counter()
                    classifier_pipe.fit(X_train, y_train)
                    end = time.perf_counter()
                    train_time = end-start

                    score = classifier_pipe.score(X_test, y_test.to_numpy()[:,0])

                    start = time.perf_counter()
                    y_pred = classifier_pipe.predict(X_test)
                    end = time.perf_counter()
                    test_time = end-start

                    precision, recall, f1, _ = precision_recall_fscore_support(y_test.to_numpy()[:,0], y_pred, average='binary', pos_label=1)
                    hp.hparams(hparams)
                    tf.summary.scalar(METRIC_F1, f1, step=1)
                    tf.summary.scalar(METRIC_SCORE, score, step=1)
                    tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                    tf.summary.scalar(METRIC_RECALL, recall, step=1)
                    tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                    tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)



#ADB

  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)

In [17]:
#MLP
from sklearn.neural_network import MLPClassifier
from tensorboard.plugins.hparams import api as hp
import uuid

HP_ACTIVATION = hp.HParam('activation', hp.Discrete(['identity', 'logistic', 'tanh', 'relu']))
HP_SOLVER = hp.HParam('solver', hp.Discrete(['lbfgs', 'sgd', 'adam']))
HP_ALPHA = hp.HParam('alpha', hp.RealInterval(0.0001, 1.))
HP_LEARNING_RATE = hp.HParam('learning_rate', hp.Discrete(['constant', 'invscaling', 'adaptive']))
HP_LEARNING_RATE_INIT = hp.HParam('learning_rate_init', hp.RealInterval(0.001, 1.))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_user_group_948/classifier_test_MLP_smaller_metrics_new_split').as_default():
  hp.hparams_config(
    hparams=[HP_ACTIVATION,
             HP_SOLVER,
             HP_ALPHA,
             HP_LEARNING_RATE,
             HP_LEARNING_RATE_INIT],
    metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
             (hp.Metric(METRIC_PRECISION, display_name='precision')),
             (hp.Metric(METRIC_F1, display_name='f1')),
             (hp.Metric(METRIC_SCORE, display_name='score')),
             (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
             (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
  )

activation_list = ['identity', 'logistic', 'tanh', 'relu']
solver_list = ['lbfgs', 'sgd', 'adam']
alpha_list = [0.0001, 0.0005, 0.001, 0.0015, 0.002, 0.0025, 0.005, 0.0075, 0.01, 0.015, 0.02, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75, 1]
learning_rate_list = ['constant', 'invscaling', 'adaptive']
learning_rate_init_list = [0.001, 0.0015, 0.002, 0.0025, 0.005, 0.0075, 0.01, 0.015, 0.02, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75, 1]

for a in activation_list:
    for s in solver_list:
        for al in alpha_list:
            for l in learning_rate_list:
                for r in learning_rate_init_list:
                    id = uuid.uuid4()
                    hparams = {HP_ACTIVATION: a, HP_SOLVER: s, HP_ALPHA: al, HP_LEARNING_RATE: l, HP_LEARNING_RATE_INIT: r}
                    writer = tf.summary.create_file_writer('final_log_user_group_948/classifier_test_MLP_smaller_metrics_new_split/'+str(id))
                    with writer.as_default():
                        classifier_pipe = make_pipeline(StandardScaler(), MLPClassifier(activation=a, solver=s, alpha=al, learning_rate=l, learning_rate_init=r, random_state=42))

                        start = time.perf_counter()
                        classifier_pipe.fit(X_train, y_train)
                        end = time.perf_counter()
                        train_time = end-start

                        score = classifier_pipe.score(X_test, y_test.to_numpy()[:,0])

                        start = time.perf_counter()
                        y_pred = classifier_pipe.predict(X_test)
                        end = time.perf_counter()
                        test_time = end-start

                        precision, recall, f1, _ =precision_recall_fscore_support(y_test.to_numpy()[:,0], y_pred, average='binary', pos_label=1)
                        hp.hparams(hparams)
                        tf.summary.scalar(METRIC_F1, f1, step=1)
                        tf.summary.scalar(METRIC_SCORE, score, step=1)
                        tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                        tf.summary.scalar(METRIC_RECALL, recall, step=1)
                        tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                        tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)



#MLP

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or 

ValueError: Solver produced non-finite parameter weights. The input data may contain large values and need to be preprocessed.

In [8]:
# now with n_depression *2 for double rs size
#depression tweets
dt = pd.read_csv("data/depression_tweets.tsv", sep="\t", index_col="tweet_id", parse_dates=[2])
#dt["created_at"] = pd.to_datetime(dt["created_at"])
n_depression = len(dt['user_id'].unique())*2
print(n_depression)

#sample tweets
st = pd.read_csv("data/sample_tweets.tsv", sep="\t", index_col="tweet_id", parse_dates=[2])
#st["created_at"] = pd.to_datetime(st["created_at"])

#filtered users
fu = pd.read_csv('filtered_user_lists.csv')
fu_tbt = pd.read_csv('filtered_user_lists_only_tpd_tbt.csv')
fu_maxtime = pd.read_csv('filtered_user_lists_only_tpd_maxtime.csv')
#tpd pairs
tpd_pairs = pd.read_csv('pairs_tpd_users.csv')
filtered_users = pd.concat([fu, fu_tbt, fu_maxtime], ignore_index=True)
#-------------------------------------------------------------------
j = 948
current = filtered_users.iloc[j]
print(current)
current_user_list = eval(current.user_list)

#get distribution of current filtered list
avg_dist = {}
current_tpds = tpd_pairs[tpd_pairs['user id'].isin(set(current_user_list))]
for avg in current_tpds['tweets per day'].unique():
    tmp_dist = float((current_tpds[current_tpds['tweets per day']==avg].count()/len(current_tpds)).iloc[0])
    avg_dist.update({str(avg): str(tmp_dist)})

dist_df = pd.DataFrame(avg_dist.items(), columns=['tweets per day', 'percentage'])
dist_df = dist_df.astype({'tweets per day': 'int32'})
dist_df = dist_df.astype({'percentage': 'float'})
dist_df = dist_df.sort_values(by=['tweets per day'])

#downsample current user list based on distribution
current_samples = pd.DataFrame(columns=['user id', 'tweets per day'])
for tpd in current_tpds['tweets per day'].unique():
    tmp_df = current_tpds[current_tpds['tweets per day']==tpd]
    frac = float(dist_df.loc[dist_df['tweets per day']==tpd]['percentage'].iloc[0])
    if int(n_depression*frac)<len(tmp_df):
        sampled_tmp_df = tmp_df.sample(n=int(n_depression*frac), random_state=42)
    else:
        sampled_tmp_df = tmp_df
    current_samples = current_samples._append(sampled_tmp_df, ignore_index=True)

present_tpd = current_samples['tweets per day'].unique().tolist()
all_tpd = dist_df['tweets per day'].unique().tolist()
diff_tpd = list(set(present_tpd).symmetric_difference(set(all_tpd)))
dist_df = dist_df.sort_values('percentage', ascending=False) #sort percentages here

#fill up remaining spots with random users from rs cohort not yet present in sampled users starting with most occuring tpd class in filtered user list (all tpd not already in sampled list)
needed_tpd = []
counter = n_depression-len(current_samples['user id'].unique())
if counter > 0:
    for tpd in dist_df['tweets per day'].unique():
        if tpd in diff_tpd and counter > 0:
            tmp_df = current_tpds[current_tpds['tweets per day']==tpd]
            #make sure to not sample a user again
            sampled_tmp_df = tmp_df.sample(n=1, random_state=42)
            if current_samples['user id'].eq(sampled_tmp_df['user id'].tolist()[0]).any():
                unique = False
                stop = len(current_tpds['user id'].unique())
                while (not unique) or (stop>0):
                    sampled_tmp_df = tmp_df.sample(n=1, random_state=42)
                    if not current_samples['user id'].eq(sampled_tmp_df['user id'].tolist()[0]).any():
                        unique = True
                    stop = stop - 1

            current_samples = current_samples._append(sampled_tmp_df, ignore_index=True)
            counter = counter - 1
print(counter)
print(len(current_samples['user id'].unique()))
#if still space left, fill again, this time not caring if tpd is already in final list with samples from all users in rs cohort, with disregard to filtered users

current_user_list = list(set(st.user_id.unique()).symmetric_difference(set(current_samples['user id'].unique())))

avg_dist = {}
current_tpds = tpd_pairs[tpd_pairs['user id'].isin(set(current_user_list))]
for avg in current_tpds['tweets per day'].unique():
    tmp_dist = float((current_tpds[current_tpds['tweets per day']==avg].count()/len(current_tpds)).iloc[0])
    avg_dist.update({str(avg): str(tmp_dist)})

dist_df = pd.DataFrame(avg_dist.items(), columns=['tweets per day', 'percentage'])
dist_df = dist_df.astype({'tweets per day': 'int32'})
dist_df = dist_df.astype({'percentage': 'float'})
dist_df = dist_df.sort_values(by=['tweets per day'])

for tpd in current_tpds['tweets per day'].unique():
    tmp_df = current_tpds[current_tpds['tweets per day']==tpd]
    frac = float(dist_df.loc[dist_df['tweets per day']==tpd]['percentage'].iloc[0])
    if int(n_depression*frac)<len(tmp_df):
        sampled_tmp_df = tmp_df.sample(n=int(n_depression*frac), random_state=42)
    else:
        sampled_tmp_df = tmp_df
    current_samples = current_samples._append(sampled_tmp_df, ignore_index=True)

present_tpd = current_samples['tweets per day'].unique().tolist()
all_tpd = dist_df['tweets per day'].unique().tolist()
diff_tpd = list(set(present_tpd).symmetric_difference(set(all_tpd)))
dist_df = dist_df.sort_values('percentage', ascending=False) #sort percentages here

needed_tpd = []
counter = n_depression-len(current_samples['user id'].unique())
while counter > 0:
    for tpd in dist_df['tweets per day'].unique():
        if tpd in diff_tpd and counter > 0:
            tmp_df = current_tpds[current_tpds['tweets per day']==tpd]
            #make sure to not sample a user again
            sampled_tmp_df = tmp_df.sample(n=1, random_state=42)
            if current_samples['user id'].eq(sampled_tmp_df['user id'].tolist()[0]).any():
                unique = False
                stop = len(current_tpds['user id'].unique())
                while (not unique) or (stop>0):
                    sampled_tmp_df = tmp_df.sample(n=1, random_state=42)
                    if not current_samples['user id'].eq(sampled_tmp_df['user id'].tolist()[0]).any():
                        unique = True
                    stop = stop - 1

            current_samples = current_samples._append(sampled_tmp_df, ignore_index=True)
            counter = counter - 1

print(len(current_samples['user id'].unique()))
#assamble data
data = {}
labels = []
for user in dt.user_id.unique():
    tmp = dt[dt['user_id'] == user]
    tweet_list = tmp['created_at'].map(lambda datetime: int(round(datetime.timestamp()))).to_numpy()
    data.update({''+user+'': tweet_list})
    labels.append(1)

for user in current_samples['user id'].unique():
    tmp = st[st['user_id'] == user]
    tweet_list = tmp['created_at'].map(lambda datetime: int(round(datetime.timestamp()))).to_numpy()
    data.update({''+user+'': tweet_list})
    labels.append(0)

df_data = pd.DataFrame(dict([(key, pd.Series(value)) for key, value in data.items()]))
df_labels = pd.DataFrame(labels)
#----------------------------------------------------
from sklearn.model_selection import train_test_split

df_data = df_data.transpose()
df_data.index = range(0, len(df_data.index))
df_data.sort_index()
df_data = df_data.fillna(0)
X_train, X_test, y_train, y_test = train_test_split(df_data, df_labels, test_size=0.2)
#---------------------------------------------------------------------------------

1206
Unnamed: 0                                                       108
min_tpd                                                           10
max_days_bt                                                        1
min_days_active                                                   30
user_list          ['uR0011', 'uR0014', 'uR0015', 'uR0022', 'uR00...
Name: 948, dtype: object
96
1110
2273


In [11]:
#GNB
from sklearn.naive_bayes import GaussianNB
from tensorboard.plugins.hparams import api as hp
import uuid

HP_VAR_SMOOTHING = hp.HParam('var_smoothing', hp.RealInterval(0.0000000001 ,0.000000002))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_doubleRS_user_group_948/classifier_test_GNB_full_metrics_new_split').as_default():
  hp.hparams_config(
    hparams=[HP_VAR_SMOOTHING],
    metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
             (hp.Metric(METRIC_PRECISION, display_name='precision')),
             (hp.Metric(METRIC_F1, display_name='f1')),
             (hp.Metric(METRIC_SCORE, display_name='score')),
             (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
             (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
  )

var_list = np.arange(0.0000000001,0.0000000021,0.0000000001)



for v in var_list:
            id = uuid.uuid4()
            hparams = {HP_VAR_SMOOTHING: v}
            writer = tf.summary.create_file_writer('final_log_doubleRS_user_group_948/classifier_test_GNB_full_metrics_new_split/'+str(id))
            with writer.as_default():
                classifier_pipe = make_pipeline(StandardScaler(), GaussianNB(var_smoothing=v))

                start = time.perf_counter()
                classifier_pipe.fit(X_train, y_train)
                end = time.perf_counter()
                train_time = end-start

                score = classifier_pipe.score(X_test, y_test.to_numpy()[:,0])

                start = time.perf_counter()
                y_pred = classifier_pipe.predict(X_test)
                end = time.perf_counter()
                test_time = end-start

                precision, recall, f1, _ =precision_recall_fscore_support(y_test.to_numpy()[:,0], y_pred, average='binary', pos_label=1)
                hp.hparams(hparams)

                tf.summary.scalar(METRIC_F1, f1, step=1)
                tf.summary.scalar(METRIC_SCORE, score, step=1)
                tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                tf.summary.scalar(METRIC_RECALL, recall, step=1)
                tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [12]:
#QDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from tensorboard.plugins.hparams import api as hp
import uuid
HP_REG_PARAM = hp.HParam('reg_param', hp.RealInterval(0.0,1.))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_doubleRS_user_group_948/classifier_test_QDA_full_metrics_new_split').as_default():
  hp.hparams_config(
    hparams=[HP_REG_PARAM],
    metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
             (hp.Metric(METRIC_PRECISION, display_name='precision')),
             (hp.Metric(METRIC_F1, display_name='f1')),
             (hp.Metric(METRIC_SCORE, display_name='score')),
             (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
             (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
  )

reg_list = np.arange(0.0, 1.1, 0.1)

for r in reg_list:
            id = uuid.uuid4()
            hparams = {HP_REG_PARAM: r}
            writer = tf.summary.create_file_writer('final_log_doubleRS_user_group_948/classifier_test_QDA_full_metrics_new_split/'+str(id))
            with writer.as_default():
                classifier_pipe = make_pipeline(StandardScaler(), QuadraticDiscriminantAnalysis(reg_param=r))

                start = time.perf_counter()
                classifier_pipe.fit(X_train, y_train)
                end = time.perf_counter()
                train_time = end-start

                score = classifier_pipe.score(X_test, y_test.to_numpy()[:,0])

                start = time.perf_counter()
                y_pred = classifier_pipe.predict(X_test)
                end = time.perf_counter()
                test_time = end-start

                precision, recall, f1, _ =precision_recall_fscore_support(y_test.to_numpy()[:,0], y_pred, average='binary', pos_label=1)
                hp.hparams(hparams)

                tf.summary.scalar(METRIC_F1, f1, step=1)
                tf.summary.scalar(METRIC_SCORE, score, step=1)
                tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                tf.summary.scalar(METRIC_RECALL, recall, step=1)
                tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)




  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [18]:
# now with n_depression *2 for double rs size
#depression tweets
dt = pd.read_csv("data/depression_tweets.tsv", sep="\t", index_col="tweet_id", parse_dates=[2])
#dt["created_at"] = pd.to_datetime(dt["created_at"])
n_depression = len(dt['user_id'].unique())*2
print(n_depression)

#sample tweets
st = pd.read_csv("data/sample_tweets.tsv", sep="\t", index_col="tweet_id", parse_dates=[2])
#st["created_at"] = pd.to_datetime(st["created_at"])

#filtered users
fu = pd.read_csv('filtered_user_lists.csv')
fu_tbt = pd.read_csv('filtered_user_lists_only_tpd_tbt.csv')
fu_maxtime = pd.read_csv('filtered_user_lists_only_tpd_maxtime.csv')
#tpd pairs
tpd_pairs = pd.read_csv('pairs_tpd_users.csv')
filtered_users = pd.concat([fu, fu_tbt, fu_maxtime], ignore_index=True)
#-------------------------------------------------------------------
j = 948
current = filtered_users.iloc[j]
print(current)
current_user_list = eval(current.user_list)

#get distribution of current filtered list
avg_dist = {}
current_tpds = tpd_pairs[tpd_pairs['user id'].isin(set(current_user_list))]
for avg in current_tpds['tweets per day'].unique():
    tmp_dist = float((current_tpds[current_tpds['tweets per day']==avg].count()/len(current_tpds)).iloc[0])
    avg_dist.update({str(avg): str(tmp_dist)})

dist_df = pd.DataFrame(avg_dist.items(), columns=['tweets per day', 'percentage'])
dist_df = dist_df.astype({'tweets per day': 'int32'})
dist_df = dist_df.astype({'percentage': 'float'})
dist_df = dist_df.sort_values(by=['tweets per day'])

#downsample current user list based on distribution
current_samples = pd.DataFrame(columns=['user id', 'tweets per day'])
for tpd in current_tpds['tweets per day'].unique():
    tmp_df = current_tpds[current_tpds['tweets per day']==tpd]
    frac = float(dist_df.loc[dist_df['tweets per day']==tpd]['percentage'].iloc[0])
    if int(n_depression*frac)<len(tmp_df):
        sampled_tmp_df = tmp_df.sample(n=int(n_depression*frac), random_state=42)
    else:
        sampled_tmp_df = tmp_df
    current_samples = current_samples._append(sampled_tmp_df, ignore_index=True)

present_tpd = current_samples['tweets per day'].unique().tolist()
all_tpd = dist_df['tweets per day'].unique().tolist()
diff_tpd = list(set(present_tpd).symmetric_difference(set(all_tpd)))
dist_df = dist_df.sort_values('percentage', ascending=False) #sort percentages here

#fill up remaining spots with random users from rs cohort not yet present in sampled users starting with most occuring tpd class in filtered user list (all tpd not already in sampled list)
needed_tpd = []
counter = n_depression-len(current_samples['user id'].unique())
if counter > 0:
    for tpd in dist_df['tweets per day'].unique():
        if tpd in diff_tpd and counter > 0:
            tmp_df = current_tpds[current_tpds['tweets per day']==tpd]
            #make sure to not sample a user again
            sampled_tmp_df = tmp_df.sample(n=1, random_state=42)
            if current_samples['user id'].eq(sampled_tmp_df['user id'].tolist()[0]).any():
                unique = False
                stop = len(current_tpds['user id'].unique())
                while (not unique) or (stop>0):
                    sampled_tmp_df = tmp_df.sample(n=1, random_state=42)
                    if not current_samples['user id'].eq(sampled_tmp_df['user id'].tolist()[0]).any():
                        unique = True
                    stop = stop - 1

            current_samples = current_samples._append(sampled_tmp_df, ignore_index=True)
            counter = counter - 1
print(counter)
print(len(current_samples['user id'].unique()))
#if still space left, fill again, this time not caring if tpd is already in final list with samples from all users in rs cohort, with disregard to filtered users

current_user_list = list(set(st.user_id.unique()).symmetric_difference(set(current_samples['user id'].unique())))

avg_dist = {}
current_tpds = tpd_pairs[tpd_pairs['user id'].isin(set(current_user_list))]
for avg in current_tpds['tweets per day'].unique():
    tmp_dist = float((current_tpds[current_tpds['tweets per day']==avg].count()/len(current_tpds)).iloc[0])
    avg_dist.update({str(avg): str(tmp_dist)})

dist_df = pd.DataFrame(avg_dist.items(), columns=['tweets per day', 'percentage'])
dist_df = dist_df.astype({'tweets per day': 'int32'})
dist_df = dist_df.astype({'percentage': 'float'})
dist_df = dist_df.sort_values(by=['tweets per day'])

for tpd in current_tpds['tweets per day'].unique():
    tmp_df = current_tpds[current_tpds['tweets per day']==tpd]
    frac = float(dist_df.loc[dist_df['tweets per day']==tpd]['percentage'].iloc[0])
    if int(n_depression*frac)<len(tmp_df):
        sampled_tmp_df = tmp_df.sample(n=int(n_depression*frac), random_state=42)
    else:
        sampled_tmp_df = tmp_df
    current_samples = current_samples._append(sampled_tmp_df, ignore_index=True)

present_tpd = current_samples['tweets per day'].unique().tolist()
all_tpd = dist_df['tweets per day'].unique().tolist()
diff_tpd = list(set(present_tpd).symmetric_difference(set(all_tpd)))
dist_df = dist_df.sort_values('percentage', ascending=False) #sort percentages here

needed_tpd = []
counter = n_depression-len(current_samples['user id'].unique())
while counter > 0:
    for tpd in dist_df['tweets per day'].unique():
        if tpd in diff_tpd and counter > 0:
            tmp_df = current_tpds[current_tpds['tweets per day']==tpd]
            #make sure to not sample a user again
            sampled_tmp_df = tmp_df.sample(n=1, random_state=42)
            if current_samples['user id'].eq(sampled_tmp_df['user id'].tolist()[0]).any():
                unique = False
                stop = len(current_tpds['user id'].unique())
                while (not unique) or (stop>0):
                    sampled_tmp_df = tmp_df.sample(n=1, random_state=42)
                    if not current_samples['user id'].eq(sampled_tmp_df['user id'].tolist()[0]).any():
                        unique = True
                    stop = stop - 1

            current_samples = current_samples._append(sampled_tmp_df, ignore_index=True)
            counter = counter - 1

print(len(current_samples['user id'].unique()))
#assamble data
data = {}
labels = []
for user in dt.user_id.unique():
    tmp = dt[dt['user_id'] == user]
    tweet_list = tmp['created_at'].map(lambda datetime: int(round(datetime.timestamp()))).to_numpy()
    data.update({''+user+'': tweet_list})
    labels.append(1)

for user in current_samples['user id'].unique():
    tmp = st[st['user_id'] == user]
    tweet_list = tmp['created_at'].map(lambda datetime: int(round(datetime.timestamp()))).to_numpy()
    data.update({''+user+'': tweet_list})
    labels.append(0)

df_data = pd.DataFrame(dict([(key, pd.Series(value)) for key, value in data.items()]))
df_labels = pd.DataFrame(labels)
#----------------------------------------------------
from sklearn.model_selection import train_test_split

df_data = df_data.transpose()
df_data.index = range(0, len(df_data.index))
df_data.sort_index()
df_data = df_data.fillna(0)
X_train, X_test, y_train, y_test = train_test_split(df_data, df_labels, test_size=0.2)
#---------------------------------------------------------------------------------
#SVM
from tensorboard.plugins.hparams import api as hp
import uuid

HP_C = hp.HParam('c_values', hp.RealInterval(0.1, 5.))
HP_GAMMA = hp.HParam('gamma', hp.Discrete(['scale', 'auto']))
HP_KERNEL = hp.HParam('kernel', hp.Discrete(['linear', 'poly', 'rbf', 'sigmoid']))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_doubleRS_user_group_948/classifier_test_svm_full_metrics_new_split').as_default():
    hp.hparams_config(
        hparams=[HP_C,
                 HP_GAMMA,
                 HP_KERNEL],
        metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
                 (hp.Metric(METRIC_PRECISION, display_name='precision')),
                 (hp.Metric(METRIC_F1, display_name='f1')),
                 (hp.Metric(METRIC_SCORE, display_name='score')),
                 (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
                 (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
    )

#writer = tf.summary.create_file_writer('log/classifier_test_test')

c_list = np.arange(0.1, 5.1, 0.1)
kernel_list = ['linear', 'poly', 'rbf', 'sigmoid']
gamma_list = ['scale', 'auto']

for c in c_list:
    for k in kernel_list:
        for g in gamma_list:
            id = uuid.uuid4()
            hparams = {HP_C: c, HP_KERNEL: k, HP_GAMMA: g}
            writer = tf.summary.create_file_writer('final_log_doubleRS_user_group_948/classifier_test_svm_full_metrics_new_split/' + str(id))
            with writer.as_default():
                classifier_pipe = make_pipeline(StandardScaler(), SVC(C=c, kernel=k, gamma=g))

                start = time.perf_counter()
                classifier_pipe.fit(X_train, y_train)
                end = time.perf_counter()
                train_time = end - start

                score = classifier_pipe.score(X_test, y_test.to_numpy()[:, 0])

                start = time.perf_counter()
                y_pred = classifier_pipe.predict(X_test)
                end = time.perf_counter()
                test_time = end - start

                precision, recall, f1, _ = precision_recall_fscore_support(y_test.to_numpy()[:, 0], y_pred,
                                                                           average='binary', pos_label=1)
                hp.hparams(hparams)

                tf.summary.scalar(METRIC_F1, f1, step=1)
                tf.summary.scalar(METRIC_SCORE, score, step=1)
                tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                tf.summary.scalar(METRIC_RECALL, recall, step=1)
                tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)

#svm
#knn
import sklearn

HP_NEIGHBORS = hp.HParam('neighbors', hp.RealInterval(1., 10.))
HP_WEIGHTS = hp.HParam('weights', hp.Discrete(['uniform', 'distance']))
HP_ALGO = hp.HParam('algorithm', hp.Discrete(['auto', 'ball_tree', 'kd_tree', 'brute']))
HP_LEAF = hp.HParam('leaf_size', hp.RealInterval(1., 60.))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_doubleRS_user_group_948/classifier_test_knn_full_metrics_new_split_cosine').as_default():
    hp.hparams_config(
        hparams=[HP_NEIGHBORS,
                 HP_WEIGHTS,
                 HP_ALGO,
                 HP_LEAF],
        metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
                 (hp.Metric(METRIC_PRECISION, display_name='precision')),
                 (hp.Metric(METRIC_F1, display_name='f1')),
                 (hp.Metric(METRIC_SCORE, display_name='score')),
                 (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
                 (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
    )

n_list = np.arange(1, 10, 1)
weight_list = ['uniform', 'distance']
alog_list = ['auto', 'ball_tree', 'kd_tree', 'brute']
leaf_list = np.arange(1, 60, 1)
#p_list = np.arange(1,2,1)


for n in n_list:
    for w in weight_list:
        for a in alog_list:
            for l in leaf_list:
                id = uuid.uuid4()
                hparams = {HP_NEIGHBORS: n, HP_WEIGHTS: w, HP_ALGO: a, HP_LEAF: l}
                writer = tf.summary.create_file_writer(
                    'final_log_doubleRS_user_group_948/classifier_test_knn_full_metrics_new_split_cosine/' + str(id))
                if a != 'auto' and 'cosine' not in sklearn.neighbors.VALID_METRICS[a]:
                    continue
                with writer.as_default():
                    classifier_pipe = make_pipeline(StandardScaler(),
                                                    KNeighborsClassifier(n_neighbors=n, weights=w, algorithm=a,
                                                                         leaf_size=l, metric='cosine'))

                    start = time.perf_counter()
                    classifier_pipe.fit(X_train, y_train)
                    end = time.perf_counter()
                    train_time = end - start

                    score = classifier_pipe.score(X_test, y_test.to_numpy()[:, 0])

                    start = time.perf_counter()
                    y_pred = classifier_pipe.predict(X_test)
                    end = time.perf_counter()
                    test_time = end - start

                    precision, recall, f1, _ = precision_recall_fscore_support(y_test.to_numpy()[:, 0], y_pred,
                                                                               average='binary', pos_label=1)
                    hp.hparams(hparams)
                    tf.summary.scalar(METRIC_F1, f1, step=1)
                    tf.summary.scalar(METRIC_SCORE, score, step=1)
                    tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                    tf.summary.scalar(METRIC_RECALL, recall, step=1)
                    tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                    tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)

#knn

#GPC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic, ExpSineSquared, DotProduct

rbf = RBF()
matern = Matern()
rqk = RationalQuadratic()
ess = ExpSineSquared()
dpk = DotProduct()

HP_RESTARTS = hp.HParam('optimizer_restarts', hp.RealInterval(0., 5.))
HP_ITER = hp.HParam('max_iterations', hp.RealInterval(50., 150.))
HP_KERNELS = hp.HParam('kernels', hp.Discrete(['rbf', 'matern', 'rqk', 'dpk']))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_doubleRS_user_group_948/classifier_test_GPC_full_metrics_new_split_kernels').as_default():
    hp.hparams_config(
        hparams=[HP_RESTARTS,
                 HP_ITER,
                 HP_KERNELS],
        metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
                 (hp.Metric(METRIC_PRECISION, display_name='precision')),
                 (hp.Metric(METRIC_F1, display_name='f1')),
                 (hp.Metric(METRIC_SCORE, display_name='score')),
                 (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
                 (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
    )

restart_list = np.arange(0, 5, 1)
iter_list = np.arange(50, 150, 5)
kernel_list = [rbf, matern, rqk, dpk]
kernel_names = ['rbf', 'matern', 'rqk', 'dpk']

for r in restart_list:
    for j in iter_list:
        for k in range(len(kernel_list)):
            id = uuid.uuid4()
            hparams = {HP_RESTARTS: r, HP_ITER: j, HP_KERNELS: kernel_names[k]}
            writer = tf.summary.create_file_writer(
                'final_log_doubleRS_user_group_948/classifier_test_GPC_full_metrics_new_split_kernels/' + str(id))
            with writer.as_default():
                classifier_pipe = make_pipeline(StandardScaler(),
                                                GaussianProcessClassifier(kernel=kernel_list[k], n_restarts_optimizer=r,
                                                                          max_iter_predict=j, random_state=42))

                start = time.perf_counter()
                classifier_pipe.fit(X_train, y_train)
                end = time.perf_counter()
                train_time = end - start

                score = classifier_pipe.score(X_test, y_test.to_numpy()[:, 0])

                start = time.perf_counter()
                y_pred = classifier_pipe.predict(X_test)
                end = time.perf_counter()
                test_time = end - start

                precision, recall, f1, _ = precision_recall_fscore_support(y_test.to_numpy()[:, 0], y_pred,
                                                                           average='binary', pos_label=1)
                hp.hparams(hparams)
                tf.summary.scalar(METRIC_F1, f1, step=1)
                tf.summary.scalar(METRIC_SCORE, score, step=1)
                tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                tf.summary.scalar(METRIC_RECALL, recall, step=1)
                tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)

#GPC
#DTC
from tensorboard.plugins.hparams import api as hp
import uuid
from sklearn.tree import DecisionTreeClassifier

HP_CRITERION = hp.HParam('criterion', hp.Discrete(['gini', 'entropy', 'log_loss']))
HP_SPLITTER = hp.HParam('splitter', hp.Discrete(['best', 'random']))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_doubleRS_user_group_948/classifier_test_DTC_full_metrics_new_split').as_default():
    hp.hparams_config(
        hparams=[HP_CRITERION,
                 HP_SPLITTER],
        metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
                 (hp.Metric(METRIC_PRECISION, display_name='precision')),
                 (hp.Metric(METRIC_F1, display_name='f1')),
                 (hp.Metric(METRIC_SCORE, display_name='score')),
                 (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
                 (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
    )

criterion_list = ['gini', 'entropy', 'log_loss']
splitter_list = ['best', 'random']
max_features_list = ['auto', 'sqrt', 'log2', 'None']

for c in criterion_list:
    for s in splitter_list:
        id = uuid.uuid4()
        hparams = {HP_CRITERION: c, HP_SPLITTER: s}
        writer = tf.summary.create_file_writer('final_log_doubleRS_user_group_948/classifier_test_DTC_full_metrics_new_split/' + str(id))
        with writer.as_default():
            classifier_pipe = make_pipeline(StandardScaler(),
                                            DecisionTreeClassifier(criterion=c, splitter=s, random_state=42))

            start = time.perf_counter()
            classifier_pipe.fit(X_train, y_train)
            end = time.perf_counter()
            train_time = end - start

            score = classifier_pipe.score(X_test, y_test.to_numpy()[:, 0])

            start = time.perf_counter()
            y_pred = classifier_pipe.predict(X_test)
            end = time.perf_counter()
            test_time = end - start

            precision, recall, f1, _ = precision_recall_fscore_support(y_test.to_numpy()[:, 0], y_pred,
                                                                       average='binary', pos_label=1)
            hp.hparams(hparams)
            tf.summary.scalar(METRIC_F1, f1, step=1)
            tf.summary.scalar(METRIC_SCORE, score, step=1)
            tf.summary.scalar(METRIC_PRECISION, precision, step=1)
            tf.summary.scalar(METRIC_RECALL, recall, step=1)
            tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
            tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)

#DTC
#RFC
from sklearn.ensemble import RandomForestClassifier

HP_ESTIMATORS = hp.HParam('estimators', hp.RealInterval(10., 150.))
HP_RFC_CRITERION = hp.HParam('criterion', hp.Discrete(['gini', 'entropy', 'log_loss']))
HP_RFC_MAX_FEATURES = hp.HParam('max_features', hp.Discrete(['sqrt', 'log2']))
HP_CLASS_WEIGHT = hp.HParam('class_weights', hp.Discrete(['None', 'balanced', 'balanced_subsample']))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_doubleRS_user_group_948/classifier_test_RFC_full_metrics_new_split').as_default():
    hp.hparams_config(
        hparams=[HP_ESTIMATORS,
                 HP_RFC_CRITERION,
                 HP_RFC_MAX_FEATURES,
                 HP_CLASS_WEIGHT],
        metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
                 (hp.Metric(METRIC_PRECISION, display_name='precision')),
                 (hp.Metric(METRIC_F1, display_name='f1')),
                 (hp.Metric(METRIC_SCORE, display_name='score')),
                 (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
                 (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
    )

estimator_list = np.arange(10, 150, 5)
criterion_list = ['gini', 'entropy', 'log_loss']
max_features_list = ['sqrt', 'log2']
weight_list = ['balanced', 'balanced_subsample']

for e in estimator_list:
    for c in criterion_list:
        for f in max_features_list:
            for w in weight_list:
                id = uuid.uuid4()
                hparams = {HP_ESTIMATORS: e, HP_RFC_CRITERION: c, HP_RFC_MAX_FEATURES: f, HP_CLASS_WEIGHT: w}
                writer = tf.summary.create_file_writer(
                    'final_log_doubleRS_user_group_948/classifier_test_RFC_full_metrics_new_split/' + str(id))
                with writer.as_default():
                    classifier_pipe = make_pipeline(StandardScaler(),
                                                    RandomForestClassifier(n_estimators=e, criterion=c, max_features=f,
                                                                           class_weight=w, random_state=42))

                    start = time.perf_counter()
                    classifier_pipe.fit(X_train, y_train)
                    end = time.perf_counter()
                    train_time = end - start

                    score = classifier_pipe.score(X_test, y_test.to_numpy()[:, 0])

                    start = time.perf_counter()
                    y_pred = classifier_pipe.predict(X_test)
                    end = time.perf_counter()
                    test_time = end - start

                    precision, recall, f1, _ = precision_recall_fscore_support(y_test.to_numpy()[:, 0], y_pred,
                                                                               average='binary', pos_label=1)
                    hp.hparams(hparams)
                    tf.summary.scalar(METRIC_F1, f1, step=1)
                    tf.summary.scalar(METRIC_SCORE, score, step=1)
                    tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                    tf.summary.scalar(METRIC_RECALL, recall, step=1)
                    tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                    tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)

#RFC
#ADB
from sklearn.ensemble import AdaBoostClassifier

clf1 = SVC(C=4.9, gamma='auto', kernel='rbf', random_state=42)
clf2 = DecisionTreeClassifier(criterion='log_loss', splitter='random', max_features='auto', random_state=42)

HP_ADB_N_ESTIMATOR = hp.HParam('n estimator', hp.RealInterval(25., 75.))
HP_ADB_LEARNING_RATE = hp.HParam('learning_rate', hp.RealInterval(1., 5.))
HP_ADB_ALGO = hp.HParam('algorithm', hp.Discrete(['SAMME', 'SAMME.R']))
HP_ADB_ESTIMATOR = hp.HParam('estimator', hp.Discrete(['SVC', 'DTC']))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_doubleRS_user_group_948/classifier_test_ADB_full_metrics_new_split').as_default():
    hp.hparams_config(
        hparams=[HP_ADB_ESTIMATOR,
                 HP_ADB_N_ESTIMATOR,
                 HP_ADB_LEARNING_RATE,
                 HP_ADB_ALGO],
        metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
                 (hp.Metric(METRIC_PRECISION, display_name='precision')),
                 (hp.Metric(METRIC_F1, display_name='f1')),
                 (hp.Metric(METRIC_SCORE, display_name='score')),
                 (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
                 (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
    )

n_estimator_list = np.arange(25, 75, 5)
learning_rate_list = np.arange(1., 5., 0.2)
algo_list = ['SAMME', 'SAMME.R']
estimator_list = [clf1, clf2]
estimator_name_list = ['SVC', 'DTC']

for c in range(len(estimator_list)):
    for e in n_estimator_list:
        for l in learning_rate_list:
            for a in algo_list:
                if (estimator_name_list[c] == 'SVC' and a == 'SAMME.R'):
                    continue
                id = uuid.uuid4()
                hparams = {HP_ADB_ESTIMATOR: estimator_name_list[c], HP_ADB_N_ESTIMATOR: e, HP_ADB_LEARNING_RATE: l,
                           HP_ADB_ALGO: a}
                writer = tf.summary.create_file_writer(
                    'final_log_doubleRS_user_group_948/classifier_test_ADB_full_metrics_new_split/' + str(id))
                with writer.as_default():
                    classifier_pipe = make_pipeline(StandardScaler(),
                                                    AdaBoostClassifier(estimator=estimator_list[c], n_estimators=e,
                                                                       learning_rate=l, algorithm=a, random_state=42))

                    start = time.perf_counter()
                    classifier_pipe.fit(X_train, y_train)
                    end = time.perf_counter()
                    train_time = end - start

                    score = classifier_pipe.score(X_test, y_test.to_numpy()[:, 0])

                    start = time.perf_counter()
                    y_pred = classifier_pipe.predict(X_test)
                    end = time.perf_counter()
                    test_time = end - start

                    precision, recall, f1, _ = precision_recall_fscore_support(y_test.to_numpy()[:, 0], y_pred,
                                                                               average='binary', pos_label=1)
                    hp.hparams(hparams)
                    tf.summary.scalar(METRIC_F1, f1, step=1)
                    tf.summary.scalar(METRIC_SCORE, score, step=1)
                    tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                    tf.summary.scalar(METRIC_RECALL, recall, step=1)
                    tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                    tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)

#ADB
#MLP
from sklearn.neural_network import MLPClassifier
from tensorboard.plugins.hparams import api as hp
import uuid

HP_ACTIVATION = hp.HParam('activation', hp.Discrete(['identity', 'logistic', 'tanh', 'relu']))
HP_SOLVER = hp.HParam('solver', hp.Discrete(['lbfgs', 'sgd', 'adam']))
HP_ALPHA = hp.HParam('alpha', hp.RealInterval(0.0001, 1.))
HP_LEARNING_RATE = hp.HParam('learning_rate', hp.Discrete(['constant', 'invscaling', 'adaptive']))
HP_LEARNING_RATE_INIT = hp.HParam('learning_rate_init', hp.RealInterval(0.001, 1.))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_doubleRS_user_group_948/classifier_test_MLP_smaller_metrics_new_split').as_default():
    hp.hparams_config(
        hparams=[HP_ACTIVATION,
                 HP_SOLVER,
                 HP_ALPHA,
                 HP_LEARNING_RATE,
                 HP_LEARNING_RATE_INIT],
        metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
                 (hp.Metric(METRIC_PRECISION, display_name='precision')),
                 (hp.Metric(METRIC_F1, display_name='f1')),
                 (hp.Metric(METRIC_SCORE, display_name='score')),
                 (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
                 (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
    )

activation_list = ['identity', 'logistic', 'tanh', 'relu']
solver_list = ['lbfgs', 'sgd', 'adam']
alpha_list = [0.0001, 0.0005, 0.001, 0.0015, 0.002, 0.0025, 0.005, 0.0075, 0.01, 0.015, 0.02, 0.025, 0.05, 0.075, 0.1,
              0.15, 0.2, 0.25, 0.5, 0.75, 1]
learning_rate_list = ['constant', 'invscaling', 'adaptive']
learning_rate_init_list = [0.001, 0.0015, 0.002, 0.0025, 0.005, 0.0075, 0.01, 0.015, 0.02, 0.025, 0.05, 0.075, 0.1,
                           0.15, 0.2, 0.25, 0.5, 0.75, 1]

for a in activation_list:
    for s in solver_list:
        for al in alpha_list:
            for l in learning_rate_list:
                for r in learning_rate_init_list:
                    id = uuid.uuid4()
                    hparams = {HP_ACTIVATION: a, HP_SOLVER: s, HP_ALPHA: al, HP_LEARNING_RATE: l,
                               HP_LEARNING_RATE_INIT: r}
                    writer = tf.summary.create_file_writer(
                        'final_log_doubleRS_user_group_948/classifier_test_MLP_smaller_metrics_new_split/' + str(id))
                    with writer.as_default():
                        classifier_pipe = make_pipeline(StandardScaler(),
                                                        MLPClassifier(activation=a, solver=s, alpha=al, learning_rate=l,
                                                                      learning_rate_init=r, random_state=42))

                        start = time.perf_counter()
                        classifier_pipe.fit(X_train, y_train)
                        end = time.perf_counter()
                        train_time = end - start

                        score = classifier_pipe.score(X_test, y_test.to_numpy()[:, 0])

                        start = time.perf_counter()
                        y_pred = classifier_pipe.predict(X_test)
                        end = time.perf_counter()
                        test_time = end - start

                        precision, recall, f1, _ = precision_recall_fscore_support(y_test.to_numpy()[:, 0], y_pred,
                                                                                   average='binary', pos_label=1)
                        hp.hparams(hparams)
                        tf.summary.scalar(METRIC_F1, f1, step=1)
                        tf.summary.scalar(METRIC_SCORE, score, step=1)
                        tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                        tf.summary.scalar(METRIC_RECALL, recall, step=1)
                        tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                        tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)

#MLP

1206
Unnamed: 0                                                       108
min_tpd                                                           10
max_days_bt                                                        1
min_days_active                                                   30
user_list          ['uR0011', 'uR0014', 'uR0015', 'uR0022', 'uR00...
Name: 948, dtype: object
96
1110
2273


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, l

ValueError: Solver produced non-finite parameter weights. The input data may contain large values and need to be preprocessed.

In [6]:
# no downsampling
#depression tweets
from tensorboard.plugins.hparams import api as hp
import uuid
dt = pd.read_csv("data/depression_tweets.tsv", sep="\t", index_col="tweet_id", parse_dates=[2])
#dt["created_at"] = pd.to_datetime(dt["created_at"])
n_depression = len(dt['user_id'].unique())*2
print(n_depression)

#sample tweets
st = pd.read_csv("data/sample_tweets.tsv", sep="\t", index_col="tweet_id", parse_dates=[2])
#st["created_at"] = pd.to_datetime(st["created_at"])


#assamble data
data = {}
labels = []
for user in dt.user_id.unique():
    tmp = dt[dt['user_id'] == user]
    tweet_list = tmp['created_at'].map(lambda datetime: int(round(datetime.timestamp()))).to_numpy()
    data.update({''+user+'': tweet_list})
    labels.append(1)

for user in st['user_id'].unique():
    tmp = st[st['user_id'] == user]
    tweet_list = tmp['created_at'].map(lambda datetime: int(round(datetime.timestamp()))).to_numpy()
    data.update({''+user+'': tweet_list})
    labels.append(0)

df_data = pd.DataFrame(dict([(key, pd.Series(value)) for key, value in data.items()]))
df_labels = pd.DataFrame(labels)
#----------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

df_data = df_data.transpose()
df_data.index = range(0, len(df_data.index))
df_data.sort_index()
df_data = df_data.fillna(0)
X_train, X_test, y_train, y_test = train_test_split(df_data, df_labels, test_size=0.2)


#---------------------------------------------------------------------------------
df_labels.shape

1206


(7952, 1)

In [15]:
#GNB
from sklearn.naive_bayes import GaussianNB
from tensorboard.plugins.hparams import api as hp
import uuid

HP_VAR_SMOOTHING = hp.HParam('var_smoothing', hp.RealInterval(0.0000000001 ,0.000000002))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_fullRS/classifier_test_GNB_full_metrics_new_split').as_default():
  hp.hparams_config(
    hparams=[HP_VAR_SMOOTHING],
    metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
             (hp.Metric(METRIC_PRECISION, display_name='precision')),
             (hp.Metric(METRIC_F1, display_name='f1')),
             (hp.Metric(METRIC_SCORE, display_name='score')),
             (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
             (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
  )

var_list = np.arange(0.0000000001,0.0000000021,0.0000000001)



for v in var_list:
            id = uuid.uuid4()
            hparams = {HP_VAR_SMOOTHING: v}
            writer = tf.summary.create_file_writer('final_log_fullRS/classifier_test_GNB_full_metrics_new_split/'+str(id))
            with writer.as_default():
                classifier_pipe = make_pipeline(StandardScaler(), GaussianNB(var_smoothing=v))

                start = time.perf_counter()
                classifier_pipe.fit(X_train, y_train)
                end = time.perf_counter()
                train_time = end-start

                score = classifier_pipe.score(X_test, y_test.to_numpy()[:,0])

                start = time.perf_counter()
                y_pred = classifier_pipe.predict(X_test)
                end = time.perf_counter()
                test_time = end-start

                precision, recall, f1, _ =precision_recall_fscore_support(y_test.to_numpy()[:,0], y_pred, average='binary', pos_label=1)
                hp.hparams(hparams)

                tf.summary.scalar(METRIC_F1, f1, step=1)
                tf.summary.scalar(METRIC_SCORE, score, step=1)
                tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                tf.summary.scalar(METRIC_RECALL, recall, step=1)
                tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [14]:
#QDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from tensorboard.plugins.hparams import api as hp
import uuid
HP_REG_PARAM = hp.HParam('reg_param', hp.RealInterval(0.0,1.))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_fullRS/classifier_test_QDA_full_metrics_new_split').as_default():
  hp.hparams_config(
    hparams=[HP_REG_PARAM],
    metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
             (hp.Metric(METRIC_PRECISION, display_name='precision')),
             (hp.Metric(METRIC_F1, display_name='f1')),
             (hp.Metric(METRIC_SCORE, display_name='score')),
             (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
             (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
  )

reg_list = np.arange(0.0, 1.1, 0.1)

for r in reg_list:
            id = uuid.uuid4()
            hparams = {HP_REG_PARAM: r}
            writer = tf.summary.create_file_writer('final_log_fullRS/classifier_test_QDA_full_metrics_new_split/'+str(id))
            with writer.as_default():
                classifier_pipe = make_pipeline(StandardScaler(), QuadraticDiscriminantAnalysis(reg_param=r))

                start = time.perf_counter()
                classifier_pipe.fit(X_train, y_train)
                end = time.perf_counter()
                train_time = end-start

                score = classifier_pipe.score(X_test, y_test.to_numpy()[:,0])

                start = time.perf_counter()
                y_pred = classifier_pipe.predict(X_test)
                end = time.perf_counter()
                test_time = end-start

                precision, recall, f1, _ =precision_recall_fscore_support(y_test.to_numpy()[:,0], y_pred, average='binary', pos_label=1)
                hp.hparams(hparams)

                tf.summary.scalar(METRIC_F1, f1, step=1)
                tf.summary.scalar(METRIC_SCORE, score, step=1)
                tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                tf.summary.scalar(METRIC_RECALL, recall, step=1)
                tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)




  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [None]:
# no downsampling
#depression tweets
from tensorboard.plugins.hparams import api as hp
import uuid
dt = pd.read_csv("data/depression_tweets.tsv", sep="\t", index_col="tweet_id", parse_dates=[2])
#dt["created_at"] = pd.to_datetime(dt["created_at"])
n_depression = len(dt['user_id'].unique())*2
print(n_depression)

#sample tweets
st = pd.read_csv("data/sample_tweets.tsv", sep="\t", index_col="tweet_id", parse_dates=[2])
#st["created_at"] = pd.to_datetime(st["created_at"])


#assamble data
data = {}
labels = []
for user in dt.user_id.unique():
    tmp = dt[dt['user_id'] == user]
    tweet_list = tmp['created_at'].map(lambda datetime: int(round(datetime.timestamp()))).to_numpy()
    data.update({''+user+'': tweet_list})
    labels.append(1)

for user in st['user_id'].unique():
    tmp = st[st['user_id'] == user]
    tweet_list = tmp['created_at'].map(lambda datetime: int(round(datetime.timestamp()))).to_numpy()
    data.update({''+user+'': tweet_list})
    labels.append(0)

df_data = pd.DataFrame(dict([(key, pd.Series(value)) for key, value in data.items()]))
df_labels = pd.DataFrame(labels)
#----------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

df_data = df_data.transpose()
df_data.index = range(0, len(df_data.index))
df_data.sort_index()
df_data = df_data.fillna(0)
X_train, X_test, y_train, y_test = train_test_split(df_data, df_labels, test_size=0.2)


#---------------------------------------------------------------------------------
#SVM
from tensorboard.plugins.hparams import api as hp
import uuid

HP_C = hp.HParam('c_values', hp.RealInterval(0.1, 5.))
HP_GAMMA = hp.HParam('gamma', hp.Discrete(['scale', 'auto']))
HP_KERNEL = hp.HParam('kernel', hp.Discrete(['linear', 'poly', 'rbf', 'sigmoid']))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_fullRS/classifier_test_svm_full_metrics_new_split').as_default():
    hp.hparams_config(
        hparams=[HP_C,
                 HP_GAMMA,
                 HP_KERNEL],
        metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
                 (hp.Metric(METRIC_PRECISION, display_name='precision')),
                 (hp.Metric(METRIC_F1, display_name='f1')),
                 (hp.Metric(METRIC_SCORE, display_name='score')),
                 (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
                 (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
    )

#writer = tf.summary.create_file_writer('log/classifier_test_test')

c_list = np.arange(0.1, 5.1, 0.1)
kernel_list = ['linear', 'poly', 'rbf', 'sigmoid']
gamma_list = ['scale', 'auto']

for c in c_list:
    for k in kernel_list:
        for g in gamma_list:
            id = uuid.uuid4()
            hparams = {HP_C: c, HP_KERNEL: k, HP_GAMMA: g}
            writer = tf.summary.create_file_writer('final_log_fullRS/classifier_test_svm_full_metrics_new_split/' + str(id))
            with writer.as_default():
                classifier_pipe = make_pipeline(StandardScaler(), SVC(C=c, kernel=k, gamma=g))

                start = time.perf_counter()
                classifier_pipe.fit(X_train, y_train)
                end = time.perf_counter()
                train_time = end - start

                score = classifier_pipe.score(X_test, y_test.to_numpy()[:, 0])

                start = time.perf_counter()
                y_pred = classifier_pipe.predict(X_test)
                end = time.perf_counter()
                test_time = end - start

                precision, recall, f1, _ = precision_recall_fscore_support(y_test.to_numpy()[:, 0], y_pred,
                                                                           average='binary', pos_label=1)
                hp.hparams(hparams)

                tf.summary.scalar(METRIC_F1, f1, step=1)
                tf.summary.scalar(METRIC_SCORE, score, step=1)
                tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                tf.summary.scalar(METRIC_RECALL, recall, step=1)
                tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)

#svm
#knn
import sklearn

HP_NEIGHBORS = hp.HParam('neighbors', hp.RealInterval(1., 10.))
HP_WEIGHTS = hp.HParam('weights', hp.Discrete(['uniform', 'distance']))
HP_ALGO = hp.HParam('algorithm', hp.Discrete(['auto', 'ball_tree', 'kd_tree', 'brute']))
HP_LEAF = hp.HParam('leaf_size', hp.RealInterval(1., 60.))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_fullRS/classifier_test_knn_full_metrics_new_split_cosine').as_default():
    hp.hparams_config(
        hparams=[HP_NEIGHBORS,
                 HP_WEIGHTS,
                 HP_ALGO,
                 HP_LEAF],
        metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
                 (hp.Metric(METRIC_PRECISION, display_name='precision')),
                 (hp.Metric(METRIC_F1, display_name='f1')),
                 (hp.Metric(METRIC_SCORE, display_name='score')),
                 (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
                 (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
    )

n_list = np.arange(1, 10, 1)
weight_list = ['uniform', 'distance']
alog_list = ['auto', 'ball_tree', 'kd_tree', 'brute']
leaf_list = np.arange(1, 60, 1)
#p_list = np.arange(1,2,1)


for n in n_list:
    for w in weight_list:
        for a in alog_list:
            for l in leaf_list:
                id = uuid.uuid4()
                hparams = {HP_NEIGHBORS: n, HP_WEIGHTS: w, HP_ALGO: a, HP_LEAF: l}
                writer = tf.summary.create_file_writer(
                    'final_log_fullRS/classifier_test_knn_full_metrics_new_split_cosine/' + str(id))
                if a != 'auto' and 'cosine' not in sklearn.neighbors.VALID_METRICS[a]:
                    continue
                with writer.as_default():
                    classifier_pipe = make_pipeline(StandardScaler(),
                                                    KNeighborsClassifier(n_neighbors=n, weights=w, algorithm=a,
                                                                         leaf_size=l, metric='cosine'))

                    start = time.perf_counter()
                    classifier_pipe.fit(X_train, y_train)
                    end = time.perf_counter()
                    train_time = end - start

                    score = classifier_pipe.score(X_test, y_test.to_numpy()[:, 0])

                    start = time.perf_counter()
                    y_pred = classifier_pipe.predict(X_test)
                    end = time.perf_counter()
                    test_time = end - start

                    precision, recall, f1, _ = precision_recall_fscore_support(y_test.to_numpy()[:, 0], y_pred,
                                                                               average='binary', pos_label=1)
                    hp.hparams(hparams)
                    tf.summary.scalar(METRIC_F1, f1, step=1)
                    tf.summary.scalar(METRIC_SCORE, score, step=1)
                    tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                    tf.summary.scalar(METRIC_RECALL, recall, step=1)
                    tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                    tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)

#knn

#GPC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic, ExpSineSquared, DotProduct
from tensorboard.plugins.hparams import api as hp
import uuid

rbf = RBF()
matern = Matern()
rqk = RationalQuadratic()
ess = ExpSineSquared()
dpk = DotProduct()

HP_RESTARTS = hp.HParam('optimizer_restarts', hp.RealInterval(0., 5.))
HP_ITER = hp.HParam('max_iterations', hp.RealInterval(50., 150.))
HP_KERNELS = hp.HParam('kernels', hp.Discrete(['rbf', 'matern', 'rqk', 'dpk']))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_fullRS/classifier_test_GPC_full_metrics_new_split_kernels_4').as_default():
    hp.hparams_config(
        hparams=[HP_RESTARTS,
                 HP_ITER,
                 HP_KERNELS],
        metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
                 (hp.Metric(METRIC_PRECISION, display_name='precision')),
                 (hp.Metric(METRIC_F1, display_name='f1')),
                 (hp.Metric(METRIC_SCORE, display_name='score')),
                 (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
                 (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
    )

restart_list = np.arange(3, 5, 1)
iter_list = np.arange(50, 150, 5)
kernel_list = [rbf, matern, rqk, dpk]
kernel_names = ['rbf', 'matern', 'rqk', 'dpk']

for r in restart_list:
    for j in iter_list:
        for k in range(len(kernel_list)):
            id = uuid.uuid4()
            hparams = {HP_RESTARTS: r, HP_ITER: j, HP_KERNELS: kernel_names[k]}
            writer = tf.summary.create_file_writer(
                'final_log_fullRS/classifier_test_GPC_full_metrics_new_split_kernels_4/' + str(id))
            with writer.as_default():
                classifier_pipe = make_pipeline(StandardScaler(),
                                                GaussianProcessClassifier(kernel=kernel_list[k], n_restarts_optimizer=r,
                                                                          max_iter_predict=j, random_state=42))

                start = time.perf_counter()
                classifier_pipe.fit(X_train, y_train)
                end = time.perf_counter()
                train_time = end - start

                score = classifier_pipe.score(X_test, y_test.to_numpy()[:, 0])

                start = time.perf_counter()
                y_pred = classifier_pipe.predict(X_test)
                end = time.perf_counter()
                test_time = end - start

                precision, recall, f1, _ = precision_recall_fscore_support(y_test.to_numpy()[:, 0], y_pred,
                                                                           average='binary', pos_label=1)
                hp.hparams(hparams)
                tf.summary.scalar(METRIC_F1, f1, step=1)
                tf.summary.scalar(METRIC_SCORE, score, step=1)
                tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                tf.summary.scalar(METRIC_RECALL, recall, step=1)
                tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)

#GPC
#DTC
from tensorboard.plugins.hparams import api as hp
import uuid
from sklearn.tree import DecisionTreeClassifier

HP_CRITERION = hp.HParam('criterion', hp.Discrete(['gini', 'entropy', 'log_loss']))
HP_SPLITTER = hp.HParam('splitter', hp.Discrete(['best', 'random']))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_fullRS/classifier_test_DTC_full_metrics_new_split').as_default():
    hp.hparams_config(
        hparams=[HP_CRITERION,
                 HP_SPLITTER],
        metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
                 (hp.Metric(METRIC_PRECISION, display_name='precision')),
                 (hp.Metric(METRIC_F1, display_name='f1')),
                 (hp.Metric(METRIC_SCORE, display_name='score')),
                 (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
                 (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
    )

criterion_list = ['gini', 'entropy', 'log_loss']
splitter_list = ['best', 'random']
max_features_list = ['auto', 'sqrt', 'log2', 'None']

for c in criterion_list:
    for s in splitter_list:
        id = uuid.uuid4()
        hparams = {HP_CRITERION: c, HP_SPLITTER: s}
        writer = tf.summary.create_file_writer('final_log_fullRS/classifier_test_DTC_full_metrics_new_split/' + str(id))
        with writer.as_default():
            classifier_pipe = make_pipeline(StandardScaler(),
                                            DecisionTreeClassifier(criterion=c, splitter=s, random_state=42))

            start = time.perf_counter()
            classifier_pipe.fit(X_train, y_train)
            end = time.perf_counter()
            train_time = end - start

            score = classifier_pipe.score(X_test, y_test.to_numpy()[:, 0])

            start = time.perf_counter()
            y_pred = classifier_pipe.predict(X_test)
            end = time.perf_counter()
            test_time = end - start

            precision, recall, f1, _ = precision_recall_fscore_support(y_test.to_numpy()[:, 0], y_pred,
                                                                       average='binary', pos_label=1)
            hp.hparams(hparams)
            tf.summary.scalar(METRIC_F1, f1, step=1)
            tf.summary.scalar(METRIC_SCORE, score, step=1)
            tf.summary.scalar(METRIC_PRECISION, precision, step=1)
            tf.summary.scalar(METRIC_RECALL, recall, step=1)
            tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
            tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)

#DTC
#RFC
from sklearn.ensemble import RandomForestClassifier

HP_ESTIMATORS = hp.HParam('estimators', hp.RealInterval(10., 150.))
HP_RFC_CRITERION = hp.HParam('criterion', hp.Discrete(['gini', 'entropy', 'log_loss']))
HP_RFC_MAX_FEATURES = hp.HParam('max_features', hp.Discrete(['sqrt', 'log2']))
HP_CLASS_WEIGHT = hp.HParam('class_weights', hp.Discrete(['None', 'balanced', 'balanced_subsample']))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_fullRS/classifier_test_RFC_full_metrics_new_split').as_default():
    hp.hparams_config(
        hparams=[HP_ESTIMATORS,
                 HP_RFC_CRITERION,
                 HP_RFC_MAX_FEATURES,
                 HP_CLASS_WEIGHT],
        metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
                 (hp.Metric(METRIC_PRECISION, display_name='precision')),
                 (hp.Metric(METRIC_F1, display_name='f1')),
                 (hp.Metric(METRIC_SCORE, display_name='score')),
                 (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
                 (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
    )

estimator_list = np.arange(10, 150, 5)
criterion_list = ['gini', 'entropy', 'log_loss']
max_features_list = ['sqrt', 'log2']
weight_list = ['balanced', 'balanced_subsample']

for e in estimator_list:
    for c in criterion_list:
        for f in max_features_list:
            for w in weight_list:
                id = uuid.uuid4()
                hparams = {HP_ESTIMATORS: e, HP_RFC_CRITERION: c, HP_RFC_MAX_FEATURES: f, HP_CLASS_WEIGHT: w}
                writer = tf.summary.create_file_writer(
                    'final_log_fullRS/classifier_test_RFC_full_metrics_new_split/' + str(id))
                with writer.as_default():
                    classifier_pipe = make_pipeline(StandardScaler(),
                                                    RandomForestClassifier(n_estimators=e, criterion=c, max_features=f,
                                                                           class_weight=w, random_state=42))

                    start = time.perf_counter()
                    classifier_pipe.fit(X_train, y_train)
                    end = time.perf_counter()
                    train_time = end - start

                    score = classifier_pipe.score(X_test, y_test.to_numpy()[:, 0])

                    start = time.perf_counter()
                    y_pred = classifier_pipe.predict(X_test)
                    end = time.perf_counter()
                    test_time = end - start

                    precision, recall, f1, _ = precision_recall_fscore_support(y_test.to_numpy()[:, 0], y_pred,
                                                                               average='binary', pos_label=1)
                    hp.hparams(hparams)
                    tf.summary.scalar(METRIC_F1, f1, step=1)
                    tf.summary.scalar(METRIC_SCORE, score, step=1)
                    tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                    tf.summary.scalar(METRIC_RECALL, recall, step=1)
                    tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                    tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)

#RFC
#GNB
from sklearn.naive_bayes import GaussianNB

classifier_pipe = make_pipeline(StandardScaler(), GaussianNB())

start = time.perf_counter()
classifier_pipe.fit(X_train, y_train)
end = time.perf_counter()
train_time = end - start

score = classifier_pipe.score(X_test, y_test.to_numpy()[:, 0])

start = time.perf_counter()
y_pred = classifier_pipe.predict(X_test)
end = time.perf_counter()
test_time = end - start

precision, recall, f1, _ = precision_recall_fscore_support(y_test.to_numpy()[:, 0], y_pred, average='binary',
                                                           pos_label=1)
print('GNB')
print(score)
print(precision)
print(recall)
print(f1)
print(train_time)
print(test_time)
print('------------')

#GNB
#QDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classifier_pipe = make_pipeline(StandardScaler(), QuadraticDiscriminantAnalysis())

start = time.perf_counter()
classifier_pipe.fit(X_train, y_train)
end = time.perf_counter()
train_time = end - start

score = classifier_pipe.score(X_test, y_test.to_numpy()[:, 0])

start = time.perf_counter()
y_pred = classifier_pipe.predict(X_test)
end = time.perf_counter()
test_time = end - start

precision, recall, f1, _ = precision_recall_fscore_support(y_test.to_numpy()[:, 0], y_pred, average='binary',
                                                           pos_label=1)
print('QDA')
print(score)
print(precision)
print(recall)
print(f1)
print(train_time)
print(test_time)
print('------------')

#QDA
#ADB
from sklearn.ensemble import AdaBoostClassifier
from tensorboard.plugins.hparams import api as hp
from sklearn.tree import DecisionTreeClassifier
import uuid


clf1 = SVC(C=4.9, gamma='auto', kernel='rbf', random_state=42)
clf2 = DecisionTreeClassifier(criterion='log_loss', splitter='random', max_features='auto', random_state=42)

HP_ADB_N_ESTIMATOR = hp.HParam('n estimator', hp.RealInterval(25., 75.))
HP_ADB_LEARNING_RATE = hp.HParam('learning_rate', hp.RealInterval(1., 5.))
HP_ADB_ALGO = hp.HParam('algorithm', hp.Discrete(['SAMME', 'SAMME.R']))
HP_ADB_ESTIMATOR = hp.HParam('estimator', hp.Discrete(['SVC', 'DTC']))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_fullRS/classifier_test_ADB_full_metrics_new_split').as_default():
    hp.hparams_config(
        hparams=[HP_ADB_ESTIMATOR,
                 HP_ADB_N_ESTIMATOR,
                 HP_ADB_LEARNING_RATE,
                 HP_ADB_ALGO],
        metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
                 (hp.Metric(METRIC_PRECISION, display_name='precision')),
                 (hp.Metric(METRIC_F1, display_name='f1')),
                 (hp.Metric(METRIC_SCORE, display_name='score')),
                 (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
                 (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
    )

n_estimator_list = np.arange(25, 75, 5)
learning_rate_list = np.arange(1., 5., 0.2)
algo_list = ['SAMME', 'SAMME.R']
estimator_list = [clf1, clf2]
estimator_name_list = ['SVC', 'DTC']

for c in range(len(estimator_list)):
    for e in n_estimator_list:
        for l in learning_rate_list:
            for a in algo_list:
                if (estimator_name_list[c] == 'SVC' and a == 'SAMME.R'):
                    continue
                id = uuid.uuid4()
                hparams = {HP_ADB_ESTIMATOR: estimator_name_list[c], HP_ADB_N_ESTIMATOR: e, HP_ADB_LEARNING_RATE: l,
                           HP_ADB_ALGO: a}
                writer = tf.summary.create_file_writer(
                    'final_log_fullRS/classifier_test_ADB_full_metrics_new_split/' + str(id))
                with writer.as_default():
                    classifier_pipe = make_pipeline(StandardScaler(),
                                                    AdaBoostClassifier(estimator=estimator_list[c], n_estimators=e,
                                                                       learning_rate=l, algorithm=a, random_state=42))

                    start = time.perf_counter()
                    classifier_pipe.fit(X_train, y_train)
                    end = time.perf_counter()
                    train_time = end - start

                    score = classifier_pipe.score(X_test, y_test.to_numpy()[:, 0])

                    start = time.perf_counter()
                    y_pred = classifier_pipe.predict(X_test)
                    end = time.perf_counter()
                    test_time = end - start

                    precision, recall, f1, _ = precision_recall_fscore_support(y_test.to_numpy()[:, 0], y_pred,
                                                                               average='binary', pos_label=1)
                    hp.hparams(hparams)
                    tf.summary.scalar(METRIC_F1, f1, step=1)
                    tf.summary.scalar(METRIC_SCORE, score, step=1)
                    tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                    tf.summary.scalar(METRIC_RECALL, recall, step=1)
                    tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                    tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)

#ADB
#MLP
from sklearn.neural_network import MLPClassifier
from tensorboard.plugins.hparams import api as hp
import uuid

HP_ACTIVATION = hp.HParam('activation', hp.Discrete(['identity', 'logistic', 'tanh', 'relu']))
HP_SOLVER = hp.HParam('solver', hp.Discrete(['lbfgs', 'sgd', 'adam']))
HP_ALPHA = hp.HParam('alpha', hp.RealInterval(0.0001, 1.))
HP_LEARNING_RATE = hp.HParam('learning_rate', hp.Discrete(['constant', 'invscaling', 'adaptive']))
HP_LEARNING_RATE_INIT = hp.HParam('learning_rate_init', hp.RealInterval(0.001, 1.))

METRIC_RECALL = 'RECALL'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_SCORE = 'score'
METRIC_TRAIN_TIME = 'train time'
METRIC_TEST_TIME = 'test time'

with tf.summary.create_file_writer('final_log_fullRS/classifier_test_MLP_smaller_metrics_new_split').as_default():
    hp.hparams_config(
        hparams=[HP_ACTIVATION,
                 HP_SOLVER,
                 HP_ALPHA,
                 HP_LEARNING_RATE,
                 HP_LEARNING_RATE_INIT],
        metrics=[hp.Metric(METRIC_RECALL, display_name='RECALL'),
                 (hp.Metric(METRIC_PRECISION, display_name='precision')),
                 (hp.Metric(METRIC_F1, display_name='f1')),
                 (hp.Metric(METRIC_SCORE, display_name='score')),
                 (hp.Metric(METRIC_TRAIN_TIME, display_name='train time')),
                 (hp.Metric(METRIC_TEST_TIME, display_name='test time'))],
    )

activation_list = ['identity', 'logistic', 'tanh', 'relu']
solver_list = ['lbfgs', 'sgd', 'adam']
alpha_list = [0.0001, 0.0005, 0.001, 0.0015, 0.002, 0.0025, 0.005, 0.0075, 0.01, 0.015, 0.02, 0.025, 0.05, 0.075, 0.1,
              0.15, 0.2, 0.25, 0.5, 0.75, 1]
learning_rate_list = ['constant', 'invscaling', 'adaptive']
learning_rate_init_list = [0.001, 0.0015, 0.002, 0.0025, 0.005, 0.0075, 0.01, 0.015, 0.02, 0.025, 0.05, 0.075, 0.1,
                           0.15, 0.2, 0.25, 0.5, 0.75, 1]

for a in activation_list:
    for s in solver_list:
        for al in alpha_list:
            for l in learning_rate_list:
                for r in learning_rate_init_list:
                    id = uuid.uuid4()
                    hparams = {HP_ACTIVATION: a, HP_SOLVER: s, HP_ALPHA: al, HP_LEARNING_RATE: l,
                               HP_LEARNING_RATE_INIT: r}
                    writer = tf.summary.create_file_writer(
                        'final_log_fullRS/classifier_test_MLP_smaller_metrics_new_split/' + str(id))
                    with writer.as_default():
                        classifier_pipe = make_pipeline(StandardScaler(),
                                                        MLPClassifier(activation=a, solver=s, alpha=al, learning_rate=l,
                                                                      learning_rate_init=r, random_state=42))

                        start = time.perf_counter()
                        classifier_pipe.fit(X_train, y_train)
                        end = time.perf_counter()
                        train_time = end - start

                        score = classifier_pipe.score(X_test, y_test.to_numpy()[:, 0])

                        start = time.perf_counter()
                        y_pred = classifier_pipe.predict(X_test)
                        end = time.perf_counter()
                        test_time = end - start

                        precision, recall, f1, _ = precision_recall_fscore_support(y_test.to_numpy()[:, 0], y_pred,
                                                                                   average='binary', pos_label=1)
                        hp.hparams(hparams)
                        tf.summary.scalar(METRIC_F1, f1, step=1)
                        tf.summary.scalar(METRIC_SCORE, score, step=1)
                        tf.summary.scalar(METRIC_PRECISION, precision, step=1)
                        tf.summary.scalar(METRIC_RECALL, recall, step=1)
                        tf.summary.scalar(METRIC_TRAIN_TIME, train_time, step=1)
                        tf.summary.scalar(METRIC_TEST_TIME, test_time, step=1)

MLP

1206


  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.or