# Pair Matching
Trying to do down/upsampling by finding close matching pairs between individuals in the depression and sample cohort, meaning the filter criteria are a close match


In [1]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.model_selection
import sktime.datatypes
from sklearn.linear_model import RidgeClassifierCV
from sklearn.pipeline import make_pipeline

from sktime.datasets import load_arrow_head  # univariate dataset
from sktime.datasets import load_basic_motions  # multivariate dataset
from sktime.transformations.panel.rocket import Rocket
from sktime.forecasting.model_selection import temporal_train_test_split
from sklearn import preprocessing
from tqdm import tqdm

In [2]:
#depression tweets
#dt = pd.read_csv("data/depression_tweets.tsv", sep="\t", index_col="tweet_id", parse_dates=[2])
#dt["created_at"] = pd.to_datetime(dt["created_at"])
#n_depression = len(dt['user_id'].unique())
#print(n_depression)

#sample tweets
st = pd.read_csv("data/sample_tweets.tsv", sep="\t", index_col="tweet_id", parse_dates=[2])
#st["created_at"] = pd.to_datetime(st["created_at"])

In [3]:
average_pairs = {}
dt = pd.read_csv("data/depression_tweets.tsv", sep="\t", index_col="tweet_id", parse_dates=[2])
dt.set_index('created_at', inplace=True)
for user in dt.user_id.unique():
    tmp = dt[dt['user_id'] == user]
    tweets_per_day = tmp.groupby(tmp.index.date).count()
    n_days = len(tweets_per_day.index.unique())
    average_pairs.update({''+user+'': ''+str(round(tweets_per_day.user_id.sum(axis=0)/n_days))+''})

pairs_df = pd.DataFrame(average_pairs.items(), columns=['user id', 'tweets per day'])
pairs_df

Unnamed: 0,user id,tweets per day
0,uD0000,3
1,uD0004,6
2,uD0006,3
3,uD0007,7
4,uD0009,5
...,...,...
598,uD1021,4
599,uD1022,4
600,uD1029,6
601,uD1030,4


In [4]:
dt = pd.read_csv("data/depression_tweets.tsv", sep="\t", index_col="tweet_id", parse_dates=[2])
dt.loc[:, 'created_at'] = pd.to_datetime(dt.loc[:, 'created_at'])
dt_users = dt['user_id'].unique()

#get filter values for each individual in depression cohort
dt_stats = []
for j in tqdm(range(0, len(dt_users))):
    current_user = dt_users[j]
    tmp_df = dt[dt['user_id'] == current_user]

    #tweets per day
    current_tpd = pairs_df[pairs_df['user id'] == current_user].iloc[0,1]

    #time between tweets + total time
    start = tmp_df['created_at'].min()
    end = tmp_df['created_at'].max()
    current_tt = (end-start).days
    current_tbt = current_tt/len(tmp_df)

    dt_stats.append({'user': current_user, 'tpd': current_tpd, 'tbt': current_tbt, 'total_time': current_tt})

df_dt_stats = pd.DataFrame(dt_stats)
df_dt_stats.to_csv('dt_stats.csv')

100%|██████████| 603/603 [00:23<00:00, 25.67it/s]


In [5]:
st_pairs = pd.read_csv('pairs_tpd_users.csv')
st.loc[:, 'created_at'] = pd.to_datetime(st.loc[:, 'created_at'])
st_users = st['user_id'].unique()

#get filter values for each individual in depression cohort
st_stats = []
for j in tqdm(range(0, len(st_users))):
    current_user = st_users[j]
    tmp_df = st[st['user_id'] == current_user]

    #tweets per day
    current_tpd = st_pairs[st_pairs['user id'] == current_user].iloc[0,2]

    #time between tweets + total time
    start = tmp_df['created_at'].min()
    end = tmp_df['created_at'].max()
    current_tt = (end-start).days
    current_tbt = current_tt/len(tmp_df)

    st_stats.append({'user': current_user, 'tpd': current_tpd, 'tbt': current_tbt, 'total_time': current_tt})

df_st_stats = pd.DataFrame(st_stats)
df_st_stats.to_csv('.\\tmp_folder\st_stats.csv')

100%|██████████| 7349/7349 [41:21<00:00,  2.96it/s]


OSError: Cannot save file into a non-existent directory: 'tmp_folder'

In [None]:
# finding the closest pairs

#df_dt_stats = pd.read_csv('dt_stats.csv')
#df_st_stats = pd.read_csv('st_stats.csv')
matches = []
used_dt = []

for j in tqdm(range(len(df_dt_stats))):
    current = df_dt_stats.iloc[j]
    current_user = current['user']
    current_tpd = int(current['tpd'])
    current_tbt = int(current['tbt'])
    current_tt = int(current['total_time'])

    closest_user = ''
    distance = 1000000
    for k in range(len(df_st_stats)):
        st_current = df_st_stats.iloc[k]
        st_user = st_current['user']
        st_tpd = int(st_current['tpd'])
        st_tbt = int(st_current['tbt'])
        st_tt = int(st_current['total_time'])

        #calculate distance
        tmp_dis = abs(current_tpd-st_tpd) + abs(current_tbt-st_tbt) + abs(current_tt-st_tt)
        if tmp_dis < distance and st_user not in used_dt:
            distance = tmp_dis
            closest_user = st_user
            used_dt.append(st_user)

    matches.append({'user 1': current_user, 'user 2': closest_user})

df_pair_matching = pd.DataFrame(matches)
df_pair_matching.to_csv('.\\tmp_folder\pair_matching.csv')

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

#df_pair_matching = pd.read_csv('pair_matching.csv')

dt_users = df_pair_matching['user 1'].tolist()
st_users = df_pair_matching['user 2'].tolist()

data = {}
labels = []
for user in dt_users:
    tmp = dt[dt['user_id'] == user]
    tweet_list = tmp['created_at'].map(lambda datetime: int(round(datetime.timestamp()))).to_numpy()
    data.update({''+user+'': tweet_list})
    labels.append(1)

for user in st_users:
    tmp = st[st['user_id'] == user]
    tweet_list = tmp['created_at'].map(lambda datetime: int(round(datetime.timestamp()))).to_numpy()
    data.update({''+user+'': tweet_list})
    labels.append(0)

df_data = pd.DataFrame(dict([(key, pd.Series(value)) for key, value in data.items()]))
df_labels = pd.DataFrame(labels)

df_data = df_data.transpose()
#df_data = df_data.drop(['Unnamed: 0'])
#df_labels = df_labels.drop(columns=['Unnamed: 0'])
df_data.index = range(0, len(df_data.index))
df_data.sort_index()
df_data = df_data.fillna(0)
y_train, y_test, X_train, X_test = temporal_train_test_split(df_labels,df_data, test_size=0.2)

#run ml and safe result with filters
#rocket = Rocket()
#rocket.fit(X_train)
##X_train_transform = rocket.transform(X_train)
#classifier = RidgeClassifierCV(alphas=np.logspace(-3,3,10))
#classifier.fit(X_train, y_train)
##print(X_test)
##X_test_transform = rocket.transform(X_test)
#X_test = X_test.replace(-float('inf'), -sys.float_info.min)
#score = classifier.score(X_test, y_test)

classifier_pipe = make_pipeline(StandardScaler(), SVC())
classifier_pipe.fit(X_train, y_train)
score = classifier_pipe.score(X_test, y_test)
print(score)

In [33]:
score

0.5289256198347108