In [88]:
import os
import sys
import multiprocessing
from itertools import islice
from random import randint

import pandas as pd
import numpy as np
from scipy import stats
from scipy import signal 
from statsmodels import robust
import pickle

from sklearn.manifold import TSNE

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

# import tensorflow as tf 
# from tensorflow.keras.utils import to_categorical

from metrics_ import eval_regression, eval_classification, ccc, pcc, accuracy, precision, recall, f1score, auc_roc, cohen_kappa, mcc # import custom evaluation metrics 
from sklearn.metrics import confusion_matrix, classification_report

import joblib 

# custom import 
import model_
import metrics_

## Load dataset 

In [89]:
# Load the April feature tables, one CSV per stance (anti-mask and pro-mask).
# NOTE(review): the two directory names are spelled differently
# ("BERTTweet_..." vs "BERTWeet_...") — confirm both match the folders on disk.
antiMask_df = pd.read_csv("../data/BERTTweet_AntiMask_Features/April_NoMask_Tweets.csv")

proMask_df = pd.read_csv("../data/BERTWeet_ProMask_Features/April_WearMask_Tweets.csv")

In [90]:
# NOTE: only the final bare expression of a cell is auto-displayed, so the next
# line shows nothing; wrap it in print() if the pro-mask shape is wanted as well.
proMask_df.shape
print(antiMask_df.shape)

(195, 768)


In [91]:
# Label each source frame before stacking: 0 = anti-mask, 1 = pro-mask.
# A scalar is broadcast to every row, so no per-row list has to be built.
antiMask_df["ground_truth"] = 0
proMask_df["ground_truth"] = 1

# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it both
# inputs keep their own 0-based row labels, so labels repeat in the result and
# label-based lookups (e.g. m_df.loc[0]) would match more than one row.
m_df = pd.concat([antiMask_df, proMask_df], ignore_index=True)
m_df["month"] = "April"
print(m_df.shape)

(414, 770)


In [92]:
def train_n_rand_validate_model(df, features, model_type, n_seed=5, test_size=0.3):
    """
    Train and evaluate a classifier over several seeded random train/test splits.

    For every seed in ``range(n_seed)`` the frame is split with
    ``train_test_split``, a model is fitted on ``features`` to predict the
    "ground_truth" column, and the classification report for the held-out rows
    is printed (both as a dict and as a DataFrame).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns listed in ``features`` plus "ground_truth"
        and "month".
    features : list of str
        Names of the columns used as model inputs.
    model_type : str
        Only "rf" is supported.
        NOTE(review): despite the name, "rf" fits ``model_.xgboost_classifier``
        — confirm which model is actually intended.
    n_seed : int, default 5
        Number of random splits; seeds 0 .. n_seed-1 are used as ``random_state``.
    test_size : float, default 0.3
        Value forwarded to ``train_test_split(test_size=...)``.

    Returns
    -------
    pandas.DataFrame
        The classification report of the LAST split only. Reports of earlier
        splits are printed but not returned.

    Raises
    ------
    ValueError
        If ``model_type`` is not supported or ``n_seed`` is smaller than 1.
    """
    # Fail early with a clear message; previously an unknown model_type only
    # surfaced later as an unbound `model` variable at predict time.
    if model_type != "rf":
        raise ValueError(
            "Unsupported model_type {!r}; only 'rf' is implemented.".format(model_type)
        )
    if n_seed < 1:
        raise ValueError("n_seed must be at least 1, got {!r}.".format(n_seed))

    cls_report_df = None
    for seed_ in range(n_seed):
        # NOTE(review): the split is stratified on "month", not on the label, so
        # class balance between train and test is not enforced — confirm intent.
        train_df, test_df = train_test_split(
            df,
            test_size=test_size,
            random_state=seed_,
            stratify=df["month"],
        )

        X_train, y_train = train_df[features], train_df["ground_truth"]
        X_test, y_test = test_df[features], test_df["ground_truth"]

        print("Number of features used in model training:", X_train.shape)

        # Cast the inputs to float before handing them to the classifier.
        X_train = X_train.astype(float)
        X_test = X_test.astype(float)

        model = model_.xgboost_classifier(X_train, y_train)
        pred = model.predict(X_test)

        clf_report = classification_report(y_test, pred, output_dict=True)
        print(clf_report)
        cls_report_df = cls_report_dict2mat(clf_report)
        print(cls_report_df)

    return cls_report_df


def cls_report_dict2mat(cls_report_dict):
    """
    Convert a classification-report dictionary into a DataFrame.

    Every top-level key of the dictionary becomes a column; the keys of the
    nested dictionaries (precision, recall, f1-score, support) become the rows.
    """
    report_frame = pd.DataFrame(data=cls_report_dict)
    return report_frame

In [32]:
# List every column name of the stacked frame.
print(m_df.columns.tolist())

['BERTWeet_1', 'BERTWeet_2', 'BERTWeet_3', 'BERTWeet_4', 'BERTWeet_5', 'BERTWeet_6', 'BERTWeet_7', 'BERTWeet_8', 'BERTWeet_9', 'BERTWeet_10', 'BERTWeet_11', 'BERTWeet_12', 'BERTWeet_13', 'BERTWeet_14', 'BERTWeet_15', 'BERTWeet_16', 'BERTWeet_17', 'BERTWeet_18', 'BERTWeet_19', 'BERTWeet_20', 'BERTWeet_21', 'BERTWeet_22', 'BERTWeet_23', 'BERTWeet_24', 'BERTWeet_25', 'BERTWeet_26', 'BERTWeet_27', 'BERTWeet_28', 'BERTWeet_29', 'BERTWeet_30', 'BERTWeet_31', 'BERTWeet_32', 'BERTWeet_33', 'BERTWeet_34', 'BERTWeet_35', 'BERTWeet_36', 'BERTWeet_37', 'BERTWeet_38', 'BERTWeet_39', 'BERTWeet_40', 'BERTWeet_41', 'BERTWeet_42', 'BERTWeet_43', 'BERTWeet_44', 'BERTWeet_45', 'BERTWeet_46', 'BERTWeet_47', 'BERTWeet_48', 'BERTWeet_49', 'BERTWeet_50', 'BERTWeet_51', 'BERTWeet_52', 'BERTWeet_53', 'BERTWeet_54', 'BERTWeet_55', 'BERTWeet_56', 'BERTWeet_57', 'BERTWeet_58', 'BERTWeet_59', 'BERTWeet_60', 'BERTWeet_61', 'BERTWeet_62', 'BERTWeet_63', 'BERTWeet_64', 'BERTWeet_65', 'BERTWeet_66', 'BERTWeet_67', 'BE

In [41]:
# Names of the 768 embedding columns: BERTWeet_1 ... BERTWeet_768.
features = list(map("BERTWeet_{}".format, range(1, 769)))

# Fit and evaluate with the "rf" model type; keep the returned report frame.
res_df = train_n_rand_validate_model(m_df, features, "rf")

Number of features used in model training: (289, 768)
{'0': {'precision': 0.9166666666666666, 'recall': 0.9016393442622951, 'f1-score': 0.9090909090909091, 'support': 61}, '1': {'precision': 0.9076923076923077, 'recall': 0.921875, 'f1-score': 0.9147286821705427, 'support': 64}, 'accuracy': 0.912, 'macro avg': {'precision': 0.9121794871794872, 'recall': 0.9117571721311475, 'f1-score': 0.9119097956307258, 'support': 125}, 'weighted avg': {'precision': 0.9120717948717949, 'recall': 0.912, 'f1-score': 0.9119774489076815, 'support': 125}}
                   0          1  accuracy   macro avg  weighted avg
precision   0.916667   0.907692     0.912    0.912179      0.912072
recall      0.901639   0.921875     0.912    0.911757      0.912000
f1-score    0.909091   0.914729     0.912    0.911910      0.911977
support    61.000000  64.000000     0.912  125.000000    125.000000
Number of features used in model training: (289, 768)
{'0': {'precision': 0.9454545454545454, 'recall': 0.88135593220338

In [42]:
# Report frame returned by train_n_rand_validate_model; it reflects the final
# split only, because the function returns the report built in its last iteration.
res_df


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.966667,0.953846,0.96,0.960256,0.960103
recall,0.95082,0.96875,0.96,0.959785,0.96
f1-score,0.958678,0.96124,0.96,0.959959,0.95999
support,61.0,64.0,0.96,125.0,125.0


In [43]:
# Preview the first rows of the stacked frame.
m_df.head()


Unnamed: 0,BERTWeet_1,BERTWeet_2,BERTWeet_3,BERTWeet_4,BERTWeet_5,BERTWeet_6,BERTWeet_7,BERTWeet_8,BERTWeet_9,BERTWeet_10,...,BERTWeet_761,BERTWeet_762,BERTWeet_763,BERTWeet_764,BERTWeet_765,BERTWeet_766,BERTWeet_767,BERTWeet_768,ground_truth,month
0,0.142188,0.048057,0.03179,-0.036486,0.195182,0.380576,0.002807,0.060406,-0.122444,-0.070154,...,0.194742,-0.301981,-0.084923,-0.078743,-0.089278,-0.237864,0.054575,0.087381,0,April
1,0.225371,-0.090585,0.054919,-0.016016,0.13858,0.499586,0.080081,0.019572,-0.211389,-0.057985,...,0.320208,-0.165865,-0.094728,-0.070827,-0.110878,-0.271292,0.022292,0.012486,0,April
2,0.206832,-0.078581,0.066508,-0.026571,0.156235,0.483117,0.049465,0.061272,-0.198823,-0.059546,...,0.287926,-0.22077,-0.072641,-0.041031,-0.076787,-0.247851,-0.03316,0.05044,0,April
3,0.192408,-0.00946,0.164538,0.083162,0.14031,0.424962,-0.023377,0.068898,-0.145772,0.043453,...,0.128669,-0.278425,-0.068543,-0.113591,-0.141383,-0.218738,0.023755,0.047479,0,April
4,0.201262,0.020471,0.158864,0.075687,0.152396,0.427829,-0.052581,0.063467,-0.196136,0.038774,...,0.15298,-0.262548,-0.123953,-0.063009,-0.12804,-0.236646,-0.027321,0.112876,0,April


## Raw files


In [44]:
# Load the combined pro/anti-mask table (presumably class-balanced, going by the
# file name — TODO confirm).
pro_n_anti_masks_df = pd.read_csv("../data/stack_files/balanced_pro_n_anti_mask_df.csv")

In [45]:
# Keep only the rows whose month is April.
pro_n_anti_masks_df_v2 = pro_n_anti_masks_df.loc[pro_n_anti_masks_df["month"].eq("April")]

In [46]:
# Dimensions of the April subset.
pro_n_anti_masks_df_v2.shape

(241, 19)

In [47]:
# Distinct month values present in the full (unfiltered) table.
pro_n_anti_masks_df["month"].unique().tolist()

['September',
 'May',
 'April',
 'June',
 'March',
 'November',
 'August',
 'October',
 'July']

In [77]:
# Pull the ID column per label in one .loc lookup each (1 = pro-mask, 0 = anti-mask).
proMask_ID = pro_n_anti_masks_df_v2.loc[pro_n_anti_masks_df_v2["ground_truth"] == 1, "ID"]
antiMask_ID = pro_n_anti_masks_df_v2.loc[pro_n_anti_masks_df_v2["ground_truth"] == 0, "ID"]

In [84]:
# Attach the IDs to the anti-mask feature rows purely by position.
# NOTE(review): this assumes the feature CSV rows are in the same order as the
# April anti-mask rows of pro_n_anti_masks_df_v2 — nothing here verifies that.
antiMask_df["ID"] = antiMask_ID.tolist()
# Pro-mask counterpart, currently disabled:
# proMask_df["ID"] = proMask_ID.tolist()

In [85]:
# Dimensions of the anti-mask frame after adding the ID column.
antiMask_df.shape

(195, 769)

In [86]:
# Number of anti-mask IDs; compare with the row count of antiMask_df.
antiMask_ID.shape

(195,)

In [87]:
# First three rows, columns from position 760 onward (the last feature columns
# plus whatever was appended after them).
antiMask_df.iloc[:3, 760:]

Unnamed: 0,BERTWeet_761,BERTWeet_762,BERTWeet_763,BERTWeet_764,BERTWeet_765,BERTWeet_766,BERTWeet_767,BERTWeet_768,ID
0,0.194742,-0.301981,-0.084923,-0.078743,-0.089278,-0.237864,0.054575,0.087381,0_April_0000001
1,0.320208,-0.165865,-0.094728,-0.070827,-0.110878,-0.271292,0.022292,0.012486,0_April_0000002
2,0.287926,-0.22077,-0.072641,-0.041031,-0.076787,-0.247851,-0.03316,0.05044,0_April_0000003


In [71]:
# Dimensions of the pro-mask frame.
proMask_df.shape

(219, 769)

In [72]:
# Display the pro-mask frame (consider .head() to keep the output short).
proMask_df

Unnamed: 0,BERTWeet_1,BERTWeet_2,BERTWeet_3,BERTWeet_4,BERTWeet_5,BERTWeet_6,BERTWeet_7,BERTWeet_8,BERTWeet_9,BERTWeet_10,...,BERTWeet_760,BERTWeet_761,BERTWeet_762,BERTWeet_763,BERTWeet_764,BERTWeet_765,BERTWeet_766,BERTWeet_767,BERTWeet_768,ID
0,0.071437,-0.100726,0.102988,-0.028361,0.126682,0.434285,-0.010150,-0.019423,-0.164616,-0.052037,...,0.131914,0.209260,-0.171148,-0.076508,-0.046731,-0.077347,-0.240647,-0.006091,0.133510,
1,0.210390,-0.055628,0.049902,0.018345,0.226813,0.404962,-0.047979,0.100989,-0.134225,0.020933,...,0.148774,0.265777,-0.259552,-0.097490,-0.004045,-0.102579,-0.297604,0.031886,0.113123,
2,0.156704,0.013002,-0.011852,0.001090,0.213118,0.428469,0.002115,0.115488,-0.081904,0.001959,...,0.262819,0.198366,-0.239077,-0.129620,-0.106067,-0.097036,-0.280181,-0.037167,0.153912,
3,0.154652,-0.159348,0.025991,0.007464,0.061832,0.439511,0.061781,0.048084,-0.159025,0.016962,...,0.199038,0.287021,-0.281578,-0.064530,-0.022120,-0.059693,-0.297258,0.040906,0.088476,
4,0.128074,-0.013413,0.019466,0.019475,0.166905,0.495404,0.006263,0.076829,-0.224596,0.063607,...,0.191079,0.226248,-0.285532,-0.080705,0.004713,-0.015951,-0.357933,0.025337,0.100046,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,0.153814,-0.081657,0.016710,0.026496,0.212489,0.501891,0.044370,0.125191,-0.186947,0.017996,...,0.154749,0.282559,-0.198415,-0.146451,-0.031101,-0.130268,-0.246979,0.011169,0.144383,
215,0.206512,-0.053801,0.027208,-0.059864,0.188722,0.435567,-0.049499,0.144703,-0.179089,0.021713,...,0.193924,0.215753,-0.283091,-0.211325,-0.027597,-0.051916,-0.325179,0.040851,0.103828,
216,0.185589,-0.099425,-0.019123,-0.039940,0.183366,0.483113,-0.006802,0.072486,-0.235371,-0.004751,...,0.165921,0.240426,-0.213226,-0.040271,-0.041373,-0.051637,-0.204066,0.000002,0.056951,
217,0.086036,-0.070756,0.019359,0.019545,0.179126,0.496804,0.060463,0.088909,-0.179757,0.004075,...,0.176351,0.238674,-0.245641,-0.113017,0.016793,-0.116160,-0.304806,0.026778,0.086565,


In [73]:
# Show only the rows without missing values. The result is not assigned, so
# proMask_df itself is left unchanged.
proMask_df.dropna()

Unnamed: 0,BERTWeet_1,BERTWeet_2,BERTWeet_3,BERTWeet_4,BERTWeet_5,BERTWeet_6,BERTWeet_7,BERTWeet_8,BERTWeet_9,BERTWeet_10,...,BERTWeet_760,BERTWeet_761,BERTWeet_762,BERTWeet_763,BERTWeet_764,BERTWeet_765,BERTWeet_766,BERTWeet_767,BERTWeet_768,ID


In [96]:
# Write the stacked frame to CSV with a header row and without the index column.
m_df.to_csv("../data/stack_files/samp_BERT_df_april.csv", index=False, header=True)

In [97]:
# NOTE: this rebinds pro_n_anti_masks_df, which an earlier cell loaded from
# balanced_pro_n_anti_mask_df.csv, to the contents of a different file.
pro_n_anti_masks_df = pd.read_csv("../data/stack_files/pro_n_anti_mask_df.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [98]:
# Dimensions of the table loaded from pro_n_anti_mask_df.csv.
pro_n_anti_masks_df.shape

(47013, 19)

In [99]:
# Keep only the rows whose month is April.
samp_df = pro_n_anti_masks_df.loc[pro_n_anti_masks_df["month"].eq("April")]

In [100]:
# Dimensions of the April subset.
samp_df.shape

(414, 19)

In [101]:
# Write the April subset to CSV with a header row and without the index column.
samp_df.to_csv("../data/stack_files/samp_raw_df_april.csv", index=False, header=True)