In [1]:
import os
import sys
import multiprocessing
from itertools import islice
from random import randint

import pandas as pd
import numpy as np
from scipy import stats
from scipy import signal 
from statsmodels import robust
import pickle

from sklearn.manifold import TSNE

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

# import tensorflow as tf 
# from tensorflow.keras.utils import to_categorical

from metrics_ import eval_regression, eval_classification, ccc, pcc, accuracy, precision, recall, f1score, auc_roc, cohen_kappa, mcc # import custom evaluation metrics 
from sklearn.metrics import confusion_matrix, classification_report

import joblib 

# custom import 
import model_
import metrics_



## Load dataset 

In [None]:
antiMask_df = pd.read_csv("../data/BERTTweet_AntiMask_Features/April_NoMask_Tweets.csv")

proMask_df = pd.read_csv("../data/BERTWeet_ProMask_Features/April_WearMask_Tweets.csv")

In [None]:
proMask_df.shape
print(antiMask_df.shape)

In [None]:
antiMask_df["ground_truth"] = [0] * antiMask_df.shape[0]
proMask_df["ground_truth"] = [1] * proMask_df.shape[0]

m_df = pd.concat([antiMask_df, proMask_df])
m_df["month"] = ["April"] * m_df.shape[0]
print(m_df.shape)

In [None]:
def train_n_rand_validate_model(df, features, model_type):
    # count_ = 0
    n_seed = 5
    m_res_mat = np.zeros((0, 10)) # 8 is subject to change 
    for seed_ in range(n_seed): # validation 4 times 
        # train_df = df[df["subject"].isin(fold) == False]
        # test_df = df[df["subject"].isin(fold) == True]

        # count_ += 1 
        train_df, test_df, y_train_vec, y_test_vec = train_test_split(df, df["ground_truth"], 
                                                                    test_size=0.3, # 0.3, 0.95
                                                                    random_state=seed_, 
                                                                    stratify=df["month"])


        X_train, y_train = train_df[features], train_df["ground_truth"]
        X_test, y_test = test_df[features], test_df["ground_truth"]

        print("Number of features used in model training:", X_train.shape)

        # train model 
        if model_type == "rf":
            X_train = X_train.astype(float)
            X_test = X_test.astype(float)
#             X_train, X_test = min_max_norm(X_train, X_test, "")

            model = model_.xgboost_classifier(X_train, y_train)
    
        pred = model.predict(X_test)
        orig = y_test


       	clf_report = classification_report(orig, pred, output_dict=True)
       	print(clf_report)
        cls_report_df = cls_report_dict2mat(clf_report)
        print(cls_report_df)

    return cls_report_df


def cls_report_dict2mat(cls_report_dict):
    """
    columns -> precision, recall, f1-score, and support
    row class, accuracy, ... 
    """

    return pd.DataFrame(cls_report_dict)

In [None]:
print(m_df.columns.tolist())

In [None]:
features = ["BERTWeet_" + str(i) for i in range(1, 769)]
# print(features)

res_df = train_n_rand_validate_model(m_df, features, "rf")

In [None]:
res_df


In [None]:
m_df.head()


## raw files


In [None]:
pro_n_anti_masks_df = pd.read_csv("../data/stack_files/balanced_pro_n_anti_mask_df.csv")

In [None]:
# get april tweets 
pro_n_anti_masks_df_v2 = pro_n_anti_masks_df[pro_n_anti_masks_df["month"] == "April"]

In [None]:
pro_n_anti_masks_df_v2.shape

In [None]:
pro_n_anti_masks_df["month"].unique().tolist()

In [None]:
proMask_ID = pro_n_anti_masks_df_v2[pro_n_anti_masks_df_v2["ground_truth"] == 1]["ID"]
antiMask_ID = pro_n_anti_masks_df_v2[pro_n_anti_masks_df_v2["ground_truth"] == 0]["ID"]

In [None]:
antiMask_df["ID"] = antiMask_ID.tolist()
# proMask_df["ID"] = proMask_ID.tolist()

In [None]:
antiMask_df.shape

In [None]:
antiMask_ID.shape

In [None]:
antiMask_df.iloc[:3, 760:]

In [None]:
proMask_df.shape

In [None]:
proMask_df

In [None]:
proMask_df.dropna()

In [None]:
m_df.to_csv("../data/stack_files/samp_BERT_df_april.csv", index=False, header=True)

In [None]:
pro_n_anti_masks_df = pd.read_csv("../data/stack_files/pro_n_anti_mask_df.csv")

In [None]:
pro_n_anti_masks_df.shape

In [None]:
samp_df = pro_n_anti_masks_df[pro_n_anti_masks_df["month"] == "April"]

In [None]:
samp_df.shape

In [None]:
samp_df.to_csv("../data/stack_files/samp_raw_df_april.csv", index=False, header=True)

## Cross fold validation 

In [94]:
import os
import sys
import multiprocessing
from itertools import islice
from random import randint

import pandas as pd
import numpy as np
from scipy import stats
from scipy import signal 
from statsmodels import robust
import pickle

from sklearn.manifold import TSNE

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

# import tensorflow as tf 
# from tensorflow.keras.utils import to_categorical

from metrics_ import eval_regression, eval_classification, ccc, pcc, accuracy, precision, recall, f1score, auc_roc, cohen_kappa, mcc # import custom evaluation metrics 
from sklearn.metrics import confusion_matrix, classification_report

import joblib 

# custom import 
import model_
import metrics_



def train_n_cv_valid_models(df, features, flag):
    folds = df.folds.unique().tolist()
    print(folds)

    res_lst = []
    fold_count = 0
    m_res_mat = np.zeros((0, 10)) # 8 is subject to change 
    for fold in folds:
        train_df = df[df["folds"].isin([fold]) == False]
        test_df = df[df["folds"].isin([fold]) == True]
        
        print(train_df.shape)
        
        # run experiment for classification with BERT features 
        if flag == "XGB":
            model = model_.xgboost_classifier(train_df[features].astype(np.float64), train_df["ground_truth"])
            pred = model.predict(test_df[features])

            clf_report = classification_report(test_df["ground_truth"], pred, output_dict=True)
            cls_report_df = cls_report_dict2mat(clf_report)
            print(cls_report_df)
        
                
    return 0 


# def fit_model(df, df2, features, flag):
#     folds = df.folds.unique().tolist()
#     print(folds)

#     res_lst = []
#     fold_count = 0
#     m_res_mat = np.zeros((0, 10)) # 8 is subject to change 
#     for fold in folds:
#         train_df = df[df["folds"].isin([fold]) == False]
#         test_df = df[df["folds"].isin([fold]) == True]
        
#         print(train_df.shape)
        
#         if flag == "TF-IDF":
#             TF_IDF_df = get_TF_IDF_mat(list_of_tweets, list_of_IDs) 
        
# #         # run LDA 
# #         if flag == "LDA":
# #             col_name = "Tweet Text" # Data must be matrix 
# #             n_comps = 10
# #             X_train, X_test = model_.latent_da_v2(train_df[col_name], test_df[col_name], n_comps) 
# #             print(X_train.shape, X_test.shape)
        
        
#         # run K-means 
        
        
#         # run TF-IDF 
        
        
#     return 0 


def cls_report_dict2mat(cls_report_dict):
    """
    columns -> precision, recall, f1-score, and support
    row class, accuracy, ... 
    """

    return pd.DataFrame(cls_report_dict)


df.shape

(16317, 29)

In [103]:
# dummy dataset 
data_dir = "../data/stack_files/"
# filename = "samp_raw_df_april_v2.csv"
filename = "balanced_pro_n_anti_mask_df_v3.csv"

df = pd.read_csv(os.path.join(data_dir, filename), delimiter=",")

# df = df[df["ground_truth"].isin(["0", "1"]) == True]


# BERT file 
filename = "samp_BERT_df_april.csv"
BERT_df = pd.read_csv(os.path.join(data_dir, filename))



# def fix_comma(df):
#     df['Tweet Text'] = [item.replace(',', '') for item in df['Tweet Text']]
            
#     return df.copy()


# filename = "balanced_pro_n_anti_mask_df_v3.csv"
# df = fix_comma(df)

# df.to_csv(os.path.join(data_dir, filename), index=False, header=True)

# L = df.iloc[:, 0]

In [102]:
new_col_df = df.iloc[:, 24:].copy()

# new_col_df.iloc[510:515, 
new_col_df.columns


# total = 0 
# c = [24, 25, 26, 27, 28]
# for i in range(5):
    
    
# X = new_col_df[new_col_df[["Unnamed: 24"]].notna()]

# # X.shape
# new_col_df.head()


# col_names = ["Unnamed: 24", "Unnamed: 25" "Unnamed: 26", "amed: 24", "Unnamed: 24"]

X = new_col_df[new_col_df[["Unnamed: 24"]].isna()]

X.shape

(16317, 5)

In [None]:
features = ["BERTWeet_" + str(i) for i in range(1, 769)] + ["month"]

merged_df = pd.merge(df, BERT_df[features], on=["month"], sort=True)
# res_df = train_n_cv_valid_models(df, features, "XGB")

In [80]:
X = [item.replace(',', '') for item in df['Tweet Text']]

X[513]


len(['4', '2', '1', '3', '0_September_0000162', '0', '0_September_0000344', '0_September_0000419', '0_September_0000509', '0_September_0000512', '0_September_0000580', '0_September_0000956', '0_September_0000964', '0_September_0001009', '0_May_0000048', '0_May_0000052', '0_May_0000068', 'May', '0_May_0000100', '0_May_0000263', '0_May_0000333', '0_May_0000368', '0_May_0000450', '0_April_0000088', nan, '0_April_0000156', '0_June_0000008', 'June', '0_June_0000126', '0_June_0000472', '0_June_0000479', '0_June_0000493', '0_June_0000556', '0_November_0000058', '0_November_0000134', '0_November_0000193', '0_November_0000204', '0_November_0000225', '0_August_0000208', '0_August_0000324', '0_August_0000553', '0_August_0000708', '0_August_0000788', '0_August_0000817', '0_August_0000836', '0_August_0000860', '0_August_0000935', '0_August_0000955', 'August', '0_October_0000297', '0_October_0000330', '0_October_0000437', '0_October_0000609', '0_October_0000696', '0_October_0000726', '0_October_0000774', '0_October_0000872', '0_October_0000918', '0_October_0000928', '0_October_0001048', '0_October_0001057', '0_October_0001074', '0_October_0001100', '0_July_0000005', '0_July_0000012', '0_July_0000169', '0_July_0000380', '0_July_0000435', '0_July_0000438', 'July', '0_July_0000688', '0_July_0001029', '0_July_0001108', '0_July_0001238', '0_July_0001257', '0_July_0001335', '0_July_0001372', '0_July_0001519', '0_July_0001523', '0_July_0001723', '0_July_0001736', '0_July_0001848', '0_July_0001880', '0_July_0001885', '0_July_0001983', '0_July_0002019', '0_July_0002064', '0_July_0002397', '0_July_0002521', '0_July_0002547', '0_July_0002579', '0_July_0002586', '0_July_0002631', '0_July_0002640', '0_July_0002832', '0_July_0002867', '0_July_0002878', '0_July_0002889', '0_July_0002890', '0_July_0002891', '0_July_0002892', '0_July_0002901', '0_July_0002906', '0_July_0002927', '0_July_0002950', '1_July_0000933', '1_July_0000299', '1_July_0005034', '1_October_0005399', '1_March_0000295', '1_July_0004960', '1_May_0000502', '1_September_0002105', '1_July_0004670', '1_July_0004363', '1_June_0002286', '1_July_0005927', '1_October_0000659', '1.16576196303879E+018', '1_August_0000615', '1_May_0000644', 'September', '1_August_0003827', '1_June_0004213', '1_April_0000211', '1_August_0000976', '1_June_0002302', '1_May_0000651', '1_June_0004346', '1_July_0001511', '1_August_0001383', '1_July_0001014', '1_July_0007389', '1_August_0001782', '1_August_0003814', '1_June_0005122', '1_September_0001351', '1_July_0000605', '1_August_0003438', '1_July_0002666', '1_May_0001210', '1_September_0006399', '1_October_0000173', '1_July_0000881', '1_July_0008506', '1_August_0002963', '1_November_0000740', '1_October_0001429', '1_July_0006108', '1_July_0004401', '1_June_0000344', '1_July_0001362', '1_May_0000987', '1_August_0002512', '1_July_0009167', '1_June_0004871', '1_July_0001357', '1_November_0000557', '1_October_0004640', '1_June_0001396', '1_July_0005334', '1_July_0005819', '1_October_0004375', '1_June_0005858', '1_September_0004685', '1_September_0006156', '1_May_0000622', '1_September_0006070', '1_July_0006718', '1_July_0001352', '1_July_0002727', '1_October_0003409', '1_June_0001409', '1_August_0005453', '1_June_0001731', '1_October_0001430', '1_July_0001402', '1_October_0003705', '1_June_0004080', '1_June_0005354', '1_August_0002428', '1_June_0000100', '1_October_0003437', '1_July_0010062', '1_July_0008562', '1_May_0000591', '1_June_0000440', '1_June_0003892', '1_July_0000396', '1_August_0003851', '1_July_0000955', '1_September_0004692', '1_October_0000330', '1_July_0001057', '1_June_0003660', '1_July_0002731', '1_September_0006069', '1_May_0001626', '1_August_0001995', '1_June_0000441', '1_July_0001119', '1_November_0000151', '1_July_0003462', '1_October_0000575', '1_April_0000036', '1_August_0003693', '1_November_0000396', '1_October_0002513', '1_July_0001232', '1_November_0000215', '1_November_0000832', '1_May_0001433', '1_July_0009917', '1_July_0000114', '1_July_0009501', '1_July_0004420', '1_July_0000084', '1_August_0001899', '1_March_0000393', '1_July_0004450', '1_July_0004320', '1_November_0000326', '1_October_0000555', '1_October_0003354', '1_June_0004377', '1_August_0001402', '1_July_0002681', '1_September_0006384', '1_June_0004806', '1_July_0005206', '1_September_0000376', '1_August_0001415', '1_September_0005777', '1_October_0000351'])

NameError: name 'nan' is not defined

In [104]:
# df2 = df.iloc[:, :24]
# df2.shape

# df = df2.copy()
# df = df.iloc[:, :24]
intp_features = ["Retweets", "Favorites", 'vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']


_ = train_n_cv_valid_models(df, intp_features, "XGB")

[4, 2, 1, 3]
(12008, 29)
                     0            1  accuracy    macro avg  weighted avg
precision     0.668709     0.713120   0.68775     0.690914      0.690692
recall        0.756436     0.617677   0.68775     0.687056      0.687750
f1-score      0.709872     0.661976   0.68775     0.685924      0.686163
support    2020.000000  1980.000000   0.68775  4000.000000   4000.000000
(12001, 29)
                     0            1  accuracy    macro avg  weighted avg
precision     0.669319     0.750143  0.704517     0.709731      0.710810
recall        0.776410     0.636364  0.704517     0.706387      0.704517
f1-score      0.718898     0.688585  0.704517     0.703742      0.703337
support    1950.000000  2057.000000  0.704517  4007.000000   4007.000000
(12005, 29)
                     0            1  accuracy    macro avg  weighted avg
precision     0.679940     0.678034  0.678991     0.678987      0.678990
recall        0.680279     0.677694  0.678991     0.678987      0.678991
f1

In [55]:
df.shape

(16028, 24)

In [None]:
df.dtypes

In [None]:
x = df.iloc[163, :]

In [None]:
print(x)

In [None]:
intp_features = ['Retweets', 'Favorites', 'vader_neg', 'vader_compound']

X = df[intp_features]

In [None]:
df.columns


In [None]:
intp_features = ['Retweets', 'Favorites', 'vader_neg', 'vader_compound']

In [None]:
intp_features

In [None]:
X.dtypes

In [21]:
x = int(df.iloc[420, 11])

In [22]:
print(x)

81


In [23]:
df3 = df.copy()

In [53]:
df2.iloc[510:514, 0]

517    Top of the world. #SocialDistancing #nomask ht...
518    @ZubyMusic Masks don't work. My own mum who wo...
519    This is something worth sharing. If the selfis...
520    @GovBillLee Dr. Scott Atlas,Advisor to the Pre...
Name: Tweet Text, dtype: object

In [27]:
print(df.iloc[:, 11].astype(int))

ValueError: invalid literal for int() with base 10: "['scamdemic2020', 'NoMasks']"

In [32]:
df3 = df[df["ground_truth"].isin(["0", "1"]) == True]

In [33]:
df3.shape

(16028, 24)

In [34]:
df.shape

(16317, 24)

In [35]:
16317 - 16028


289

In [37]:
df[df["ground_truth"].isin(["1"]) == True].shape[0]

7997

In [38]:
df[df["ground_truth"].isin(["0"]) == True].shape[0]

8031

In [39]:
7997 / 8031


0.9957664051799278