In [2]:
import numpy as np
import pandas as pd
import os
from os import listdir
from os.path import isfile, join

In [3]:
# Notebook display configuration: raise pandas' column/row/column-width limits
# and render floats with two decimals.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 199)
pd.options.display.float_format = '{:.2f}'.format

# `display` must come from the public IPython.display module; importing it from
# IPython.core.display is deprecated (and fails on recent IPython releases).
from IPython.display import display, HTML
# Stretch the classic-notebook container to 95% of the browser width.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
# Folder holding one cleaned/labeled tweet CSV per user (read-only input).
CLEAN_TWEETS_READ_PATH = r"/arc/project/st-tlemieux-1/data/Cleaned-Labeled-Tweets/"

MERGED_TWEET_CHUNK_SAVE_PATH = r"/scratch/st-tlemieux-1/lfrymire/BERT-input/FINAL-INPUTS/ALL-TWEETS/" #this should be a folder containing three folders - one for each type of CT tweet (make sure it ends in a slash)

# make subfolders:
for type_of_tweets in ["GEN-CT-Tweets", "COV-CT-Tweets", "NON-COV-CT-Tweets"]:

    # exist_ok=True makes the cell safe to re-run and avoids the
    # check-then-create race of `if not os.path.exists(...): os.makedirs(...)`.
    os.makedirs(MERGED_TWEET_CHUNK_SAVE_PATH + type_of_tweets + '/', exist_ok=True)

In [4]:
def get_files_in_folder(folder):
    """Return the names (not full paths) of the regular files directly inside `folder`.

    Sub-directories are skipped; the names come back in the order that
    `listdir` yields them.
    """
    file_names = []
    for entry in listdir(folder):
        # Keep only entries that are files once joined back onto the folder.
        if isfile(join(folder, entry)):
            file_names.append(entry)
    return file_names

In [None]:
# One entry per user file in the input folder. Directory listings come back in
# arbitrary order, and the merge loop below resumes by chunk *index* only, so
# the list is sorted to keep chunk membership identical across kernel restarts.
# NOTE(review): chunks written before this change may have used a different
# ordering - regenerate them if a partially finished run exists.
users = sorted(get_files_in_folder(CLEAN_TWEETS_READ_PATH))

In [None]:
# Columns loaded from every per-user CSV. 'Cleaned Tweet' has to be in this
# list: it is declared in `str_cols` and the merge step renames it to
# 'sentence1', but a column absent from `usecols` is never read, so the merged
# files would otherwise contain no tweet text.
cols_to_use = ['COVID-SPECIFIC CT Tweet (Dummy)', 'CT Tweet (Dummy)',
       'NON-COVID CT Tweet (Dummy)', 'date', 'Cleaned Tweet',
       'language', 'hashtags', 'urls', 'retweet']

# Columns grouped by the dtype they are parsed with.
bool_cols = ['COVID-SPECIFIC CT Tweet (Dummy)', 'CT Tweet (Dummy)','NON-COVID CT Tweet (Dummy)', 'retweet']
str_cols = ['date', 'Cleaned Tweet', 'hashtags', 'urls']

obj_cols = ['language']

bool_cols_dict = {col:np.bool_ for col in bool_cols}
str_cols_dict = {col:str for col in str_cols}
obj_cols_dict = {col:str for col in obj_cols}

# Single column -> dtype mapping handed to pd.read_csv(dtype=...).
datatypes = {**bool_cols_dict, **str_cols_dict, **obj_cols_dict}

In [None]:
# Output sub-folder name -> dummy column that the merge step renames to 'label'.
type_to_label_dict = {
    "GEN-CT-Tweets": 'CT Tweet (Dummy)',
    "COV-CT-Tweets": 'COVID-SPECIFIC CT Tweet (Dummy)',
    "NON-COV-CT-Tweets": 'NON-COVID CT Tweet (Dummy)',
}
               
def read_and_process_df(USER_ID, READ_PATH, type_of_tweet, bad_list):
    """Load one user's tweet CSV and keep the rows whose `language` flag equals 1.

    Parameters
    ----------
    USER_ID
        Value written to the new `user_id` column (the caller passes the file name).
    READ_PATH
        Path of the CSV file to read.
    type_of_tweet
        Not used by this function; kept so existing keyword calls keep working.
    bad_list
        List that `USER_ID` is appended to when the file cannot be read or processed.

    Returns
    -------
    pandas.DataFrame or None
        The filtered frame with the columns in `cols_to_use` plus `user_id`,
        or None when reading/processing failed.
    """
    try:

        df = pd.read_csv(READ_PATH, usecols=cols_to_use, dtype=datatypes)

        df['user_id'] = USER_ID

        # `language` is parsed as str (per the module-level `datatypes` mapping),
        # so `df['language'] == 1` would be False for every row and silently
        # drop all tweets. Compare numerically instead; values that are not
        # numbers become NaN and are filtered out, as before.
        language_is_one = pd.to_numeric(df['language'], errors='coerce') == 1

        return df[language_is_one]

    except Exception:
        # Best effort: remember the unreadable user and carry on. Unlike a bare
        # `except:`, this lets KeyboardInterrupt/SystemExit stop a long run.
        bad_list.append(USER_ID)
        return None

In [None]:
NUM_CHUNKS = 1000

# Split the user files into NUM_CHUNKS groups; each group becomes one merged CSV.
user_chunk_splits = np.array_split(users, NUM_CHUNKS)

# Filled by read_and_process_df with the id of every file that could not be read.
unreadable_users = []

for type_of_tweets in ["COV-CT-Tweets"]: #["GEN-CT-Tweets", "COV-CT-Tweets", "NON-COV-CT-Tweets"]:

    # Dummy column that becomes 'label' for this tweet type. The lookup key must
    # be the loop variable `type_of_tweets`; the earlier `type_of_tweet` was an
    # undefined name and raised NameError on a fresh kernel.
    label_col = type_to_label_dict[type_of_tweets]

    for i, chunk_users in enumerate(user_chunk_splits):

        chunk_save_path = fr"{MERGED_TWEET_CHUNK_SAVE_PATH}{type_of_tweets}/MERGED_CHUNK_{i}"

        # Resume support: a chunk that was already written is not recomputed.
        if os.path.exists(chunk_save_path):
            continue

        user_dfs = [read_and_process_df(USER_ID=user,
                                        READ_PATH=CLEAN_TWEETS_READ_PATH + user,
                                        type_of_tweet=type_of_tweets,
                                        bad_list=unreadable_users) for user in chunk_users]

        # Unreadable files come back as None; pd.concat raises if nothing is left
        # (empty chunk, or every file in the chunk failed), so skip that case.
        user_dfs = [user_df for user_df in user_dfs if user_df is not None]
        if not user_dfs:
            continue

        merged_chunk = pd.concat(user_dfs).rename(columns={'Cleaned Tweet':'sentence1', label_col:'label'})

        # Write to a temporary name and rename afterwards, so an interrupted job
        # never leaves a truncated MERGED_CHUNK_<i> that the resume check above
        # would mistake for a finished chunk.
        tmp_save_path = chunk_save_path + ".tmp"
        merged_chunk.to_csv(tmp_save_path, index=False)
        os.replace(tmp_save_path, chunk_save_path)

# Scratch area: read_csv timing experiments below (not part of the merge pipeline)

In [76]:
# cols_to_use = ['COVID-SPECIFIC CT Tweet (Dummy)', 'CT Tweet (Dummy)',
#        'NON-COVID CT Tweet (Dummy)', 'date', 'Cleaned Tweet',
#        'language', 'retweet']

# bool_cols = ['COVID-SPECIFIC CT Tweet (Dummy)', 'CT Tweet (Dummy)','NON-COVID CT Tweet (Dummy)', 'retweet']
# # str_cols = ['date', 'Cleaned Tweet']
# str_cols = ['Cleaned Tweet']


# obj_cols = ['language']

# bool_cols_dict = {col:np.bool_ for col in bool_cols}
# str_cols_dict = {col:str for col in str_cols}
# obj_cols_dict = {col:str for col in obj_cols}

# datatypes = {**bool_cols_dict, **str_cols_dict, **obj_cols_dict}

In [77]:
# %timeit -n 20 pd.read_csv(r"C:\Users\mikha\Dropbox\mikhael_misc\Projects\594\Twitter-Conspiracies\BERT\1123987267637399554_CLEANED_TWEETS.csv", usecols=cols_to_use, dtype=datatypes)

9.13 ms ± 448 µs per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [78]:
# %timeit -n 20 pd.read_csv(r"C:\Users\mikha\Dropbox\mikhael_misc\Projects\594\Twitter-Conspiracies\BERT\1123987267637399554_CLEANED_TWEETS.csv")

14.7 ms ± 660 µs per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [97]:
# %timeit -n 20 pd.read_csv(r"C:\Users\mikha\Dropbox\mikhael_misc\Projects\594\Twitter-Conspiracies\BERT\1123987267637399554_CLEANED_TWEETS.csv", usecols=cols_to_use, dtype=datatypes, parse_dates=['date'], date_parser=get_date)

12.8 ms ± 1.5 ms per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [108]:
# def get_date(s):
#     Y,M,D = int(s[0:4]), int(s[5:7]), int(s[8:10])
#     return datetime.date(Y,M,D)

# def get_date2(s):
#     return datetime.date(int(s[0:4]), int(s[5:7]), int(s[8:10]))

In [109]:
# %timeit -n 20 pd.read_csv(r"C:\Users\mikha\Dropbox\mikhael_misc\Projects\594\Twitter-Conspiracies\BERT\1123987267637399554_CLEANED_TWEETS.csv", usecols=cols_to_use, dtype=datatypes, parse_dates=['date'], date_parser=get_date2)

12.2 ms ± 801 µs per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [96]:
# get_date(sample['date'].iloc[0])

datetime.date(2021, 6, 26)