In [None]:
import numpy as np
import pandas as pd
import os
from os import listdir
from os.path import isfile, join

In [None]:
# Notebook display configuration: raise pandas' column/row/width limits so
# frames are shown without truncation, and render floats with two decimals.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 199)
pd.set_option('display.float_format', '{:.2f}'.format)

# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule that sets elements with class `container` to 95% width.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
# Folder that the cleaned, labeled tweet CSVs are read from.
CLEAN_TWEETS_READ_PATH = r"/arc/project/st-tlemieux-1/data/Cleaned-Labeled-Tweets/"

# Root output folder. It holds three subfolders, one for each type of CT tweet.
# The value must end in a slash because paths below are built by concatenation.
MERGED_TWEET_CHUNK_SAVE_PATH = r"/scratch/st-tlemieux-1/lfrymire/BERT-input/FINAL-INPUTS/ALL-TWEETS/"

# Make the per-type output subfolders (and any missing parents).
for type_of_tweets in ["GEN-CT-Tweets", "COV-CT-Tweets", "NON-COV-CT-Tweets"]:

    # exist_ok=True makes this safe to re-run and avoids the check-then-create
    # race of testing os.path.exists() before calling os.makedirs().
    os.makedirs(MERGED_TWEET_CHUNK_SAVE_PATH + type_of_tweets + '/', exist_ok=True)

In [None]:
def get_files_in_folder(folder):
    """Return the names of the regular files directly inside ``folder``.

    Subdirectories are excluded and the search is not recursive. Names are
    returned without the folder prefix.

    The result is sorted: ``os.listdir`` yields entries in arbitrary order, and
    callers that split this list positionally need a stable order to get the
    same split on every run.

    Parameters
    ----------
    folder : str or path-like
        Directory to list.

    Returns
    -------
    list of str
        Sorted file names found in ``folder``.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` (for example if ``folder`` does not exist).
    """
    return sorted(f for f in listdir(folder) if isfile(join(folder, f)))

In [None]:
# Each regular file found directly in the cleaned-tweets folder is treated as one user.
users = get_files_in_folder(folder=CLEAN_TWEETS_READ_PATH)

In [None]:
# Lookup from tweet-type folder name to the dummy-label column for that type.
type_to_label_dict = {
    "GEN-CT-Tweets": 'CT Tweet (Dummy)',
    "COV-CT-Tweets": 'COVID-SPECIFIC CT Tweet (Dummy)',
    "NON-COV-CT-Tweets": 'NON-COVID CT Tweet (Dummy)',
}


def read_and_process_df(USER_ID, READ_PATH, type_of_tweet):
    """Load one CSV, tag it with its user, and standardise the column names.

    Parameters
    ----------
    USER_ID
        Value written to every row of the ``user_id`` column.
    READ_PATH
        Path (or file-like object) handed to ``pd.read_csv``.
    type_of_tweet : str
        Key into ``type_to_label_dict`` selecting which column becomes ``label``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with a ``user_id`` column, ``'Cleaned Tweet'`` renamed
        to ``sentence1`` and the selected label column renamed to ``label``.
        All other columns are kept as they are.

    Raises
    ------
    KeyError
        If ``type_of_tweet`` is not a key of ``type_to_label_dict``.
    """
    tweets = pd.read_csv(READ_PATH)
    tweets['user_id'] = USER_ID

    label_column = type_to_label_dict[type_of_tweet]
    new_names = {'Cleaned Tweet': 'sentence1', label_column: 'label'}

    # NOTE(review): rename skips source columns that are absent, so a CSV
    # without the expected columns passes through without an error.
    return tweets.rename(columns=new_names)

In [None]:
# Number of merged output files written per tweet type.
NUM_CHUNKS = 20

# Split the user list into NUM_CHUNKS groups of near-equal size. When there are
# fewer users than NUM_CHUNKS, the trailing groups are empty.
user_chunk_splits = np.array_split(users, NUM_CHUNKS)

for type_of_tweets in ["GEN-CT-Tweets", "COV-CT-Tweets", "NON-COV-CT-Tweets"]:

    # Iterate over the chunks themselves; `i` is the chunk number used in the
    # output file name.
    for i, user_chunk in enumerate(user_chunk_splits):

        # pd.concat raises ValueError on an empty list, so skip empty chunks
        # instead of crashing part-way through the run.
        if len(user_chunk) == 0:
            continue

        # One frame per user, read from
        # <read path>/<tweet type>/<user>CLEANED_TWEETS.csv
        list_of_user_tweet_dfs = [
            read_and_process_df(
                USER_ID=user,
                READ_PATH=CLEAN_TWEETS_READ_PATH + type_of_tweets + '/' + user + "CLEANED_TWEETS.csv",
                type_of_tweet=type_of_tweets,
            )
            for user in user_chunk
        ]

        # Stack the chunk's users into one table and write it without the index.
        pd.concat(list_of_user_tweet_dfs).to_csv(fr"{MERGED_TWEET_CHUNK_SAVE_PATH}{type_of_tweets}/MERGED_CHUNK_{i}", index=False)

In [12]:
# NOTE(review): this cell fetches a lichess leaderboard page and does not use
# anything from the tweet-merging cells above; it looks like leftover scratch
# work. Consider removing it. It needs network access to run.
pd.read_html(io="https://lichess.org/player/top/200/bullet")

[       0                        1     2      3
 0      1         GM RebeccaHarris  3121  17.00
 1      2        GM nihalsarin2004  3077  24.00
 2      3            GM C9C9C9C9C9  3067  27.00
 3      4               IM mutdpro  3021  30.00
 4      5            GM chessbrahs  3010   3.00
 5      6           GM Federicov93  3003   5.00
 6      7              GM muisback  3002   1.00
 7      8                GM Arka50  3002   9.00
 8      9       GM Zhigalko_Sergei  3000  26.00
 9     10                 Shprot86  2982  32.00
 10    11               GM AbasovN  2962   7.00
 11    12          GM Night-King96  2958  25.00
 12    13          IM MatthewG-p4p  2955  20.00
 13    14             GM Eduiturri  2952  22.00
 14    15              IM Mitrabha  2948  11.00
 15    16          IM CheeseMuffin  2948  39.00
 16    17                  Crecent  2947  35.00
 17    18               IM toivok3  2945  14.00
 18    19                GM GMadly  2936  22.00
 19    20              NM Jasugi99  2924