In [1]:
import os
import re
from pathlib import Path
from shutil import copyfile
import pandas as pd
import numpy as np

In [2]:
# Folder that is scanned for the per-topic CSV files in the loading cell.
data_folder = './csv_data'
# Total number of rows to sample across all topics combined.
total_samples = 2400

In [3]:
# Load every per-topic CSV found in `data_folder` and record its row count.
#   csv_dfs     : topic name -> DataFrame read from that topic's CSV
#   total_count : number of rows summed over all topics
#   df_count    : one row per topic (topic, count, proportion_count)
csv_dfs = {}
total_count = 0
count_rows = []

# NOTE(review): os.listdir returns entries in arbitrary order, and the order of
# topics here determines the row order of df_count downstream. It is left
# unsorted on purpose so the outputs recorded in this notebook stay valid.
entries = os.listdir(data_folder)

# Raw string: `\.` must reach the regex engine as an escaped dot. In a plain
# string literal it is an invalid escape sequence that newer Python versions
# warn about.
regex = re.compile(r'data_([a-zA-Z]*)_nodups_wobj\.csv')

for filename in entries:
    # Skip hidden files.
    if filename.startswith('.'):
        continue
    matched = regex.search(filename)
    if not matched:
        continue

    # The captured group is the topic name embedded in the file name.
    topic_name = matched.group(1)
    csv_path = os.path.join(data_folder, filename)
    csv_dfs[topic_name] = pd.read_csv(csv_path, index_col=0)
    row_count = len(csv_dfs[topic_name].index)
    total_count += row_count
    # -1 is a placeholder; proportion_count is filled in by a later cell.
    count_rows.append([topic_name, row_count, -1])
    print('{:<13}{:<7}'.format(topic_name, row_count))

# Build the summary table once instead of growing it row by row.
df_count = pd.DataFrame(count_rows, columns=['topic', 'count', 'proportion_count'])


Political    29663  
Immigration  6854   
AsianHate    1602   
Boomer       2184   
Vaccine      2671   
Mask         20149  


In [4]:
# Alternative kept for reference: allocate samples proportionally to each topic's row count.
# df_count['proportion_count'] = df_count['count'].apply(lambda x: round((x/total_count)*total_samples))
# Current choice: every topic gets the same quota, i.e. total_samples divided
# by the number of topics, rounded to the nearest integer.
df_count['proportion_count'] = round(total_samples / len(df_count.index))

In [5]:
# Persist the per-topic counts and quotas; index=False keeps the row index out of the file.
df_count.to_csv('./csv_data/sample_count.csv', index=False)
# Bare expression so the notebook renders the table.
df_count

Unnamed: 0,topic,count,proportion_count
0,Political,29663,400
1,Immigration,6854,400
2,AsianHate,1602,400
3,Boomer,2184,400
4,Vaccine,2671,400
5,Mask,20149,400


In [6]:
# Sanity check: total number of rows that will be sampled (rounding of the
# per-topic quota can make this differ from total_samples).
df_count['proportion_count'].sum()

2400

In [7]:
# Draw the per-topic quota of rows from every topic, merge them into a single
# shuffled sheet, and save it as ./csv_data/sample_data.csv.
sample_columns = ['topic', 'hashtag', 'tweet_id', 'image_path', 'sample_path', 'body_text', 'image_text']

# Collect the per-topic samples and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration.
topic_frames = []
for _, row in df_count.iterrows():
    topic_name = row['topic']
    # Fixed seed so the same rows are drawn on every run; int() guards against
    # the quota arriving as a non-integer scalar type from the row Series.
    df_topic = csv_dfs[topic_name].sample(n=int(row['proportion_count']), random_state=42)
    # Add the bookkeeping columns at the positions expected by `sample_columns`.
    # Assumes each topic CSV provides hashtag, tweet_id, image_path and body_text
    # in that order -- TODO confirm against the CSV files.
    df_topic.insert(loc=0, column='topic', value=topic_name)
    df_topic.insert(loc=4, column='sample_path', value=np.nan)
    df_topic.insert(loc=len(df_topic.columns), column='image_text', value=np.nan)
    topic_frames.append(df_topic)

if topic_frames:
    df_samples = pd.concat(topic_frames)
    # Keep the documented columns first; any extra columns a topic CSV carries follow.
    extra_columns = [col for col in df_samples.columns if col not in sample_columns]
    df_samples = df_samples.reindex(columns=sample_columns + extra_columns)
else:
    # No topics were loaded: produce an empty sheet with the expected header.
    df_samples = pd.DataFrame(columns=sample_columns)

# Shuffle all rows (seeded) and renumber them from 0.
df_samples = df_samples.sample(frac=1, random_state=42).reset_index(drop=True)
df_samples.to_csv('./csv_data/sample_data.csv', index=False)
df_samples

Unnamed: 0,topic,hashtag,tweet_id,image_path,sample_path,body_text,image_text
0,Mask,masksoff,1290868734429560833,./data_Mask/masksoff/tweets1/EeoWRcMUcAkQCPv.jpg,,@realDonaldTrump Is this the correct way to we...,
1,Vaccine,CovidHoax,1298333011654533121,./data_Vaccine/CovidHoax/tweets4/EgSYL8-UEAAvm...,,#covidHOAX #PLANDEMIC \n\nWhy FORCE vaccines?\...,
2,AsianHate,ChinaVirus,1298416220341809153,./data_AsianHate/ChinaVirus/tweets4/EgTmq8KVoA...,,"In a “wartime state” of lockdown, residents in...",
3,Vaccine,COVID19Vaccine,1297957738069110784,./data_Vaccine/COVID19Vaccine/tweets4/EgNFrqDU...,,@briantylercohen @realDonaldTrump needs Russia...,
4,Mask,NoMasks,1295274000311099392,./data_Mask/NoMasks/tweets3/Efm8eAkWkAAdDdq.jpg,,@Uber ...you won't be getting my business from...,
...,...,...,...,...,...,...,...
2395,Vaccine,CovidHoax,1289652181981786114,./data_Vaccine/CovidHoax/tweets1/EeXD00OXsAAHT...,,As Unemployment Benefits End Today Trump Admin...,
2396,AsianHate,ChinaVirus,1299044347480928257,./data_AsianHate/ChinaVirus/tweets4/Egch8qDVAA...,,@TheBrandonMorse @shiroihamusan Told you 😀 \n\...,
2397,AsianHate,CCPVirus,1290292534346883077,./data_AsianHate/CCPVirus/tweets1/EegKN1sXsAEv...,,@realDonaldTrump Chinese communist party won’t...,
2398,Boomer,trumpliesamericansdie,1299009244696645632,./data_Boomer/trumpliesamericansdie/tweets4/Eg...,,.@vp Pence should really stop plagiarizing the...,


# Please only run cells below

## Copy selected samples and generate sample path

### Before running this part, please check that the topics you want to process are NOT already in the annotation_data folder (topics whose folder already exists there are skipped)

In [8]:
import os
from pathlib import Path
from shutil import copyfile
import pandas as pd
import numpy as np

In [9]:
# Reload the tables written by the sampling cells above, so this part of the
# notebook can run without re-executing them.
df_count = pd.read_csv('./csv_data/sample_count.csv')
df_samples = pd.read_csv('./csv_data/sample_data.csv')

In [10]:
def get_sample_path_and_copy(df, topic, annot_data_path):
    """Return the annotation-folder path for one sample row, copying its image if needed.

    Despite its name, `df` is a single row (this function is applied with
    ``axis=1``); it must provide 'sample_path', 'topic' and 'image_path'.

    Args:
        df: Row of the sample sheet.
        topic: Topic currently being processed.
        annot_data_path: Destination folder for images of `topic`.

    Returns:
        The already-recorded 'sample_path' if the row has one; NaN if the row
        belongs to a different topic; otherwise the path of the freshly copied
        image inside `annot_data_path`.
    """
    recorded_path = df['sample_path']

    # A path recorded by an earlier run is kept as-is.
    if pd.notna(recorded_path):
        return recorded_path

    # Rows of other topics stay empty until their own topic is processed.
    if df['topic'] != topic:
        return np.nan

    source_path = df['image_path']
    # Keep only the file name (text after the last '/').
    file_name = source_path.rsplit('/', 1)[-1]
    target_path = os.path.join(annot_data_path, file_name)
    copyfile(source_path, target_path)
    return target_path

In [11]:
# Copy the sampled images of every locally available topic into
# ./annotation_data/<topic>/ and record the copied path in df_samples['sample_path'].
sample_folder = './annotation_data'
Path(sample_folder).mkdir(exist_ok=True)

for topic_name in df_count['topic']:
    topic_data_path = os.path.join('./', 'data_' + topic_name)
    annot_data_path = os.path.join(sample_folder, topic_name)

    # if data_{topic_name} folder doesn't exist
    if not Path(topic_data_path).is_dir():
        print('{} not exists; pass this topic'.format(topic_data_path))
        continue

    # if the sampled data for this topic already exists
    if Path(annot_data_path).is_dir():
        print('{} already exist; pass this topic'.format(annot_data_path))
        continue

    Path(annot_data_path).mkdir()

    try:
        df_samples['sample_path'] = df_samples.apply(get_sample_path_and_copy, axis=1, topic=topic_name, annot_data_path=annot_data_path)
    except BaseException:
        # A copy failed (or the run was interrupted) part-way through. Remove
        # the half-filled topic folder; otherwise the "already exist" guard
        # above would skip this topic on every later run without its
        # sample_path values ever being recorded.
        for leftover in Path(annot_data_path).iterdir():
            leftover.unlink()
        Path(annot_data_path).rmdir()
        raise

    print('Topic \"{}\" processed'.format(topic_name))

./data_Political not exists; pass this topic
Topic "Immigration" processed
./data_AsianHate not exists; pass this topic
./data_Boomer not exists; pass this topic
Topic "Vaccine" processed
./data_Mask not exists; pass this topic


In [12]:
# Write the sheet back, now including the sample_path values filled in above;
# index=False keeps the row index out of the file.
df_samples.to_csv('./csv_data/sample_data.csv', index=False)
# Bare expression so the notebook renders the table.
df_samples

Unnamed: 0,topic,hashtag,tweet_id,image_path,sample_path,body_text,image_text
0,Mask,masksoff,1290868734429560833,./data_Mask/masksoff/tweets1/EeoWRcMUcAkQCPv.jpg,,@realDonaldTrump Is this the correct way to we...,
1,Vaccine,CovidHoax,1298333011654533121,./data_Vaccine/CovidHoax/tweets4/EgSYL8-UEAAvm...,./annotation_data/Vaccine/EgSYL8-UEAAvmqv.jpg,#covidHOAX #PLANDEMIC \n\nWhy FORCE vaccines?\...,
2,AsianHate,ChinaVirus,1298416220341809153,./data_AsianHate/ChinaVirus/tweets4/EgTmq8KVoA...,,"In a “wartime state” of lockdown, residents in...",
3,Vaccine,COVID19Vaccine,1297957738069110784,./data_Vaccine/COVID19Vaccine/tweets4/EgNFrqDU...,./annotation_data/Vaccine/EgNFrqDU4AAnVOF.jpg,@briantylercohen @realDonaldTrump needs Russia...,
4,Mask,NoMasks,1295274000311099392,./data_Mask/NoMasks/tweets3/Efm8eAkWkAAdDdq.jpg,,@Uber ...you won't be getting my business from...,
...,...,...,...,...,...,...,...
2395,Vaccine,CovidHoax,1289652181981786114,./data_Vaccine/CovidHoax/tweets1/EeXD00OXsAAHT...,./annotation_data/Vaccine/EeXD00OXsAAHTF9.jpg,As Unemployment Benefits End Today Trump Admin...,
2396,AsianHate,ChinaVirus,1299044347480928257,./data_AsianHate/ChinaVirus/tweets4/Egch8qDVAA...,,@TheBrandonMorse @shiroihamusan Told you 😀 \n\...,
2397,AsianHate,CCPVirus,1290292534346883077,./data_AsianHate/CCPVirus/tweets1/EegKN1sXsAEv...,,@realDonaldTrump Chinese communist party won’t...,
2398,Boomer,trumpliesamericansdie,1299009244696645632,./data_Boomer/trumpliesamericansdie/tweets4/Eg...,,.@vp Pence should really stop plagiarizing the...,
