In [1]:
import os
import re
from pathlib import Path
from shutil import copyfile
import pandas as pd
import numpy as np

pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
csv_folder_name = 'csv_data_hate'
total_samples = 2000

In [3]:
df_count = pd.DataFrame(columns=['topic', 'count', 'proportion_count'])

csv_dfs = {}
total_count = 0

entries = os.listdir(csv_folder_name)
# regex = re.compile('([a-zA-Z0-9_-]*)_final\.csv')
regex = re.compile('^(?!politics)([a-zA-Z0-9_-]*)_final\.csv')

for filename in entries:
    if filename.startswith('.'): continue
    matched = regex.search(filename)
    if matched:
        topic_name = matched.group(1)
        csv_path = os.path.join(csv_folder_name, filename)
        csv_dfs[topic_name] = pd.read_csv(csv_path, index_col=0)
        row_count = len(csv_dfs[topic_name].index)
        total_count += row_count
        df_count.loc[len(df_count)] = [topic_name, row_count, -1]
        print('{:<20}{:<7}'.format(topic_name, row_count))


immigration_2021-04 186    
immigration_2020-02 183    
mask_04             16     
immigration_2021-02 39     
mask_08             91     
mask_2021-02        53     
immigration_12      52     
vaccine_12          228    
asianhate_12        399    
ageism_12           537    
mask_2021-04        117    
mask_2020-02        2      
immigration_08      144    
vaccine_08          436    
ageism_2021-02      237    
mask_12             63     
asianhate_08        815    
ageism_08           709    
ageism_2020-02      117    
ageism_2021-04      691    
vaccine_2020-02     94     
vaccine_2021-04     166    
asianhate_2021-02   285    
vaccine_04          475    
immigration_04      93     
vaccine_2021-02     112    
ageism_04           1600   
asianhate_2021-04   323    
asianhate_2020-02   538    
asianhate_04        2887   


In [4]:
df_count['minimum'] = df_count['count'].apply(lambda x: x if x <= 20 else 20)
df_count['maximum'] = df_count['count'].apply(lambda x: x if x <= 800 else 800)
df_count['subtract'] = df_count['maximum'] - df_count['minimum']
min_sum = df_count['minimum'].sum()
max_sum = df_count['maximum'].sum()
df_count['proportion_count'] = df_count['subtract'].apply(lambda x: round((x/(max_sum - min_sum))*(total_samples - min_sum)))
df_count['proportion_count'] += df_count['minimum']

print(df_count['proportion_count'].sum())

if df_count['proportion_count'].sum() > total_samples:
    diff = df_count['proportion_count'].sum() - total_samples
    df_count.at[df_count.proportion_count.idxmax(), 'proportion_count'] -= diff
elif df_count['proportion_count'].sum() < total_samples:
    diff = total_samples - df_count['proportion_count'].sum()
    df_count.at[df_count.proportion_count.idxmax(), 'proportion_count'] += diff

df_count

2000


Unnamed: 0,topic,count,proportion_count,minimum,maximum,subtract
0,immigration_2021-04,186,49,20,186,166
1,immigration_2020-02,183,48,20,183,163
2,mask_04,16,16,16,16,0
3,immigration_2021-02,39,23,20,39,19
4,mask_08,91,32,20,91,71
5,mask_2021-02,53,26,20,53,33
6,immigration_12,52,26,20,52,32
7,vaccine_12,228,56,20,228,208
8,asianhate_12,399,86,20,399,379
9,ageism_12,537,110,20,537,517


In [None]:
# df_count['proportion_count'] = df_count['count'].apply(lambda x: round((x/total_count)*total_samples))
# df_count['proportion_count'] = round(total_samples / len(df_count.index))
# df_count.proportion_count[df_count.topic=='mask_08'] += 1
# df_count

In [6]:
df_count.to_csv(csv_folder_name + '/sample_count_no_politics.csv', index=False)
df_count['proportion_count'].sum()

2000

In [7]:
df_samples = pd.DataFrame(columns=['topic', 'hashtag', 'tweet_id', 'image_path', 'sample_path', 'body_text', 'image_text'])

for _, row in df_count.iterrows():
    topic_name = row['topic']
    df_topic = csv_dfs[topic_name].sample(n=row['proportion_count'], random_state=41)
    df_topic.insert(loc=0, column='topic', value=topic_name)
    df_topic.insert(loc=4, column='sample_path', value=np.nan)
    df_topic.insert(loc=len(df_topic.columns), column='image_text', value=np.nan)
    df_samples = df_samples.append(df_topic)

df_samples = df_samples.sample(frac=1, random_state=41).reset_index(drop=True)
df_samples.to_csv(csv_folder_name + '/sample_data_no_politics.csv', index=False)
df_samples

Unnamed: 0,topic,hashtag,tweet_id,image_path,sample_path,body_text,image_text,text_with_OCR
0,asianhate_08,CCPVirus,1293828811113390081,./tweet_data/asianhate_08/CCPVirus/tweets2/EfS...,,@thouse_opinions @CGTNOfficial @SecAzar You re...,,ood ttt ei individual ii ll pt ph ly
1,ageism_04,covidiots,1251633266861330432,./tweet_data/ageism_04/covidiots/tweets3/EV6x0...,,"@MattSmithKIRO7 @PinkyD124 Sorry, you don't ge...",,lea sada dow nol ad yy ee uncle vr ie iy vir
2,asianhate_08,ChinaVirus,1292632818334306305,./tweet_data/asianhate_08/ChinaVirus/tweets2/E...,,@Dbargen I’m not getting a #ChinaVirus chip-v...,,americans 1944 face certain death poy tes amer...
3,ageism_12,covidiots,1338782933251989505,./tweet_data/ageism_12/covidiots/tweets3/EpRP-...,,@ABC #ByeDon #ByeByeTrump #TrumpIsACompleteFai...,,trump puppet ae oe
4,mask_2020-02,maskoff,1231928057260277761,./tweet_data/mask_2020-02/maskoff/tweets4/ERiv...,,Tune In to space 90.1FM to join Mask Off with ...,,mask present man crush monday guest ceo signat...
...,...,...,...,...,...,...,...,...
1995,asianhate_12,BoycottChina,1339094335392923649,./tweet_data/asianhate_12/BoycottChina/tweets3...,,@RepWexton @senatemajldr The world please #Sta...,,xi heheh gon na destroy world hunt uyghur musl...
1996,ageism_2021-04,covidiots,1386741276750581761,./tweet_data/ageism_2021-04/covidiots/tweets4/...,,"Hey #England, you might want to admit you fcuk...",,wrong man wrong job wrong time
1997,asianhate_04,ChinaVirus,1247927837480419331,./tweet_data/asianhate_04/ChinaVirus/tweets2/E...,,@whoatack @realDonaldTrump So it wasn't #China...,,biden backs trump travel han atte call xenopho...
1998,ageism_08,covidiots,1298072694836125697,./tweet_data/ageism_08/covidiots/tweets4/EgOuP...,,Sound up! \n\nCOVID Status in these countries....,,12 month month month month


# Please only run cells below

## Copy selected samples and generate sample path

In [1]:
import os
from pathlib import Path
from shutil import copyfile
import pandas as pd
import numpy as np

In [3]:
csv_folder_name = 'csv_data_hate'
df_samples = pd.read_csv(csv_folder_name + '/sample_data_no_politics.csv')
annot_data_path = './annot_2000_no_politics'
Path(annot_data_path).mkdir(exist_ok=True)

In [4]:
def copy_image_set_path(img_path):
    image_name = img_path.split('/')[-1]
    dst_path = os.path.join(annot_data_path, image_name)
    copyfile(img_path, dst_path)
    return dst_path

In [5]:
df_samples['sample_path'] = df_samples['image_path'].apply(copy_image_set_path)
df_samples

Unnamed: 0,topic,hashtag,tweet_id,image_path,sample_path,body_text,image_text,text_with_OCR
0,asianhate_08,CCPVirus,1293828811113390081,./tweet_data/asianhate_08/CCPVirus/tweets2/EfS...,./annot_2000_no_politics/EfSackBUYAAiGXv.jpg,@thouse_opinions @CGTNOfficial @SecAzar You re...,,ood ttt ei individual ii ll pt ph ly
1,ageism_04,covidiots,1251633266861330432,./tweet_data/ageism_04/covidiots/tweets3/EV6x0...,./annot_2000_no_politics/EV6x0GLU4AA2Km2.jpg,"@MattSmithKIRO7 @PinkyD124 Sorry, you don't ge...",,lea sada dow nol ad yy ee uncle vr ie iy vir
2,asianhate_08,ChinaVirus,1292632818334306305,./tweet_data/asianhate_08/ChinaVirus/tweets2/E...,./annot_2000_no_politics/EfBar8xVAAAafL6.jpg,@Dbargen I’m not getting a #ChinaVirus chip-v...,,americans 1944 face certain death poy tes amer...
3,ageism_12,covidiots,1338782933251989505,./tweet_data/ageism_12/covidiots/tweets3/EpRP-...,./annot_2000_no_politics/EpRP-tCXMAUzZnn.jpg,@ABC #ByeDon #ByeByeTrump #TrumpIsACompleteFai...,,trump puppet ae oe
4,mask_2020-02,maskoff,1231928057260277761,./tweet_data/mask_2020-02/maskoff/tweets4/ERiv...,./annot_2000_no_politics/ERiv_fDW4AAVB0x.jpg,Tune In to space 90.1FM to join Mask Off with ...,,mask present man crush monday guest ceo signat...
...,...,...,...,...,...,...,...,...
1995,asianhate_12,BoycottChina,1339094335392923649,./tweet_data/asianhate_12/BoycottChina/tweets3...,./annot_2000_no_politics/EpVrM1uVoAMwWRa.jpg,@RepWexton @senatemajldr The world please #Sta...,,xi heheh gon na destroy world hunt uyghur musl...
1996,ageism_2021-04,covidiots,1386741276750581761,./tweet_data/ageism_2021-04/covidiots/tweets4/...,./annot_2000_no_politics/Ez6x142WEAI5tbs.jpg,"Hey #England, you might want to admit you fcuk...",,wrong man wrong job wrong time
1997,asianhate_04,ChinaVirus,1247927837480419331,./tweet_data/asianhate_04/ChinaVirus/tweets2/E...,./annot_2000_no_politics/EVGGcGlWsAk0ZMJ.jpg,@whoatack @realDonaldTrump So it wasn't #China...,,biden backs trump travel han atte call xenopho...
1998,ageism_08,covidiots,1298072694836125697,./tweet_data/ageism_08/covidiots/tweets4/EgOuP...,./annot_2000_no_politics/EgOuPBLWsAYxDiA.jpg,Sound up! \n\nCOVID Status in these countries....,,12 month month month month


In [6]:
df_samples.to_csv(csv_folder_name + '/sample_data_no_politics.csv', index=False)

### Old code below, don't run

In [3]:
def get_sample_path_and_copy(df, topic, annot_data_path):
    # if the sample_path already exists
    if not pd.isna(df['sample_path']):
        return df['sample_path']

    # if topic is not the selected topic
    if pd.isna(df['sample_path']) and df['topic'] != topic:
        return np.nan
    
    image_name = df['image_path'].split('/')[-1]
    dst_path = os.path.join(annot_data_path, image_name)
    copyfile(df['image_path'], dst_path)
    return dst_path

In [4]:
sample_folder = './annotation_data'
Path(sample_folder).mkdir(exist_ok=True)

for _, row in df_count.iterrows():
    topic_name = row['topic']
    topic_data_path = os.path.join('./', 'data_' + topic_name)
    annot_data_path = os.path.join(sample_folder, topic_name)
    # if data_{topic_name} folder doesn't exist
    if not Path(topic_data_path).is_dir():
        print('{} not exists; pass this topic'.format(topic_data_path))
        continue

    # if the sampled data for this topic already exists
    if Path(annot_data_path).is_dir():
        print('{} already exist; pass this topic'.format(annot_data_path))
        continue

    Path(annot_data_path).mkdir()

    df_samples['sample_path'] = df_samples.apply(get_sample_path_and_copy, axis=1, topic=topic_name, annot_data_path=annot_data_path)

    print('Topic \"{}\" processed'.format(topic_name))

Topic "Vaccine" processed
./data_AsianHate not exists; pass this topic
./data_Boomer not exists; pass this topic
./data_Political not exists; pass this topic
./data_Mask not exists; pass this topic
Topic "Immigration" processed


In [5]:
df_samples.to_csv('./csv_data/sample_data.csv', index=False)
df_samples

Unnamed: 0,topic,hashtag,tweet_id,image_path,sample_path,body_text,image_text,text_with_OCR
0,Immigration,refugees,1297606828461883394,./data_Immigration/refugees/tweets4/EgIGg6aXsA...,./annotation_data/Immigration/EgIGg6aXsAAX5-Y.jpg,Just one week left to get one of these hand em...,,oe
1,Mask,maskup,1297309098669510656,./data_Mask/maskup/tweets4/EgD3vYTUcAE_JMC.jpg,,🚧 #SFSO keeping our citizens updated on evacua...,,er ha an ey st ge at Bra wy yo ae ae ay at oes...
2,Boomer,caresact,1291464418715459589,./data_Boomer/caresact/tweets1/Eew0B6PWAAYZPCX...,,"Arguably, this one segment from the #CARESAct ...",,aS is he
3,Mask,NoMasks,1290561571312328706,./data_Mask/NoMasks/tweets1/Eej-523WsAAJa6h.jpg,,@RupertaMargate @FrancaisFarage @John35542712 ...,,Ser ah
4,Immigration,cdnpoli,1297693936782213123,./data_Immigration/cdnpoli/tweets4/EgJVvpkUMAA...,./annotation_data/Immigration/EgJVvpkUMAALB8Z.png,"You beat me to it, lol .. #cpc #cpcldr #cdnpol...",,You organize two car funeral procession iS
...,...,...,...,...,...,...,...,...
2395,Mask,NoMasks,1291460379638484994,./data_Mask/NoMasks/tweets1/EewwXv5WkAEb91v.jpg,,What's this...a new wave of virus???\n\n#KBF #...,,Let me introduce you to our next problem Seen ...
2396,Boomer,trumpliesamericansdie,1290437319611342851,./data_Boomer/trumpliesamericansdie/tweets1/Ee...,,@realDonaldTrump 1st-To be so proud of a large...,,reed la HI WA ha ant aa Trump to draw out at l...
2397,Boomer,caresact,1293904775055052801,./data_Boomer/caresact/tweets2/EfTfiMrWsAA35Y4...,,Ready to re-open? Use your #CARESAct Funding t...,,YOUR GO HERE Oe WEVE GOT vol
2398,Political,Trump2020,1291952240534728704,./data_Political/Trump2020/tweets2/Ee3vtmRX0AA...,,@HowDoYouMakeAU2 @Jenny06980536 @TerriSm600279...,,al LEAVE NEVER LEAVE
