In [12]:
# Name of the topic being processed; used to build the input folder and the
# output CSV file names in later cells.
topic_name = 'Vaccine'

# Entry folder for the data. It must have the form 'data_{topic_name}/', so it
# is derived from `topic_name` instead of being typed a second time (the two
# values can no longer drift apart when the topic is changed).
entry_folder = 'data_{}/'.format(topic_name)

In [3]:
import os
import glob
import pandas as pd
from tqdm import tqdm

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole
# session. NOTE(review): this hides the warning everywhere, not only in this
# notebook's own cells -- confirm it is still needed.
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
def get_image_path(image_url, csv_path):
    """Map an image URL to the local path where the image is expected.

    The local file is looked up in a folder named after the CSV file (the CSV
    path with its extension removed), under the image's own file name (the
    last '/'-separated component of the URL).

    Args:
        image_url: URL (or any '/'-separated string) ending in the image
            file name.
        csv_path: path of the CSV file the URL was read from.

    Returns:
        './<csv_path without extension>/<image file name>' as a string.
    """
    image_name = image_url.rsplit('/', 1)[-1]
    # splitext strips whatever extension is present instead of assuming the
    # path always ends in a 4-character '.csv' suffix.
    csv_stem = os.path.splitext(csv_path)[0]
    # Keep explicit '/' separators so the result matches the stored paths
    # byte-for-byte on every platform.
    return './' + csv_stem + '/' + image_name

In [4]:
# Build one frame with every tweet found under `entry_folder`.
# Layout read by the code below: <entry_folder>/<hashtag>/*.csv, where each CSV
# provides at least the columns 'tweet_id', 'media' and 'text_data'.
output_columns = ['hashtag', 'tweet_id', 'image_path', 'body_text']

# Per-CSV frames are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and copying the growing frame on
# every iteration is quadratic.
frames = []

entries = os.listdir(entry_folder)
for hashtag in entries:
    # skip hidden entries (names starting with '.')
    if hashtag.startswith('.'):
        continue
    list_files = glob.glob(entry_folder + hashtag + '/*.csv')
    tweets_count = 0
    for csv_path in list_files:
        temp_df = pd.read_csv(csv_path)
        tweets_count += len(temp_df.index)
        if temp_df.empty:
            # nothing to add; also keeps empty frames out of pd.concat
            continue
        temp_df = temp_df.rename(columns={'text_data': 'body_text', 'media': 'image_path'})
        temp_df['hashtag'] = hashtag
        temp_df['image_path'] = temp_df['image_path'].apply(get_image_path, csv_path=csv_path)
        temp_df['tweet_id'] = temp_df['tweet_id'].astype('int64')
        # Select the output columns by name so the result does not depend on
        # the column order inside the CSV; any other column (e.g. 'tweet_url')
        # is left out.
        frames.append(temp_df[output_columns])
    print('{:>24s}: {:>5}'.format(hashtag, tweets_count))

if frames:
    # Row labels of the individual CSVs are kept here; a later cell resets the index.
    df_csv = pd.concat(frames)
else:
    # No tweets at all: fall back to an empty, correctly typed frame.
    df_csv = pd.DataFrame(columns=output_columns)
    df_csv['tweet_id'] = df_csv['tweet_id'].astype('int64')
print('Topic: {} - Total tweets: {:>5}'.format(topic_name, len(df_csv.index)))

       vaccinesSaveLives:   147
                Convid19:    71
          VaccineFreedom:    32
      CoronavirusVaccine:  2721
   BillGatesBioTerrorist:   325
            covidvaccine:  1440
           FauciTheFraud:   493
         billgatesofhell:    15
          COVID19Vaccine:   810
           CovidVaxFacts:     0
           GetVaccinated:    81
             antivaxxers:    61
           scamdemic2020:   348
               CovidHoax:  1517
            depopulation:   111
               FireFauci:   282
     FauciLiedPeopleDied:   126
 covidvaccinesideeffects:     0
            VaccinesWork:  1016
            WakeUpWiseUp:     0
Topic: Vaccine - Total tweets:  9596


In [5]:
# keep only the tweets whose image file actually exists on disk
df_csv = df_csv[df_csv['image_path'].apply(os.path.isfile).astype(bool)]

In [6]:
# renumber the rows 0..n-1, discarding the old index, then show the frame
df_csv = df_csv.reset_index(drop=True)
df_csv

Unnamed: 0,hashtag,tweet_id,image_path,body_text
0,vaccinesSaveLives,1300191578649235712,./data_Vaccine/vaccinesSaveLives/tweets5/Egs1W...,@CDCgov #vaccine patents – Robert F Kennedy Jr...
1,vaccinesSaveLives,1300168923221032960,./data_Vaccine/vaccinesSaveLives/tweets5/Egsgv...,#CDC #vaccine patents – Robert F Kennedy Jr ge...
2,vaccinesSaveLives,1300184057733615616,./data_Vaccine/vaccinesSaveLives/tweets5/Egsug...,@CDCgov #vaccine patents – Robert F Kennedy Jr...
3,vaccinesSaveLives,1299603371947483136,./data_Vaccine/vaccinesSaveLives/tweets5/EgkeT...,#vhspl #Coolgix #BioMeds #blockchaincommunity ...
4,vaccinesSaveLives,1299937036640501760,./data_Vaccine/vaccinesSaveLives/tweets5/EgpN0...,#vhspl #Coolgix #BioMeds #blockchaincommunity ...
...,...,...,...,...
9590,VaccinesWork,1292145946197929984,./data_Vaccine/VaccinesWork/tweets2/Ee6f4_1WoA...,Still trust Leigh-Allyn Baker? Do some researc...
9591,VaccinesWork,1292161343336910848,./data_Vaccine/VaccinesWork/tweets2/Ee6trl1WoA...,#JoJorgensen2020 #LetHerSpeak #VaccinesWork ht...
9592,VaccinesWork,1292185263263879168,./data_Vaccine/VaccinesWork/tweets2/Ee7DpOTWoA...,@alimkakeng @DukeHealth @dukeemergency @snaggi...
9593,VaccinesWork,1292300869132857088,./data_Vaccine/VaccinesWork/tweets2/Ee8syjEWkA...,#VaccinesWork this made me lol https://t.co/r...


### Filter out images without text

In [7]:
from opencv_text_detector import TextDetector

# Register `progress_apply` on pandas objects so the slow per-image pass
# below reports its progress.
tqdm.pandas()

detector = TextDetector()

# Run the detector once per image path and store its result in 'has_text'
# (the filtering cell further down keeps the rows where this equals 1).
df_csv['has_text'] = df_csv['image_path'].progress_apply(detector.detect_text)

df_csv

rY0UwAA8z8N.jpg
1 - ./data_Vaccine/VaccinesWork/tweets3/Efd8OAbWAAA4pFO.jpg
 98%|█████████▊| 9362/9595 [16:45<00:25,  9.18it/s]1 - ./data_Vaccine/VaccinesWork/tweets3/EfeiNJYXkAI9eE2.jpg
1 - ./data_Vaccine/VaccinesWork/tweets3/Efh1xtbWkAArJCZ.jpg
 98%|█████████▊| 9364/9595 [16:46<00:25,  8.92it/s]1 - ./data_Vaccine/VaccinesWork/tweets3/EfdkdrlXkAAtyhq.jpg
1 - ./data_Vaccine/VaccinesWork/tweets3/EfdQWzjUcAEWjW6.jpg
 98%|█████████▊| 9366/9595 [16:46<00:24,  9.18it/s]1 - ./data_Vaccine/VaccinesWork/tweets3/Efa4g26X0AYMu6H.jpg
1 - ./data_Vaccine/VaccinesWork/tweets3/EfgLl0vWsAcik3V.jpg
 98%|█████████▊| 9368/9595 [16:46<00:25,  9.02it/s]1 - ./data_Vaccine/VaccinesWork/tweets3/EfbItCqUwAA3e4z.jpg
1 - ./data_Vaccine/VaccinesWork/tweets3/EfUjwBRVoAYNaR2.jpg
 98%|█████████▊| 9370/9595 [16:46<00:24,  9.01it/s]1 - ./data_Vaccine/VaccinesWork/tweets3/EfggG9GUMAI3VcG.png
1 - ./data_Vaccine/VaccinesWork/tweets3/EfcWGXkXgAA2KPg.jpg
 98%|█████████▊| 9372/9595 [16:46<00:24,  9.02it/s]1 - ./data_Vaccine

Unnamed: 0,hashtag,tweet_id,image_path,body_text,has_text
0,vaccinesSaveLives,1300191578649235712,./data_Vaccine/vaccinesSaveLives/tweets5/Egs1W...,@CDCgov #vaccine patents – Robert F Kennedy Jr...,1
1,vaccinesSaveLives,1300168923221032960,./data_Vaccine/vaccinesSaveLives/tweets5/Egsgv...,#CDC #vaccine patents – Robert F Kennedy Jr ge...,0
2,vaccinesSaveLives,1300184057733615616,./data_Vaccine/vaccinesSaveLives/tweets5/Egsug...,@CDCgov #vaccine patents – Robert F Kennedy Jr...,1
3,vaccinesSaveLives,1299603371947483136,./data_Vaccine/vaccinesSaveLives/tweets5/EgkeT...,#vhspl #Coolgix #BioMeds #blockchaincommunity ...,1
4,vaccinesSaveLives,1299937036640501760,./data_Vaccine/vaccinesSaveLives/tweets5/EgpN0...,#vhspl #Coolgix #BioMeds #blockchaincommunity ...,1
...,...,...,...,...,...
9590,VaccinesWork,1292145946197929984,./data_Vaccine/VaccinesWork/tweets2/Ee6f4_1WoA...,Still trust Leigh-Allyn Baker? Do some researc...,1
9591,VaccinesWork,1292161343336910848,./data_Vaccine/VaccinesWork/tweets2/Ee6trl1WoA...,#JoJorgensen2020 #LetHerSpeak #VaccinesWork ht...,1
9592,VaccinesWork,1292185263263879168,./data_Vaccine/VaccinesWork/tweets2/Ee7DpOTWoA...,@alimkakeng @DukeHealth @dukeemergency @snaggi...,0
9593,VaccinesWork,1292300869132857088,./data_Vaccine/VaccinesWork/tweets2/Ee8syjEWkA...,#VaccinesWork this made me lol https://t.co/r...,1


In [8]:
# Save the frame (including the 'has_text' column) so the filtering step can
# reload it from disk.
result_path = './csv_data/data_' + topic_name + '.csv'
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
df_csv.to_csv(result_path)

In [15]:
# Reload the saved frame and keep only the tweets whose image contains text.
result_path = './csv_data/data_{}.csv'.format(topic_name)
df_csv = pd.read_csv(result_path, index_col=0)

# select the flagged rows, drop the helper column, renumber rows from 0
df_csv_cleaned = (
    df_csv[df_csv['has_text'] == 1]
    .drop(columns=['has_text'])
    .reset_index(drop=True)
)

result_cleaned_path = './csv_data/data_{}_cleaned.csv'.format(topic_name)
df_csv_cleaned.to_csv(result_cleaned_path)

In [16]:
# show the cleaned frame as this cell's output
df_csv_cleaned

Unnamed: 0,hashtag,tweet_id,image_path,body_text
0,vaccinesSaveLives,1300191578649235712,./data_Vaccine/vaccinesSaveLives/tweets5/Egs1W...,@CDCgov #vaccine patents – Robert F Kennedy Jr...
1,vaccinesSaveLives,1300184057733615616,./data_Vaccine/vaccinesSaveLives/tweets5/Egsug...,@CDCgov #vaccine patents – Robert F Kennedy Jr...
2,vaccinesSaveLives,1299603371947483136,./data_Vaccine/vaccinesSaveLives/tweets5/EgkeT...,#vhspl #Coolgix #BioMeds #blockchaincommunity ...
3,vaccinesSaveLives,1299937036640501760,./data_Vaccine/vaccinesSaveLives/tweets5/EgpN0...,#vhspl #Coolgix #BioMeds #blockchaincommunity ...
4,vaccinesSaveLives,1291285670737084416,./data_Vaccine/vaccinesSaveLives/tweets1/EeuRd...,"In love 😍\nThank you, @DrJennersHouse ❤️\n\n#E..."
...,...,...,...,...
8124,VaccinesWork,1292070329066586112,./data_Vaccine/VaccinesWork/tweets2/Ee5bGryXkA...,Voor t eerst in m'n leven iemand geblocked. Da...
8125,VaccinesWork,1292162361374019584,./data_Vaccine/VaccinesWork/tweets2/Ee6tOA0U8A...,"First,\n\nPlease continue COVID precautions - ..."
8126,VaccinesWork,1292145946197929984,./data_Vaccine/VaccinesWork/tweets2/Ee6f4_1WoA...,Still trust Leigh-Allyn Baker? Do some researc...
8127,VaccinesWork,1292161343336910848,./data_Vaccine/VaccinesWork/tweets2/Ee6trl1WoA...,#JoJorgensen2020 #LetHerSpeak #VaccinesWork ht...


### Count the tweets in different hashtags after cleaning

In [13]:
# read the cleaned CSV back from disk; its first column is the row index
result_cleaned_path = './csv_data/data_{}_cleaned.csv'.format(topic_name)
df_csv_cleaned = pd.read_csv(result_cleaned_path, index_col=0)

In [14]:
# number of remaining tweets per hashtag, most frequent first
df_csv_cleaned.hashtag.value_counts()

CoronavirusVaccine       2172
CovidHoax                1374
covidvaccine             1209
VaccinesWork              843
COVID19Vaccine            645
FauciTheFraud             437
scamdemic2020             312
BillGatesBioTerrorist     301
FireFauci                 262
vaccinesSaveLives         124
FauciLiedPeopleDied       121
depopulation              102
GetVaccinated              74
Convid19                   61
antivaxxers                55
VaccineFreedom             24
billgatesofhell            13
Name: hashtag, dtype: int64