In [1]:
# Name of the topic to process.
topic_name = 'Vaccine'

# Entry folder for the data, always of the form 'data_{topic_name}/'.
# Derived from topic_name so the two settings cannot drift apart when the
# topic is changed.
entry_folder = 'data_{}/'.format(topic_name)

In [2]:
import os
import glob
import pandas as pd
from tqdm import tqdm

# Disable pandas' chained-assignment check for the whole notebook
# (setting the option to None turns it off; the pandas default is 'warn').
pd.options.mode.chained_assignment = None

In [3]:
def get_image_path(image_url, csv_path):
    """Map a tweet's media URL to the expected local path of its image.

    The image is looked up in a folder that sits next to the CSV file and
    shares its name without the extension, e.g. a row read from
    ``a/tweets1.csv`` maps to ``./a/tweets1/<image name>``.

    Args:
        image_url: Media URL; the part after the last '/' is used as the
            image file name.
        csv_path: Path of the CSV file the row was read from.

    Returns:
        The local image path as a string, or '' when ``image_url`` is not a
        string (e.g. NaN produced by pandas for an empty CSV cell).
    """
    # Non-string values cannot be split; return a path that can never exist
    # instead of raising AttributeError in the middle of a DataFrame.apply.
    if not isinstance(image_url, str):
        return ''
    image_name = image_url.split('/')[-1]
    # Strip the extension properly rather than assuming it is 4 characters.
    image_folder = os.path.splitext(csv_path)[0]
    return './' + image_folder + '/' + image_name

In [4]:
# Build one DataFrame with every tweet found under entry_folder.
# Layout read by this cell: entry_folder/<hashtag>/*.csv, where each CSV
# provides (at least) the columns 'tweet_id', 'media' and 'text_data'.

# Columns of the combined frame, in output order. Selecting by name keeps the
# result independent of the column order inside the individual CSV files.
output_columns = ['hashtag', 'tweet_id', 'image_path', 'body_text']

# Per-file frames are collected here and concatenated once at the end;
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration.
frames = []

entries = os.listdir(entry_folder)
for hashtag in entries:
    # Skip hidden entries such as '.DS_Store'.
    if hashtag.startswith('.'):
        continue
    list_files = glob.glob(entry_folder + hashtag + '/*.csv')
    tweets_count = 0
    for csv_path in list_files:
        temp_df = pd.read_csv(csv_path)
        tweets_count += len(temp_df.index)
        if temp_df.empty:
            # Nothing to add; also avoids concatenating all-empty frames.
            continue
        temp_df = temp_df.rename(columns={'text_data': 'body_text', 'media': 'image_path'})
        # .assign returns a new frame, so no value is written into a slice.
        temp_df = temp_df.assign(
            hashtag=hashtag,
            image_path=temp_df['image_path'].apply(get_image_path, csv_path=csv_path),
            tweet_id=temp_df['tweet_id'].astype('int64'),
        )
        frames.append(temp_df[output_columns])
    print('{:>24s}: {:>5}'.format(hashtag, tweets_count))

if frames:
    df_csv = pd.concat(frames, ignore_index=True)
else:
    # No data at all: keep an empty frame with the expected columns and dtype.
    df_csv = pd.DataFrame(columns=output_columns).astype({'tweet_id': 'int64'})

print('Topic: {} - Total tweets: {:>5}'.format(topic_name, len(df_csv.index)))

   BillGatesBioTerrorist:   160
 covidvaccinesideeffects:     0
       vaccinesSaveLives:   133
                Convid19:    38
            covidvaccine:  1033
      CoronavirusVaccine:  1793
     FauciLiedPeopleDied:    80
             antivaxxers:    51
             akeUpWiseUp:     0
            VaccinesWork:   806
               CovidHoax:   844
           scamdemic2020:   206
            depopulation:    75
           GetVaccinated:    82
               FireFauci:   203
           CovidVaxFacts:     0
          COVID19Vaccine:   597
           FauciTheFraud:   303
         billgatesofhell:     8
          VaccineFreedom:    17
Topic: Vaccine - Total tweets:  6429


In [5]:
# Keep only the tweets whose image file actually exists on disk.
image_exists = df_csv['image_path'].apply(os.path.isfile).astype(bool)
df_csv = df_csv[image_exists]

In [6]:
# Renumber the rows 0..n-1 (the old index is discarded) and show the frame.
df_csv = df_csv.reset_index(drop=True)
df_csv

Unnamed: 0,hashtag,tweet_id,image_path,body_text
0,BillGatesBioTerrorist,1290588067615649794,./data_Vaccine/BillGatesBioTerrorist/tweets1/E...,Oh Billy boy.. #BillGates #BillGatesIsNotADoct...
1,BillGatesBioTerrorist,1290517935786807297,./data_Vaccine/BillGatesBioTerrorist/tweets1/E...,#Hydroxychloroquine GOOD ..... #BillGates BAD!...
2,BillGatesBioTerrorist,1290606587581800454,./data_Vaccine/BillGatesBioTerrorist/tweets1/E...,Kill Bill has locked his comments aaaaahahahah...
3,BillGatesBioTerrorist,1290251387201138688,./data_Vaccine/BillGatesBioTerrorist/tweets1/E...,People is Berlin wokeup #plandemic #coronaviru...
4,BillGatesBioTerrorist,1290097574393741313,./data_Vaccine/BillGatesBioTerrorist/tweets1/E...,"DON'T LIKE WEARING A MASK? GOGGLES, FACE SHIEL..."
...,...,...,...,...
6423,VaccineFreedom,1298757369103609856,./data_Vaccine/VaccineFreedom/tweets4/EgYc8d2X...,Vaccines Rely on Horseshoe Crab’s Blue Blood f...
6424,VaccineFreedom,1300176748223569922,./data_Vaccine/VaccineFreedom/tweets5/Egsn3M1W...,"Pharmacists, Pharmacy Interns Given Green Ligh..."
6425,VaccineFreedom,1299904802902736902,./data_Vaccine/VaccineFreedom/tweets5/Egowh4PX...,Flu Vaccination Associated With Increased Vira...
6426,VaccineFreedom,1299874675951104001,./data_Vaccine/VaccineFreedom/tweets5/EgoVISoX...,"Pharmacists, Pharmacy Interns Given Green Ligh..."


### Filter out images without text

In [7]:
from opencv_text_detector import TextDetector

# Enable `progress_apply` on pandas objects so the pass below shows a progress bar.
tqdm.pandas()
detector = TextDetector()

# Run the detector on every image path and store its result in a new
# 'has_text' column (later cells keep the rows where this value equals 1).
df_csv = df_csv.assign(
    has_text=df_csv['image_path'].progress_apply(detector.detect_text)
)

df_csv

  0%|          | 2/6428 [00:00<29:37,  3.62it/s]1 - ./data_Vaccine/BillGatesBioTerrorist/tweets1/EekXAVSXkAEz5uv.jpg
  0%|          | 3/6428 [00:00<28:07,  3.81it/s]1 - ./data_Vaccine/BillGatesBioTerrorist/tweets1/EejWZ10WoAEtdlx.jpg
  0%|          | 5/6428 [00:01<26:54,  3.98it/s]1 - ./data_Vaccine/BillGatesBioTerrorist/tweets1/EeknvbuWsAM3R4h.jpg
0 - ./data_Vaccine/BillGatesBioTerrorist/tweets1/Eefkyf4X0AA_toe.jpg
  0%|          | 6/6428 [00:01<27:52,  3.84it/s]1 - ./data_Vaccine/BillGatesBioTerrorist/tweets1/EedYpoGWAAU1dmE.jpg
  0%|          | 7/6428 [00:01<26:09,  4.09it/s]1 - ./data_Vaccine/BillGatesBioTerrorist/tweets1/EegR2RlXoAY1aXA.jpg
  0%|          | 8/6428 [00:02<24:55,  4.29it/s]1 - ./data_Vaccine/BillGatesBioTerrorist/tweets1/EegLZ5kWsAAkKej.jpg
  0%|          | 10/6428 [00:02<23:21,  4.58it/s]0 - ./data_Vaccine/BillGatesBioTerrorist/tweets1/Eed7kkGXsAAF8-9.jpg
0 - ./data_Vaccine/BillGatesBioTerrorist/tweets1/Eeu4Q-hXsAAL2Gl.jpg
  0%|          | 11/6428 [00:02<23:49,  4.

KeyboardInterrupt: 

In [8]:
# Save the combined frame (index included) to ./csv_data/data_<topic>.csv.
result_path = './csv_data/data_' + topic_name + '.csv'
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
df_csv.to_csv(result_path)

In [15]:
# Reload the saved frame and keep only the rows flagged with has_text == 1.
result_path = './csv_data/data_' + topic_name + '.csv'
df_csv = pd.read_csv(result_path, index_col=0)

# Filter, drop the helper flag column and renumber the rows in one chain.
df_csv_cleaned = (
    df_csv[df_csv['has_text'].eq(1)]
    .drop(columns=['has_text'])
    .reset_index(drop=True)
)

# Write the cleaned frame next to the unfiltered one.
result_cleaned_path = './csv_data/data_' + topic_name + '_cleaned.csv'
df_csv_cleaned.to_csv(result_cleaned_path)

In [16]:
# Display the cleaned frame.
df_csv_cleaned

Unnamed: 0,hashtag,tweet_id,image_path,body_text
0,vaccinesSaveLives,1300191578649235712,./data_Vaccine/vaccinesSaveLives/tweets5/Egs1W...,@CDCgov #vaccine patents – Robert F Kennedy Jr...
1,vaccinesSaveLives,1300184057733615616,./data_Vaccine/vaccinesSaveLives/tweets5/Egsug...,@CDCgov #vaccine patents – Robert F Kennedy Jr...
2,vaccinesSaveLives,1299603371947483136,./data_Vaccine/vaccinesSaveLives/tweets5/EgkeT...,#vhspl #Coolgix #BioMeds #blockchaincommunity ...
3,vaccinesSaveLives,1299937036640501760,./data_Vaccine/vaccinesSaveLives/tweets5/EgpN0...,#vhspl #Coolgix #BioMeds #blockchaincommunity ...
4,vaccinesSaveLives,1291285670737084416,./data_Vaccine/vaccinesSaveLives/tweets1/EeuRd...,"In love 😍\nThank you, @DrJennersHouse ❤️\n\n#E..."
...,...,...,...,...
8124,VaccinesWork,1292070329066586112,./data_Vaccine/VaccinesWork/tweets2/Ee5bGryXkA...,Voor t eerst in m'n leven iemand geblocked. Da...
8125,VaccinesWork,1292162361374019584,./data_Vaccine/VaccinesWork/tweets2/Ee6tOA0U8A...,"First,\n\nPlease continue COVID precautions - ..."
8126,VaccinesWork,1292145946197929984,./data_Vaccine/VaccinesWork/tweets2/Ee6f4_1WoA...,Still trust Leigh-Allyn Baker? Do some researc...
8127,VaccinesWork,1292161343336910848,./data_Vaccine/VaccinesWork/tweets2/Ee6trl1WoA...,#JoJorgensen2020 #LetHerSpeak #VaccinesWork ht...


### Count the tweets in different hashtags after cleaning

In [3]:
# Reload the cleaned CSV; its first column holds the saved row index.
result_cleaned_path = './csv_data/data_{}_cleaned.csv'.format(topic_name)
df_csv_cleaned = pd.read_csv(result_cleaned_path, index_col=0)

In [14]:
# Number of remaining tweets per hashtag, most frequent first.
df_csv_cleaned.hashtag.value_counts()

CoronavirusVaccine       2172
CovidHoax                1374
covidvaccine             1209
VaccinesWork              843
COVID19Vaccine            645
FauciTheFraud             437
scamdemic2020             312
BillGatesBioTerrorist     301
FireFauci                 262
vaccinesSaveLives         124
FauciLiedPeopleDied       121
depopulation              102
GetVaccinated              74
Convid19                   61
antivaxxers                55
VaccineFreedom             24
billgatesofhell            13
Name: hashtag, dtype: int64