In [1]:
# Topic to process; used to build the entry folder and the result CSV paths.
topic_name = 'Immigration'

# Entry folder for the data, always of the form 'data_{topic_name}/'.
# Derived from topic_name so the two cannot drift apart when the topic changes.
entry_folder = 'data_{}/'.format(topic_name)

In [2]:
import os
import glob
import pandas as pd
from tqdm import tqdm

# Silence pandas' chained-assignment warning (the option's default is 'warn').
pd.set_option('mode.chained_assignment', None)

In [3]:
def get_image_path(image_url, csv_path):
    """Build the local path of an image from its URL and its source CSV path.

    The file name is the part of ``image_url`` after the last '/'. The folder
    is ``csv_path`` with its last four characters removed (the '.csv'
    extension for the paths globbed in this notebook).

    Args:
        image_url: URL (or any '/'-separated string) of the image.
        csv_path: Path of the CSV file the row came from.

    Returns:
        A string of the form './<csv_path without last 4 chars>/<file name>'.
    """
    file_name = image_url.rsplit('/', 1)[-1]
    folder = csv_path[:-4]
    return './{}/{}'.format(folder, file_name)

In [4]:
# Gather every per-hashtag CSV under `entry_folder` into one frame whose
# leading columns are: hashtag, tweet_id, image_path, body_text.
result_columns = ['hashtag', 'tweet_id', 'image_path', 'body_text']

# Per-CSV frames are collected here and concatenated once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all accumulated rows on every iteration.
frames = []

entries = os.listdir(entry_folder)
for hashtag in entries:
    if hashtag.startswith('.'):
        continue  # skip dot-prefixed entries
    list_files = glob.glob(entry_folder + hashtag + '/*.csv')
    tweets_count = 0
    for csv_path in list_files:
        temp_df = pd.read_csv(csv_path)
        tweets_count += len(temp_df.index)
        temp_df = (
            temp_df
            .drop(columns=['tweet_url'])
            .rename(columns={'text_data': 'body_text', 'media': 'image_path'})
        )
        temp_df.insert(loc=0, column='hashtag', value=hashtag)
        # Order columns by name instead of by position, so the layout does not
        # depend on `tweet_id` being the last column of the CSV. Any extra
        # columns are kept after the four expected ones.
        cols = result_columns + [c for c in temp_df.columns if c not in result_columns]
        temp_df = temp_df[cols].assign(
            # Turn the media URL into the local path of the downloaded image.
            image_path=lambda d: d['image_path'].apply(get_image_path, csv_path=csv_path),
            tweet_id=lambda d: d['tweet_id'].astype('int64'),
        )
        frames.append(temp_df)
    print('{:>24s}: {:>5}'.format(hashtag, tweets_count))

if frames:
    # Row indices are left as read from each CSV (not renumbered here).
    df_csv = pd.concat(frames)
else:
    # No CSV found: fall back to an empty frame with the expected schema.
    df_csv = pd.DataFrame(columns=result_columns)
    df_csv['tweet_id'] = df_csv['tweet_id'].astype('int64')
print('Topic: {} - Total tweets: {:>5}'.format(topic_name, len(df_csv.index)))

               bansharia:     2
               MuslimBan:    42
             immigration:  2811
                liberals:   716
                  deport:    17
             NoBanNoWall:     3
            americafirst:  1978
         Antiimmigration:    11
      refuggesnotwelcome:     0
                refugees:  1679
           illegalaliens:    61
            buildthewall:  1266
                banislam:    13
         stoptheinvasion:    18
        multiculturalism:    75
           deportthemall:     9
                 cdnpoli:  9986
Topic: Immigration - Total tweets: 18687


In [5]:
# Keep only the tweets whose image_path points to an existing file.
path_exists = df_csv['image_path'].apply(os.path.isfile).astype(bool)
df_csv = df_csv[path_exists]

In [6]:
# Renumber the rows 0..n-1, discarding the old index, then show the frame.
df_csv = df_csv.reset_index(drop=True)
df_csv

Unnamed: 0,hashtag,tweet_id,image_path,body_text
0,vaccinesSaveLives,1300191578649235458,./data_Vaccine/vaccinesSaveLives/tweets5/Egs1W...,@CDCgov #vaccine patents – Robert F Kennedy Jr...
1,vaccinesSaveLives,1300168923221032963,./data_Vaccine/vaccinesSaveLives/tweets5/Egsgv...,#CDC #vaccine patents – Robert F Kennedy Jr ge...
2,vaccinesSaveLives,1300184057733615617,./data_Vaccine/vaccinesSaveLives/tweets5/Egsug...,@CDCgov #vaccine patents – Robert F Kennedy Jr...
3,vaccinesSaveLives,1299603371947483137,./data_Vaccine/vaccinesSaveLives/tweets5/EgkeT...,#vhspl #Coolgix #BioMeds #blockchaincommunity ...
4,vaccinesSaveLives,1299937036640501761,./data_Vaccine/vaccinesSaveLives/tweets5/EgpN0...,#vhspl #Coolgix #BioMeds #blockchaincommunity ...
...,...,...,...,...
6423,VaccinesWork,1292132465633632512,./data_Vaccine/VaccinesWork/tweets2/Ee6ToLoXkA...,Fewer babies and children in London are attend...
6424,VaccinesWork,1292097472270085888,./data_Vaccine/VaccinesWork/tweets2/Ee5zzTrWAA...,Keeping up to date with #vaccinations remains ...
6425,VaccinesWork,1291977701339869184,./data_Vaccine/VaccinesWork/tweets2/Ee4G32eWsA...,What is the difference between #meningitis and...
6426,VaccinesWork,1292115859826188288,./data_Vaccine/VaccinesWork/tweets2/Ee6EhwcXgA...,"#Philippines: 102,000 kids vaccinated vs polio..."


### Filter out images without text

In [7]:
from opencv_text_detector import TextDetector

# Register `progress_apply` on pandas objects so the scan shows a progress bar.
tqdm.pandas()
detector = TextDetector()

# Run the detector once per image path and store its result as `has_text`
# (a later cell keeps the rows where this value equals 1).
text_flags = df_csv['image_path'].progress_apply(detector.detect_text)
df_csv = df_csv.assign(has_text=text_flags)

df_csv

KDKXgAAB73S.jpg
1 - ./data_Vaccine/VaccinesWork/tweets3/EfsKBvvXkAA0iyR.jpg
 96%|█████████▋| 6194/6428 [10:47<00:26,  8.90it/s]1 - ./data_Vaccine/VaccinesWork/tweets3/Efs5hdjXoAMWzXW.jpg
1 - ./data_Vaccine/VaccinesWork/tweets3/EfuTHtHXYAMhpOS.jpg
 96%|█████████▋| 6198/6428 [10:48<00:25,  8.94it/s]0 - ./data_Vaccine/VaccinesWork/tweets3/Efpdy1YXYAAChnl.jpg
1 - ./data_Vaccine/VaccinesWork/tweets3/EfhH5NrWoAAP7FX.jpg
1 - ./data_Vaccine/VaccinesWork/tweets3/EfoMLrnWkAMJ4xt.png
 96%|█████████▋| 6200/6428 [10:48<00:25,  8.96it/s]1 - ./data_Vaccine/VaccinesWork/tweets3/EfimC2KX0AAGizb.png
1 - ./data_Vaccine/VaccinesWork/tweets3/EfiOxnrXoAIlER7.jpg
 96%|█████████▋| 6202/6428 [10:48<00:26,  8.61it/s]1 - ./data_Vaccine/VaccinesWork/tweets3/EfjTHd8XkAMvd9r.jpg
1 - ./data_Vaccine/VaccinesWork/tweets3/EfhYEKHX0AAHwJH.jpg
 96%|█████████▋| 6203/6428 [10:48<00:25,  8.73it/s]1 - ./data_Vaccine/VaccinesWork/tweets3/EfoxFL4WoAAuzAH.jpg
1 - ./data_Vaccine/VaccinesWork/tweets3/EfonbtJWsAAfuso.jpg
 97%|████

Unnamed: 0,hashtag,tweet_id,image_path,body_text,has_text
0,vaccinesSaveLives,1300191578649235458,./data_Vaccine/vaccinesSaveLives/tweets5/Egs1W...,@CDCgov #vaccine patents – Robert F Kennedy Jr...,1
1,vaccinesSaveLives,1300168923221032963,./data_Vaccine/vaccinesSaveLives/tweets5/Egsgv...,#CDC #vaccine patents – Robert F Kennedy Jr ge...,0
2,vaccinesSaveLives,1300184057733615617,./data_Vaccine/vaccinesSaveLives/tweets5/Egsug...,@CDCgov #vaccine patents – Robert F Kennedy Jr...,1
3,vaccinesSaveLives,1299603371947483137,./data_Vaccine/vaccinesSaveLives/tweets5/EgkeT...,#vhspl #Coolgix #BioMeds #blockchaincommunity ...,1
4,vaccinesSaveLives,1299937036640501761,./data_Vaccine/vaccinesSaveLives/tweets5/EgpN0...,#vhspl #Coolgix #BioMeds #blockchaincommunity ...,1
...,...,...,...,...,...
6423,VaccinesWork,1292132465633632512,./data_Vaccine/VaccinesWork/tweets2/Ee6ToLoXkA...,Fewer babies and children in London are attend...,1
6424,VaccinesWork,1292097472270085888,./data_Vaccine/VaccinesWork/tweets2/Ee5zzTrWAA...,Keeping up to date with #vaccinations remains ...,1
6425,VaccinesWork,1291977701339869184,./data_Vaccine/VaccinesWork/tweets2/Ee4G32eWsA...,What is the difference between #meningitis and...,0
6426,VaccinesWork,1292115859826188288,./data_Vaccine/VaccinesWork/tweets2/Ee6EhwcXgA...,"#Philippines: 102,000 kids vaccinated vs polio...",1


In [8]:
# Persist the frame (including `has_text`) so later cells can reload it from disk.
result_path = './csv_data/data_' + topic_name + '.csv'
# to_csv does not create missing directories; make sure the output folder
# exists so the results computed above are not lost to a failed save.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
df_csv.to_csv(result_path)

In [9]:
# Reload the saved frame, keep only rows flagged has_text == 1, and write the
# result (without the flag column, rows renumbered from 0) to the cleaned CSV.
result_path = './csv_data/data_' + topic_name + '.csv'
df_csv = pd.read_csv(result_path, index_col=0)

df_csv_cleaned = (
    df_csv[df_csv['has_text'] == 1]
    .drop(columns=['has_text'])
    .reset_index(drop=True)
)

result_cleaned_path = './csv_data/data_' + topic_name + '_cleaned.csv'
df_csv_cleaned.to_csv(result_cleaned_path)

In [10]:
# Show the cleaned frame as the cell output.
df_csv_cleaned

Unnamed: 0,hashtag,tweet_id,image_path,body_text
0,vaccinesSaveLives,1300191578649235458,./data_Vaccine/vaccinesSaveLives/tweets5/Egs1W...,@CDCgov #vaccine patents – Robert F Kennedy Jr...
1,vaccinesSaveLives,1300184057733615617,./data_Vaccine/vaccinesSaveLives/tweets5/Egsug...,@CDCgov #vaccine patents – Robert F Kennedy Jr...
2,vaccinesSaveLives,1299603371947483137,./data_Vaccine/vaccinesSaveLives/tweets5/EgkeT...,#vhspl #Coolgix #BioMeds #blockchaincommunity ...
3,vaccinesSaveLives,1299937036640501761,./data_Vaccine/vaccinesSaveLives/tweets5/EgpN0...,#vhspl #Coolgix #BioMeds #blockchaincommunity ...
4,vaccinesSaveLives,1291285670737084416,./data_Vaccine/vaccinesSaveLives/tweets1/EeuRd...,"In love 😍\nThank you, @DrJennersHouse ❤️\n\n#E..."
...,...,...,...,...
5346,VaccinesWork,1292100508476477440,./data_Vaccine/VaccinesWork/tweets2/Ee52kK8XsA...,This is the depressive result of the continued...
5347,VaccinesWork,1292132465633632512,./data_Vaccine/VaccinesWork/tweets2/Ee6ToLoXkA...,Fewer babies and children in London are attend...
5348,VaccinesWork,1292097472270085888,./data_Vaccine/VaccinesWork/tweets2/Ee5zzTrWAA...,Keeping up to date with #vaccinations remains ...
5349,VaccinesWork,1292115859826188288,./data_Vaccine/VaccinesWork/tweets2/Ee6EhwcXgA...,"#Philippines: 102,000 kids vaccinated vs polio..."


### Count the tweets in different hashtags after cleaning

In [11]:
# Reload the cleaned CSV from disk, using its first column as the index.
result_cleaned_path = './csv_data/data_' + topic_name + '_cleaned.csv'
df_csv_cleaned = pd.read_csv(result_cleaned_path, index_col=0)

In [12]:
# Count the remaining tweets per hashtag after cleaning
# (value_counts lists the most frequent hashtag first).
df_csv_cleaned['hashtag'].value_counts()

CoronavirusVaccine       1413
covidvaccine              851
CovidHoax                 733
VaccinesWork              675
COVID19Vaccine            471
FauciTheFraud             271
FireFauci                 189
scamdemic2020             179
BillGatesBioTerrorist     144
vaccinesSaveLives         112
FauciLiedPeopleDied        76
GetVaccinated              75
depopulation               68
antivaxxers                46
Convid19                   33
VaccineFreedom              9
billgatesofhell             6
Name: hashtag, dtype: int64