In [1]:
# specify the topic name here
topic_name = 'Immigration'

# entry folder for the data
# should be the form of 'data_{topic_name}/'
entry_folder = 'data_Immigration/'

In [2]:
import os
import glob
import pandas as pd
from tqdm import tqdm

In [3]:
def get_image_path(image_url, csv_path):
    image_name = image_url.split('/')[-1]
    image_path = './' + csv_path[:-4] + '/' + image_name
    return image_path

In [4]:
df_csv = pd.DataFrame(columns=['hashtag', 'tweet_id', 'image_path', 'body_text'])
df_csv['tweet_id'] = df_csv['tweet_id'].astype('int64')

entries = os.listdir(entry_folder)
for hashtag in entries:
    if hashtag.startswith('.'): continue
    list_files = glob.glob(entry_folder + hashtag + '/*.csv')
    tweets_count = 0
    for csv_path in list_files:
        # print(csv_path)
        temp_df = pd.read_csv(csv_path)
        tweets_count += len(temp_df.index)
        temp_df.drop(columns=['tweet_url'], inplace=True)
        temp_df.rename({'text_data': 'body_text', 'media': 'image_path'}, axis='columns', inplace=True)
        temp_df.insert(loc=0, column='hashtag', value=hashtag)
        cols = temp_df.columns.tolist()
        # before: cols = ['hashtag', 'image_path', 'body_text', 'tweet_id']
        cols = cols[:1] + cols[-1:] + cols[1:-1]
        temp_df = temp_df[cols]
        temp_df['image_path'] = temp_df['image_path'].apply(get_image_path, csv_path=csv_path)
        temp_df['tweet_id'] = temp_df['tweet_id'].astype('int64')
        df_csv = df_csv.append(temp_df)
    print('{:>24s}: {:>5}'.format(hashtag, tweets_count))
print('Topic: {} - Total tweets: {:>5}'.format(topic_name, len(df_csv.index)))

               bansharia:     3
               MuslimBan:    70
             immigration:  3281
                liberals:   935
                  deport:    25
             NoBanNoWall:     3
            americafirst:  2674
         Antiimmigration:    13
      refuggesnotwelcome:     0
                refugees:  1923
           illegalaliens:    65
            buildthewall:  1373
                banislam:    36
         stoptheinvasion:    25
        multiculturalism:    85
           deportthemall:    11
                 cdnpoli: 11333
Topic: Immigration - Total tweets: 21855


In [5]:
df_csv.reset_index(drop=True, inplace=True)
df_csv

Unnamed: 0,hashtag,tweet_id,image_path,body_text
0,bansharia,1290045094125621248,./data_Immigration/bansharia/tweets1/EecpLSkXk...,@Ilhan #DNCisEVIL #DNCisAntiAmerican #BanShari...
1,bansharia,1289590876843290624,./data_Immigration/bansharia/tweets1/EeWMEKGWA...,@ErikaGrove14 @gopoundsand1234 @Ilhan Child br...
2,bansharia,1293161414580826112,./data_Immigration/bansharia/tweets2/EfI7c1ZU0...,#bansharia and #deport these 4 #trump2020 #Tru...
3,MuslimBan,1290772302938943488,./data_Immigration/MuslimBan/tweets1/Eem-jhpWk...,@ElementSerpent @theangiestanton @RyanAFournie...
4,MuslimBan,1290683372759523328,./data_Immigration/MuslimBan/tweets1/EeltloLVo...,"2016, 17, 18, 19, 20. This is the travel ban. ..."
...,...,...,...,...
21850,cdnpoli,1299516876918108160,./data_Immigration/cdnpoli/tweets5/EgjPtlQU8AE...,The Alberta government is reportedly providing...
21851,cdnpoli,1299505580449595392,./data_Immigration/cdnpoli/tweets5/EgjFbkmXkAA...,“You have been well trained my young apprentic...
21852,cdnpoli,1299527942716837888,./data_Immigration/cdnpoli/tweets5/EgedcHZWAAQ...,Paramedics have had to respond to an average o...
21853,cdnpoli,1299521504661639168,./data_Immigration/cdnpoli/tweets5/EgjSl_5WsAE...,"Mr. Erin O'Toole @ErinOTooleMP, where are the ..."


In [6]:

from opencv_text_detector import TextDetector

tqdm.pandas()
detector = TextDetector()

# df_csv['has_text'] = df_csv['image_path'].apply(detector.detect_text)
df_csv['has_text'] = df_csv['image_path'].progress_apply(detector.detect_text)

df_csv

  0%|          | 2/21855 [00:00<1:43:31,  3.52it/s]1 - ./data_Immigration/bansharia/tweets1/EecpLSkXkAAFv6H.jpg
  0%|          | 3/21855 [00:00<1:33:05,  3.91it/s]0 - ./data_Immigration/bansharia/tweets1/EeWMEKGWAAI-Kbz.jpg
  0%|          | 4/21855 [00:01<1:48:47,  3.35it/s]1 - ./data_Immigration/bansharia/tweets2/EfI7c1ZU0AAOV4s.jpg
  0%|          | 5/21855 [00:01<2:00:39,  3.02it/s]1 - ./data_Immigration/MuslimBan/tweets1/Eem-jhpWkAEbqGt.jpg
  0%|          | 6/21855 [00:01<1:59:34,  3.05it/s]0 - ./data_Immigration/MuslimBan/tweets1/EeltloLVoAAXTUh.jpg
  0%|          | 7/21855 [00:02<2:00:13,  3.03it/s]1 - ./data_Immigration/MuslimBan/tweets1/EexhGzdWAAUEWDJ.jpg
  0%|          | 8/21855 [00:02<2:07:00,  2.87it/s]1 - ./data_Immigration/MuslimBan/tweets1/EebqnqCWkAAQ9z0.jpg
  0%|          | 9/21855 [00:02<1:58:54,  3.06it/s]1 - ./data_Immigration/MuslimBan/tweets1/Eev03b3XoAExj8K.jpg
  0%|          | 10/21855 [00:03<2:11:06,  2.78it/s]1 - ./data_Immigration/MuslimBan/tweets2/Ee6PFX4WsAA

KeyboardInterrupt: 

In [None]:
result_path = './csv_data/data_' + topic_name + '.csv'
df_csv.to_csv(result_path)