In [1]:
# Topic to process; change this single value to run the notebook on another topic.
topic_name = 'Immigration'

# Entry folder for the raw data, always of the form 'data_{topic_name}/'.
# Derived from topic_name so the two settings cannot drift apart.
entry_folder = 'data_{}/'.format(topic_name)

In [2]:
import os
import glob
import pandas as pd
from tqdm import tqdm

# Silence pandas' chained-assignment warning (the library default is 'warn').
pd.set_option('mode.chained_assignment', None)

In [3]:
def get_image_path(image_url, csv_path):
    """Build the local image path that corresponds to a tweet's media URL.

    The path points into a folder that sits next to the CSV file and carries
    the CSV's name without its extension, e.g. ``a/b/tweets5.csv`` ->
    ``./a/b/tweets5/<image file name>``.

    Args:
        image_url: Media URL; only the part after the last '/' is used.
        csv_path: Path of the CSV file the tweet was read from.

    Returns:
        The './'-prefixed image path, or '' when ``image_url`` is not a
        string (e.g. NaN from an empty CSV cell).
    """
    # A missing media cell is read by pandas as float NaN; returning an empty
    # path lets a later file-existence check reject the row instead of
    # crashing here with AttributeError.
    if not isinstance(image_url, str):
        return ''
    image_name = image_url.rsplit('/', 1)[-1]
    # splitext removes the extension whatever its length, unlike a fixed [:-4].
    image_folder = os.path.splitext(csv_path)[0]
    return './' + image_folder + '/' + image_name

In [4]:
# Gather every per-hashtag CSV under `entry_folder` into one frame, `df_csv`.
# Final column order of the combined frame.
csv_columns = ['hashtag', 'tweet_id', 'image_path', 'body_text']

# Frames are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied all rows on every call.
frames = []

entries = os.listdir(entry_folder)
for hashtag in entries:
    # Skip hidden entries such as '.DS_Store' or '.ipynb_checkpoints'.
    if hashtag.startswith('.'):
        continue
    list_files = glob.glob(entry_folder + hashtag + '/*.csv')
    tweets_count = 0
    for csv_path in list_files:
        temp_df = pd.read_csv(csv_path)
        tweets_count += len(temp_df.index)
        if temp_df.empty:
            # Nothing to add; also keeps all-empty frames out of the concat.
            continue
        temp_df = (
            temp_df
            .drop(columns=['tweet_url'])
            .rename(columns={'text_data': 'body_text', 'media': 'image_path'})
            .assign(hashtag=hashtag)
        )
        # Turn the media URL into the path of the image stored next to this CSV.
        temp_df['image_path'] = temp_df['image_path'].apply(get_image_path, csv_path=csv_path)
        temp_df['tweet_id'] = temp_df['tweet_id'].astype('int64')
        # Select columns by name rather than by position, so the result does
        # not depend on the column order inside the CSV file.
        frames.append(temp_df[csv_columns])
    print('{:>24s}: {:>5}'.format(hashtag, tweets_count))

if frames:
    df_csv = pd.concat(frames, ignore_index=True)
else:
    # No tweets found at all: fall back to an empty, correctly typed frame.
    df_csv = pd.DataFrame(columns=csv_columns).astype({'tweet_id': 'int64'})
print('Topic: {} - Total tweets: {:>5}'.format(topic_name, len(df_csv.index)))

         Antiimmigration:    11
                  deport:    17
      refuggesnotwelcome:     0
         stoptheinvasion:    18
             immigration:  2811
            buildthewall:  1266
           deportthemall:     9
           illegalaliens:    61
            americafirst:  1978
                liberals:   716
                refugees:  1679
               bansharia:     2
               MuslimBan:    42
        multiculturalism:    75
                banislam:    13
             NoBanNoWall:     3
                 cdnpoli:  9986
Topic: Immigration - Total tweets: 18687


In [5]:
# Keep only the tweets whose image file actually exists on disk.
image_exists = df_csv['image_path'].map(os.path.isfile).astype(bool)
df_csv = df_csv.loc[image_exists]

In [6]:
# Renumber the rows 0..n-1 (the old index is discarded), then show the frame.
df_csv = df_csv.reset_index(drop=True)
df_csv

Unnamed: 0,hashtag,tweet_id,image_path,body_text
0,Antiimmigration,1300061500023898112,./data_Immigration/Antiimmigration/tweets5/Egq...,Editorial cartoon by Dave Granlund\n#Sexism #R...
1,Antiimmigration,1300063612338348034,./data_Immigration/Antiimmigration/tweets5/Egr...,Editorial cartoon by Drew Sheneman\n#Sexism #R...
2,Antiimmigration,1300068611994853377,./data_Immigration/Antiimmigration/tweets5/Egr...,Editorial cartoon by Mike Luckovich\n#Sexism #...
3,Antiimmigration,1300062894328090624,./data_Immigration/Antiimmigration/tweets5/Egr...,Editorial cartoon by Bill Bramhall\n#Sexism #R...
4,Antiimmigration,1300067744419905537,./data_Immigration/Antiimmigration/tweets5/Egr...,Editorial cartoon by David Horsey\n#Sexism #Ra...
...,...,...,...,...
18682,cdnpoli,1291928445157388288,./data_Immigration/cdnpoli/tweets2/Ee3aEgjUEAE...,@ReutersScience Come on now.🙄\nLiterally EVERY...
18683,cdnpoli,1291903493075238913,./data_Immigration/cdnpoli/tweets2/Ee3DYJYWsAU...,"We’re launching the CFCP, #canada’s only democ..."
18684,cdnpoli,1291903059120074754,./data_Immigration/cdnpoli/tweets2/Ee3C-eJX0AA...,@CBCPolitics Does your propaganda organization...
18685,cdnpoli,1291905776907296768,./data_Immigration/cdnpoli/tweets2/Ee3FYo8UcAE...,@JonDziadyk @CityofEdmonton #cdnpoli #abpoli F...


### Filter out images without text

In [7]:
from opencv_text_detector import TextDetector

# Register `progress_apply` on pandas objects so the scan below shows a progress bar.
tqdm.pandas()
detector = TextDetector()

# Run the detector once per image path and store its result per row;
# a later cell keeps the rows where `has_text` equals 1.
text_flags = df_csv['image_path'].progress_apply(detector.detect_text)
df_csv['has_text'] = text_flags

df_csv

/Ee7SzUqX0AEXWk6.jpg
 99%|█████████▊| 18452/18687 [33:21<00:25,  9.34it/s]1 - ./data_Immigration/cdnpoli/tweets2/Ee7I6fLU8AA_M67.jpg
1 - ./data_Immigration/cdnpoli/tweets2/Ee7SoHyXsAEZFAL.jpg
 99%|█████████▉| 18454/18687 [33:21<00:24,  9.48it/s]1 - ./data_Immigration/cdnpoli/tweets2/Ee7Tp-xWAAI_tZY.jpg
1 - ./data_Immigration/cdnpoli/tweets2/Ee7NWqqXoAE8G78.png
 99%|█████████▉| 18456/18687 [33:21<00:24,  9.59it/s]1 - ./data_Immigration/cdnpoli/tweets2/Ee63jlRVAAAPmlm.png
1 - ./data_Immigration/cdnpoli/tweets2/Ee6_AXUVoAA1yRE.jpg
 99%|█████████▉| 18458/18687 [33:21<00:24,  9.40it/s]1 - ./data_Immigration/cdnpoli/tweets2/Ee7CfGvXgAU-pV0.jpg
1 - ./data_Immigration/cdnpoli/tweets2/Ee63Ey4U4AEuaJU.jpg
1 - ./data_Immigration/cdnpoli/tweets2/Ee7R5KLUYAA78CT.jpg
 99%|█████████▉| 18461/18687 [33:22<00:24,  9.39it/s]1 - ./data_Immigration/cdnpoli/tweets2/Ee7ZQJcU4AAO2Ga.png
1 - ./data_Immigration/cdnpoli/tweets2/Ee7PYNXUYAIf3Nm.png
 99%|█████████▉| 18463/18687 [33:22<00:24,  9.21it/s]1 - ./data_I

Unnamed: 0,hashtag,tweet_id,image_path,body_text,has_text
0,Antiimmigration,1300061500023898112,./data_Immigration/Antiimmigration/tweets5/Egq...,Editorial cartoon by Dave Granlund\n#Sexism #R...,1
1,Antiimmigration,1300063612338348034,./data_Immigration/Antiimmigration/tweets5/Egr...,Editorial cartoon by Drew Sheneman\n#Sexism #R...,1
2,Antiimmigration,1300068611994853377,./data_Immigration/Antiimmigration/tweets5/Egr...,Editorial cartoon by Mike Luckovich\n#Sexism #...,1
3,Antiimmigration,1300062894328090624,./data_Immigration/Antiimmigration/tweets5/Egr...,Editorial cartoon by Bill Bramhall\n#Sexism #R...,1
4,Antiimmigration,1300067744419905537,./data_Immigration/Antiimmigration/tweets5/Egr...,Editorial cartoon by David Horsey\n#Sexism #Ra...,1
...,...,...,...,...,...
18682,cdnpoli,1291928445157388288,./data_Immigration/cdnpoli/tweets2/Ee3aEgjUEAE...,@ReutersScience Come on now.🙄\nLiterally EVERY...,1
18683,cdnpoli,1291903493075238913,./data_Immigration/cdnpoli/tweets2/Ee3DYJYWsAU...,"We’re launching the CFCP, #canada’s only democ...",1
18684,cdnpoli,1291903059120074754,./data_Immigration/cdnpoli/tweets2/Ee3C-eJX0AA...,@CBCPolitics Does your propaganda organization...,1
18685,cdnpoli,1291905776907296768,./data_Immigration/cdnpoli/tweets2/Ee3FYo8UcAE...,@JonDziadyk @CityofEdmonton #cdnpoli #abpoli F...,1


In [8]:
# Write the annotated frame (including `has_text`) to disk.
result_path = './csv_data/data_' + topic_name + '.csv'
# DataFrame.to_csv does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
df_csv.to_csv(result_path)

In [9]:
# Reload the annotated tweets from disk and keep only those flagged has_text == 1.
result_path = './csv_data/data_' + topic_name + '.csv'
result_cleaned_path = './csv_data/data_' + topic_name + '_cleaned.csv'

df_csv = pd.read_csv(result_path, index_col=0)

# Filter, drop the helper column and renumber the rows in one chain.
df_csv_cleaned = (
    df_csv.loc[df_csv['has_text'] == 1]
    .drop(columns=['has_text'])
    .reset_index(drop=True)
)

df_csv_cleaned.to_csv(result_cleaned_path)

In [10]:
# Display the cleaned frame: rows with has_text == 1, with the `has_text` column removed.
df_csv_cleaned

Unnamed: 0,hashtag,tweet_id,image_path,body_text
0,Antiimmigration,1300061500023898112,./data_Immigration/Antiimmigration/tweets5/Egq...,Editorial cartoon by Dave Granlund\n#Sexism #R...
1,Antiimmigration,1300063612338348034,./data_Immigration/Antiimmigration/tweets5/Egr...,Editorial cartoon by Drew Sheneman\n#Sexism #R...
2,Antiimmigration,1300068611994853377,./data_Immigration/Antiimmigration/tweets5/Egr...,Editorial cartoon by Mike Luckovich\n#Sexism #...
3,Antiimmigration,1300062894328090624,./data_Immigration/Antiimmigration/tweets5/Egr...,Editorial cartoon by Bill Bramhall\n#Sexism #R...
4,Antiimmigration,1300067744419905537,./data_Immigration/Antiimmigration/tweets5/Egr...,Editorial cartoon by David Horsey\n#Sexism #Ra...
...,...,...,...,...
14673,cdnpoli,1291918630075146247,./data_Immigration/cdnpoli/tweets2/Ee3RIs7X0AM...,This has been the case for quite some time. Bu...
14674,cdnpoli,1291928445157388288,./data_Immigration/cdnpoli/tweets2/Ee3aEgjUEAE...,@ReutersScience Come on now.🙄\nLiterally EVERY...
14675,cdnpoli,1291903493075238913,./data_Immigration/cdnpoli/tweets2/Ee3DYJYWsAU...,"We’re launching the CFCP, #canada’s only democ..."
14676,cdnpoli,1291903059120074754,./data_Immigration/cdnpoli/tweets2/Ee3C-eJX0AA...,@CBCPolitics Does your propaganda organization...


### Count the tweets in different hashtags after cleaning

In [11]:
# Reload the cleaned tweets from disk; the first CSV column holds the row index.
result_cleaned_path = './csv_data/data_' + topic_name + '_cleaned.csv'
df_csv_cleaned = pd.read_csv(result_cleaned_path, index_col=0)

In [12]:
# Number of tweets per hashtag after cleaning; value_counts lists the most
# frequent hashtag first.
df_csv_cleaned['hashtag'].value_counts()

cdnpoli             7782
immigration         2325
americafirst        1665
refugees            1234
buildthewall         853
liberals             602
multiculturalism      62
illegalaliens         52
MuslimBan             37
stoptheinvasion       18
deport                14
Antiimmigration       11
banislam              11
deportthemall          8
NoBanNoWall            3
bansharia              1
Name: hashtag, dtype: int64