In [1]:
# specify the topic name here
topic_name = 'Immigration'

# entry folder for the data
# should be the form of 'data_{topic_name}/'
entry_folder = 'data_Immigration/'

In [2]:
import os
import glob
import pandas as pd
from tqdm import tqdm

pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
def get_image_path(image_url, csv_path):
    image_name = image_url.split('/')[-1]
    image_path = './' + csv_path[:-4] + '/' + image_name
    return image_path

In [None]:
df_csv = pd.DataFrame(columns=['hashtag', 'tweet_id', 'image_path', 'body_text'])
df_csv['tweet_id'] = df_csv['tweet_id'].astype('int64')

entries = os.listdir(entry_folder)
for hashtag in entries:
    if hashtag.startswith('.'): continue
    list_files = glob.glob(entry_folder + hashtag + '/*.csv')
    tweets_count = 0
    for csv_path in list_files:
        # print(csv_path)
        temp_df = pd.read_csv(csv_path)
        tweets_count += len(temp_df.index)
        temp_df.drop(columns=['tweet_url'], inplace=True)
        temp_df.rename({'text_data': 'body_text', 'media': 'image_path'}, axis='columns', inplace=True)
        temp_df.insert(loc=0, column='hashtag', value=hashtag)
        cols = temp_df.columns.tolist()
        # before: cols = ['hashtag', 'image_path', 'body_text', 'tweet_id']
        cols = cols[:1] + cols[-1:] + cols[1:-1]
        temp_df = temp_df[cols]
        temp_df['image_path'] = temp_df['image_path'].apply(get_image_path, csv_path=csv_path)
        temp_df['tweet_id'] = temp_df['tweet_id'].astype('int64')
        df_csv = df_csv.append(temp_df)
    print('{:>24s}: {:>5}'.format(hashtag, tweets_count))
print('Topic: {} - Total tweets: {:>5}'.format(topic_name, len(df_csv.index)))

In [None]:
# remove tweets with invalid image path
df_csv['path_is_valid'] = df_csv['image_path'].apply(lambda x: 1 if os.path.isfile(x) else 0)
df_csv = df_csv[df_csv.path_is_valid.eq(1)]
df_csv.drop(columns=['path_is_valid'], inplace=True)

In [None]:
df_csv.reset_index(drop=True, inplace=True)
df_csv

### Filter out images without text

In [None]:
from opencv_text_detector import TextDetector

tqdm.pandas()
detector = TextDetector()

# df_csv['has_text'] = df_csv['image_path'].apply(detector.detect_text)
df_csv['has_text'] = df_csv['image_path'].progress_apply(detector.detect_text)

df_csv

In [None]:
result_path = './csv_data/data_' + topic_name + '.csv'
df_csv.to_csv(result_path)

In [None]:
# filter out those images without text
result_path = './csv_data/data_' + topic_name + '.csv'
df_csv = pd.read_csv(result_path, index_col=0)

df_csv_cleaned = df_csv[df_csv.has_text.eq(1)]
df_csv_cleaned.drop(columns=['has_text'], inplace=True)
df_csv_cleaned.reset_index(drop=True, inplace=True)

result_cleaned_path = './csv_data/data_' + topic_name + '_cleaned.csv'
df_csv_cleaned.to_csv(result_cleaned_path)

In [None]:
df_csv_cleaned

### Count the tweets in different hashtags after cleaning

In [None]:
result_cleaned_path = './csv_data/data_' + topic_name + '_cleaned.csv'
df_csv_cleaned = pd.read_csv(result_cleaned_path, index_col=0)

In [None]:
# count the tweets in different hashtags after cleaning
df_csv_cleaned['hashtag'].value_counts()

### Filter out duplicate images

In [3]:
result_cleaned_path = './csv_data/data_' + topic_name + '_cleaned.csv'
cleaned_data = pd.read_csv(result_cleaned_path, index_col=0)
image_paths = cleaned_data['image_path'].to_list()

In [4]:
from remove_duplicates import duplicate_detector
no_duplicate_paths = duplicate_detector(image_paths)

8 is being processed, ./data_Immigration/cdnpoli/tweets2/Ee7xwx4XoAEfN5b.jpg
14450/14678 is being processed, ./data_Immigration/cdnpoli/tweets2/Ee74AMnUcAAzFV9.jpg
14451/14678 is being processed, ./data_Immigration/cdnpoli/tweets2/Ee74SHqUYAAE2sY.jpg
14452/14678 is being processed, ./data_Immigration/cdnpoli/tweets2/Ee8BvXZWoAAFmGq.jpg
14453/14678 is being processed, ./data_Immigration/cdnpoli/tweets2/Ee7sW0LU0AIxsaJ.jpg
14454/14678 is being processed, ./data_Immigration/cdnpoli/tweets2/Ee1hPhXWkAAkYRr.jpg
14455/14678 is being processed, ./data_Immigration/cdnpoli/tweets2/Ee7x9WeUYAUplJN.jpg
14456/14678 is being processed, ./data_Immigration/cdnpoli/tweets2/Ee7asDmX0AABGdY.jpg
14457/14678 is being processed, ./data_Immigration/cdnpoli/tweets2/Ee7w3TpUEAAT3eG.jpg
14458/14678 is being processed, ./data_Immigration/cdnpoli/tweets2/Ee7llELVAAAbCRO.jpg
14459/14678 is being processed, ./data_Immigration/cdnpoli/tweets2/Ee76OcLVoAIPmgx.jpg
14460/14678 is being processed, ./data_Immigration/cd

In [5]:
tqdm.pandas()
cleaned_data['no_dups'] = cleaned_data['image_path'].apply(lambda x:1 if x in no_duplicate_paths else 0)
df_csv_cleaned = cleaned_data[cleaned_data.no_dups.eq(1)]
df_csv_cleaned.drop(columns=['no_dups'], inplace=True)
df_csv_cleaned.reset_index(drop=True, inplace=True)

df_csv_cleaned

Unnamed: 0,hashtag,tweet_id,image_path,body_text
0,Antiimmigration,1300061500023898112,./data_Immigration/Antiimmigration/tweets5/Egq...,Editorial cartoon by Dave Granlund\n#Sexism #R...
1,Antiimmigration,1300063612338348034,./data_Immigration/Antiimmigration/tweets5/Egr...,Editorial cartoon by Drew Sheneman\n#Sexism #R...
2,Antiimmigration,1300068611994853377,./data_Immigration/Antiimmigration/tweets5/Egr...,Editorial cartoon by Mike Luckovich\n#Sexism #...
3,Antiimmigration,1300062894328090624,./data_Immigration/Antiimmigration/tweets5/Egr...,Editorial cartoon by Bill Bramhall\n#Sexism #R...
4,Antiimmigration,1300067744419905537,./data_Immigration/Antiimmigration/tweets5/Egr...,Editorial cartoon by David Horsey\n#Sexism #Ra...
...,...,...,...,...
12113,cdnpoli,1291918630075146247,./data_Immigration/cdnpoli/tweets2/Ee3RIs7X0AM...,This has been the case for quite some time. Bu...
12114,cdnpoli,1291928445157388288,./data_Immigration/cdnpoli/tweets2/Ee3aEgjUEAE...,@ReutersScience Come on now.🙄\nLiterally EVERY...
12115,cdnpoli,1291903493075238913,./data_Immigration/cdnpoli/tweets2/Ee3DYJYWsAU...,"We’re launching the CFCP, #canada’s only democ..."
12116,cdnpoli,1291903059120074754,./data_Immigration/cdnpoli/tweets2/Ee3C-eJX0AA...,@CBCPolitics Does your propaganda organization...


In [6]:
result_cleaned_path = './csv_data/data_' + topic_name + '_cleaned_nodups.csv'
df_csv_cleaned.to_csv(result_cleaned_path)

In [7]:
df_csv_cleaned['hashtag'].value_counts()

cdnpoli             6563
immigration         1895
americafirst        1373
refugees            1018
liberals             570
buildthewall         532
multiculturalism      47
illegalaliens         35
MuslimBan             28
stoptheinvasion       14
deport                13
banislam              11
Antiimmigration       11
deportthemall          5
NoBanNoWall            3
Name: hashtag, dtype: int64