In [1]:
# specify the topic name here
topic_name = 'Immigration_hate'

# entry folder for the data
# should be the form of 'data_{topic_name}/'
entry_folder = 'data_' + topic_name + '/'

# csv data folder name
csv_folder_name = 'csv_data_hate'

In [2]:
import os
import glob
import pandas as pd
from tqdm.notebook import tqdm

pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
def get_image_path(image_url, csv_path):
    image_name = image_url.split('/')[-1]
    image_path = './' + csv_path[:-4] + '/' + image_name
    return image_path

In [4]:
df_csv = pd.DataFrame(columns=['hashtag', 'tweet_id', 'image_path', 'body_text'])
df_csv['tweet_id'] = df_csv['tweet_id'].astype('int64')

entries = os.listdir(entry_folder)
for hashtag in entries:
    if hashtag.startswith('.'): continue
    list_files = glob.glob(entry_folder + hashtag + '/*.csv')
    tweets_count = 0
    for csv_path in list_files:
        # print(csv_path)
        temp_df = pd.read_csv(csv_path)
        tweets_count += len(temp_df.index)
        temp_df.drop(columns=['tweet_url'], inplace=True)
        temp_df.rename({'text_data': 'body_text', 'media': 'image_path'}, axis='columns', inplace=True)
        temp_df.insert(loc=0, column='hashtag', value=hashtag)
        cols = temp_df.columns.tolist()
        # before: cols = ['hashtag', 'image_path', 'body_text', 'tweet_id']
        cols = cols[:1] + cols[-1:] + cols[1:-1]
        temp_df = temp_df[cols]
        temp_df['image_path'] = temp_df['image_path'].apply(get_image_path, csv_path=csv_path)
        temp_df['tweet_id'] = temp_df['tweet_id'].astype('int64')
        df_csv = df_csv.append(temp_df)
    print('{:>30s}: {:>5}'.format(hashtag, tweets_count))
print('Topic: {} - Total tweets: {:>5}'.format(topic_name, len(df_csv.index)))

             illegalimmigrants:    83
            illegalimmigration:    48
                        deport:    17
                 illegalaliens:    60
                  BuildTheWall:  1262
               stoptheinvasion:    18
                  BorderCrisis:    15
                 DeportThemAll:     9
Topic: Immigration_hate - Total tweets:  1512


In [5]:
# remove tweets with invalid image path
df_csv['path_is_valid'] = df_csv['image_path'].apply(lambda x: 1 if os.path.isfile(x) else 0)
df_csv = df_csv[df_csv.path_is_valid.eq(1)]
df_csv.drop(columns=['path_is_valid'], inplace=True)

In [6]:
df_csv.reset_index(drop=True, inplace=True)
df_csv

Unnamed: 0,hashtag,tweet_id,image_path,body_text
0,ChiNazi,1300118461918507008,./data_AsianHate_hate/ChiNazi/tweets5/Egry12bU...,@nytimesworld @kerokero_HKer Who can save thes...
1,ChiNazi,1300110586257137664,./data_AsianHate_hate/ChiNazi/tweets5/EgrrrteU...,@ChannelNewsAsia @kerokero_HKer How evil can t...
2,ChiNazi,1300119648877199360,./data_AsianHate_hate/ChiNazi/tweets5/Egrz678U...,@TomTugendhat @kerokero_HKer @AmbLiuXiaoMing S...
3,ChiNazi,1300119389534973954,./data_AsianHate_hate/ChiNazi/tweets5/Egrzr3uU...,@Reuters @kerokero_HKer Shamelessly evil #Tyra...
4,ChiNazi,1300110716150509568,./data_AsianHate_hate/ChiNazi/tweets5/EgrrzSRU...,@Reuters @kerokero_HKer How evil can the #Tyra...
...,...,...,...,...
12787,ChineseVirus,1291908465930252290,./data_AsianHate_hate/ChineseVirus/tweets2/Ee3...,@Acosta Poor little Jimmy he can dish it out b...
12788,ChineseVirus,1292469767694168065,./data_AsianHate_hate/ChineseVirus/tweets2/Ee_...,@Acosta The great minds in the world are tryin...
12789,ChineseVirus,1292217084626243587,./data_AsianHate_hate/ChineseVirus/tweets2/Ee7...,"2 simalir events.\nOne pro America, police\nOt..."
12790,ChineseVirus,1292068048724164610,./data_AsianHate_hate/ChineseVirus/tweets2/Ee5...,"Next video of our channel\n\n""Yours SRK""\n@You..."


### Filter out images without text

In [7]:
from opencv_text_detector import TextDetector

tqdm.pandas()
detector = TextDetector()

# df_csv['has_text'] = df_csv['image_path'].apply(detector.detect_text)
df_csv['has_text'] = df_csv['image_path'].progress_apply(detector.detect_text)

df_csv

  0%|          | 0/12792 [00:00<?, ?it/s]

1 - ./data_AsianHate_hate/ChiNazi/tweets5/Egry12bU8AAAVwn.jpg
0 - ./data_AsianHate_hate/ChiNazi/tweets5/EgrrrteUcAI6dmH.jpg
1 - ./data_AsianHate_hate/ChiNazi/tweets5/Egrz678UwAINxCF.jpg
1 - ./data_AsianHate_hate/ChiNazi/tweets5/Egrzr3uU4AAlakT.jpg
0 - ./data_AsianHate_hate/ChiNazi/tweets5/EgrrzSRU4AAM4BV.jpg
1 - ./data_AsianHate_hate/ChiNazi/tweets5/EgrzucXUcAIxL4V.jpg
0 - ./data_AsianHate_hate/ChiNazi/tweets5/EgrrXn3UYAAoHAk.jpg
1 - ./data_AsianHate_hate/ChiNazi/tweets5/EgrzSy9VgAA0GGS.jpg
1 - ./data_AsianHate_hate/ChiNazi/tweets5/EgrzjelUYAAlCa3.jpg
1 - ./data_AsianHate_hate/ChiNazi/tweets5/Egrz4hJVgAEXQgr.jpg
1 - ./data_AsianHate_hate/ChiNazi/tweets5/EgrzKMBU8AAxVJJ.jpg
0 - ./data_AsianHate_hate/ChiNazi/tweets5/Egrriu4UMAA3oQC.jpg
0 - ./data_AsianHate_hate/ChiNazi/tweets5/EgrrnS0UcAAMVOG.jpg
1 - ./data_AsianHate_hate/ChiNazi/tweets5/EgrzXwOU0AEViLV.jpg
0 - ./data_AsianHate_hate/ChiNazi/tweets5/Egrr1GpU4AEqx8M.jpg
1 - ./data_AsianHate_hate/ChiNazi/tweets5/EgrzlKRVkAAl5_6.jpg
1 - ./da

Unnamed: 0,hashtag,tweet_id,image_path,body_text,has_text
0,ChiNazi,1300118461918507008,./data_AsianHate_hate/ChiNazi/tweets5/Egry12bU...,@nytimesworld @kerokero_HKer Who can save thes...,1
1,ChiNazi,1300110586257137664,./data_AsianHate_hate/ChiNazi/tweets5/EgrrrteU...,@ChannelNewsAsia @kerokero_HKer How evil can t...,0
2,ChiNazi,1300119648877199360,./data_AsianHate_hate/ChiNazi/tweets5/Egrz678U...,@TomTugendhat @kerokero_HKer @AmbLiuXiaoMing S...,1
3,ChiNazi,1300119389534973954,./data_AsianHate_hate/ChiNazi/tweets5/Egrzr3uU...,@Reuters @kerokero_HKer Shamelessly evil #Tyra...,1
4,ChiNazi,1300110716150509568,./data_AsianHate_hate/ChiNazi/tweets5/EgrrzSRU...,@Reuters @kerokero_HKer How evil can the #Tyra...,0
...,...,...,...,...,...
12787,ChineseVirus,1291908465930252290,./data_AsianHate_hate/ChineseVirus/tweets2/Ee3...,@Acosta Poor little Jimmy he can dish it out b...,1
12788,ChineseVirus,1292469767694168065,./data_AsianHate_hate/ChineseVirus/tweets2/Ee_...,@Acosta The great minds in the world are tryin...,1
12789,ChineseVirus,1292217084626243587,./data_AsianHate_hate/ChineseVirus/tweets2/Ee7...,"2 simalir events.\nOne pro America, police\nOt...",1
12790,ChineseVirus,1292068048724164610,./data_AsianHate_hate/ChineseVirus/tweets2/Ee5...,"Next video of our channel\n\n""Yours SRK""\n@You...",1


In [9]:
result_path = './'+ csv_folder_name + '/data_' + topic_name + '.csv'
df_csv.to_csv(result_path)

In [10]:
# filter out those images without text
result_path = './'+ csv_folder_name + '/data_' + topic_name + '.csv'
df_csv = pd.read_csv(result_path, index_col=0)

df_csv_cleaned = df_csv[df_csv.has_text.eq(1)]
df_csv_cleaned.drop(columns=['has_text'], inplace=True)
df_csv_cleaned.reset_index(drop=True, inplace=True)

result_cleaned_path = './'+ csv_folder_name + '/data_' + topic_name + '_wtext.csv'
df_csv_cleaned.to_csv(result_cleaned_path)

In [11]:
df_csv_cleaned

Unnamed: 0,hashtag,tweet_id,image_path,body_text
0,ChiNazi,1300118461918507008,./data_AsianHate_hate/ChiNazi/tweets5/Egry12bU...,@nytimesworld @kerokero_HKer Who can save thes...
1,ChiNazi,1300119648877199360,./data_AsianHate_hate/ChiNazi/tweets5/Egrz678U...,@TomTugendhat @kerokero_HKer @AmbLiuXiaoMing S...
2,ChiNazi,1300119389534973954,./data_AsianHate_hate/ChiNazi/tweets5/Egrzr3uU...,@Reuters @kerokero_HKer Shamelessly evil #Tyra...
3,ChiNazi,1300119434355326976,./data_AsianHate_hate/ChiNazi/tweets5/EgrzucXU...,@ReutersUK @kerokero_HKer Shamelessly evil #Ty...
4,ChiNazi,1300118959048421376,./data_AsianHate_hate/ChiNazi/tweets5/EgrzSy9V...,@AlexandreKrausz @kerokero_HKer Shamelessly ev...
...,...,...,...,...
10565,ChineseVirus,1291908465930252290,./data_AsianHate_hate/ChineseVirus/tweets2/Ee3...,@Acosta Poor little Jimmy he can dish it out b...
10566,ChineseVirus,1292469767694168065,./data_AsianHate_hate/ChineseVirus/tweets2/Ee_...,@Acosta The great minds in the world are tryin...
10567,ChineseVirus,1292217084626243587,./data_AsianHate_hate/ChineseVirus/tweets2/Ee7...,"2 simalir events.\nOne pro America, police\nOt..."
10568,ChineseVirus,1292068048724164610,./data_AsianHate_hate/ChineseVirus/tweets2/Ee5...,"Next video of our channel\n\n""Yours SRK""\n@You..."


In [12]:
result_cleaned_path = './'+ csv_folder_name + '/data_' + topic_name + '_wtext.csv'
df_csv_cleaned = pd.read_csv(result_cleaned_path, index_col=0)

In [13]:
# count the tweets in different hashtags after cleaning
df_csv_cleaned['hashtag'].value_counts()

BoycottChina           2629
ChiNazi                1872
CCP_is_terrorist       1597
CCPVirus               1225
ChinaVirus             1084
Wuhanvirus              983
ChineseVirus            462
ChinaLiedPeopleDied     358
CCPLiedPeopleDied       226
MakeChinaPay            111
WuhanLab                 23
Name: hashtag, dtype: int64

### Filter out duplicate images

In [3]:
result_cleaned_path = './'+ csv_folder_name + '/data_' + topic_name + '_wtext.csv'
cleaned_data = pd.read_csv(result_cleaned_path, index_col=0)
image_paths = cleaned_data['image_path'].to_list()

In [4]:
from remove_duplicates import duplicate_detector
no_duplicate_paths = duplicate_detector(image_paths)

1/10570 is being processed, ./data_AsianHate_hate/ChiNazi/tweets5/Egry12bU8AAAVwn.jpg
2/10570 is being processed, ./data_AsianHate_hate/ChiNazi/tweets5/Egrz678UwAINxCF.jpg
3/10570 is being processed, ./data_AsianHate_hate/ChiNazi/tweets5/Egrzr3uU4AAlakT.jpg
4/10570 is being processed, ./data_AsianHate_hate/ChiNazi/tweets5/EgrzucXUcAIxL4V.jpg
5/10570 is being processed, ./data_AsianHate_hate/ChiNazi/tweets5/EgrzSy9VgAA0GGS.jpg
6/10570 is being processed, ./data_AsianHate_hate/ChiNazi/tweets5/EgrzjelUYAAlCa3.jpg
7/10570 is being processed, ./data_AsianHate_hate/ChiNazi/tweets5/Egrz4hJVgAEXQgr.jpg
8/10570 is being processed, ./data_AsianHate_hate/ChiNazi/tweets5/EgrzKMBU8AAxVJJ.jpg
9/10570 is being processed, ./data_AsianHate_hate/ChiNazi/tweets5/EgrzXwOU0AEViLV.jpg
10/10570 is being processed, ./data_AsianHate_hate/ChiNazi/tweets5/EgrzlKRVkAAl5_6.jpg
11/10570 is being processed, ./data_AsianHate_hate/ChiNazi/tweets5/Egr7VfTU8AAcmUW.jpg
12/10570 is being processed, ./data_AsianHate_hate/C

In [5]:
tqdm.pandas()
cleaned_data['no_dups'] = cleaned_data['image_path'].apply(lambda x:1 if x in no_duplicate_paths else 0)
df_csv_cleaned = cleaned_data[cleaned_data.no_dups.eq(1)]
df_csv_cleaned.drop(columns=['no_dups'], inplace=True)
df_csv_cleaned.reset_index(drop=True, inplace=True)

df_csv_cleaned

Unnamed: 0,hashtag,tweet_id,image_path,body_text
0,ChiNazi,1300118461918507008,./data_AsianHate_hate/ChiNazi/tweets5/Egry12bU...,@nytimesworld @kerokero_HKer Who can save thes...
1,ChiNazi,1300128001271590912,./data_AsianHate_hate/ChiNazi/tweets5/Egr7VfTU...,so why is he tweeting with an iPhone then?\n #...
2,ChiNazi,1300106282037174273,./data_AsianHate_hate/ChiNazi/tweets5/EgrnxMDU...,@AFP How evil can the #Tyranny be?\n\n.\n\n#Ho...
3,ChiNazi,1300093617105702915,./data_AsianHate_hate/ChiNazi/tweets5/EgrcOm8X...,@JohnCornyn From IP stealing to violating int’...
4,ChiNazi,1299915619404406785,./data_AsianHate_hate/ChiNazi/tweets5/Ego6U62V...,@nytimesworld #AsylumSeekers #HongKongers4No...
...,...,...,...,...
4869,ChineseVirus,1291908465930252290,./data_AsianHate_hate/ChineseVirus/tweets2/Ee3...,@Acosta Poor little Jimmy he can dish it out b...
4870,ChineseVirus,1292469767694168065,./data_AsianHate_hate/ChineseVirus/tweets2/Ee_...,@Acosta The great minds in the world are tryin...
4871,ChineseVirus,1292217084626243587,./data_AsianHate_hate/ChineseVirus/tweets2/Ee7...,"2 simalir events.\nOne pro America, police\nOt..."
4872,ChineseVirus,1292068048724164610,./data_AsianHate_hate/ChineseVirus/tweets2/Ee5...,"Next video of our channel\n\n""Yours SRK""\n@You..."


In [6]:
result_cleaned_path = './'+ csv_folder_name + '/data_' + topic_name + '_wtext_nodups.csv'
df_csv_cleaned.to_csv(result_cleaned_path)

In [7]:
df_csv_cleaned['hashtag'].value_counts()

BoycottChina           892
ChinaVirus             851
CCP_is_terrorist       691
ChiNazi                641
Wuhanvirus             590
CCPVirus               549
ChineseVirus           329
ChinaLiedPeopleDied    150
CCPLiedPeopleDied       98
MakeChinaPay            70
WuhanLab                13
Name: hashtag, dtype: int64

### Filter out images with no objects

In [3]:
csv_path = './'+ csv_folder_name + '/data_' + topic_name + '_wtext_nodups.csv'
df_read = pd.read_csv(csv_path, index_col=0)

In [4]:
from yolo_object_detector import ObjectDetector

tqdm.pandas()
detector = ObjectDetector()

df_read['has_object'] = df_read['image_path'].progress_apply(detector.detect_object)

df_read

[INFO]  Loading YOLO from disk...


  0%|          | 0/4874 [00:00<?, ?it/s]

person - ./data_AsianHate_hate/ChiNazi/tweets5/Egry12bU8AAAVwn.jpg
person - ./data_AsianHate_hate/ChiNazi/tweets5/Egr7VfTU8AAcmUW.jpg
person - ./data_AsianHate_hate/ChiNazi/tweets5/EgrnxMDUwAACdQu.jpg
0 - ./data_AsianHate_hate/ChiNazi/tweets5/EgrcOm8XcAQOaeS.jpg
traffic light - ./data_AsianHate_hate/ChiNazi/tweets5/Ego6U62VoAYkAWs.jpg
person - ./data_AsianHate_hate/ChiNazi/tweets5/EgqXl0zUcAEiVO6.jpg
0 - ./data_AsianHate_hate/ChiNazi/tweets5/Egq-sbUUwAAy1w8.jpg
0 - ./data_AsianHate_hate/ChiNazi/tweets5/Egq6xR5XYAEx7qk.jpg
chair - ./data_AsianHate_hate/ChiNazi/tweets5/EgoF4RtVgAEKi7S.jpg
0 - ./data_AsianHate_hate/ChiNazi/tweets5/EgqJ7zTVoAItMRT.jpg
person - ./data_AsianHate_hate/ChiNazi/tweets5/EgpVXoUUYAUkRkx.jpg
0 - ./data_AsianHate_hate/ChiNazi/tweets5/Egq0l34WsAIbe0i.jpg
0 - ./data_AsianHate_hate/ChiNazi/tweets5/EgqWubYU8AAjivi.jpg
person - ./data_AsianHate_hate/ChiNazi/tweets5/EgrTPQqX0AEQZ7Q.jpg
0 - ./data_AsianHate_hate/ChiNazi/tweets5/EgosvQeU4AAukGa.jpg
0 - ./data_AsianHate_hat

Unnamed: 0,hashtag,tweet_id,image_path,body_text,has_object
0,ChiNazi,1300118461918507008,./data_AsianHate_hate/ChiNazi/tweets5/Egry12bU...,@nytimesworld @kerokero_HKer Who can save thes...,1
1,ChiNazi,1300128001271590912,./data_AsianHate_hate/ChiNazi/tweets5/Egr7VfTU...,so why is he tweeting with an iPhone then?\n #...,1
2,ChiNazi,1300106282037174273,./data_AsianHate_hate/ChiNazi/tweets5/EgrnxMDU...,@AFP How evil can the #Tyranny be?\n\n.\n\n#Ho...,1
3,ChiNazi,1300093617105702915,./data_AsianHate_hate/ChiNazi/tweets5/EgrcOm8X...,@JohnCornyn From IP stealing to violating int’...,0
4,ChiNazi,1299915619404406785,./data_AsianHate_hate/ChiNazi/tweets5/Ego6U62V...,@nytimesworld #AsylumSeekers #HongKongers4No...,1
...,...,...,...,...,...
4869,ChineseVirus,1291908465930252290,./data_AsianHate_hate/ChineseVirus/tweets2/Ee3...,@Acosta Poor little Jimmy he can dish it out b...,0
4870,ChineseVirus,1292469767694168065,./data_AsianHate_hate/ChineseVirus/tweets2/Ee_...,@Acosta The great minds in the world are tryin...,1
4871,ChineseVirus,1292217084626243587,./data_AsianHate_hate/ChineseVirus/tweets2/Ee7...,"2 simalir events.\nOne pro America, police\nOt...",1
4872,ChineseVirus,1292068048724164610,./data_AsianHate_hate/ChineseVirus/tweets2/Ee5...,"Next video of our channel\n\n""Yours SRK""\n@You...",0


In [5]:
df_with_obj = df_read[df_read.has_object.eq(1)]
df_with_obj.drop(columns=['has_object'], inplace=True)
df_with_obj.reset_index(drop=True, inplace=True)

result_cleaned_path = './'+ csv_folder_name + '/data_' + topic_name + '_wtext_nodups_wobj.csv'
df_with_obj.to_csv(result_cleaned_path)

In [6]:
# count the tweets in different hashtags after cleaning
df_with_obj['hashtag'].value_counts()

ChinaVirus             490
BoycottChina           460
CCP_is_terrorist       428
ChiNazi                381
Wuhanvirus             339
CCPVirus               285
ChineseVirus           185
ChinaLiedPeopleDied     89
CCPLiedPeopleDied       46
MakeChinaPay            34
WuhanLab                 8
Name: hashtag, dtype: int64

In [7]:
df_with_obj

Unnamed: 0,hashtag,tweet_id,image_path,body_text
0,ChiNazi,1300118461918507008,./data_AsianHate_hate/ChiNazi/tweets5/Egry12bU...,@nytimesworld @kerokero_HKer Who can save thes...
1,ChiNazi,1300128001271590912,./data_AsianHate_hate/ChiNazi/tweets5/Egr7VfTU...,so why is he tweeting with an iPhone then?\n #...
2,ChiNazi,1300106282037174273,./data_AsianHate_hate/ChiNazi/tweets5/EgrnxMDU...,@AFP How evil can the #Tyranny be?\n\n.\n\n#Ho...
3,ChiNazi,1299915619404406785,./data_AsianHate_hate/ChiNazi/tweets5/Ego6U62V...,@nytimesworld #AsylumSeekers #HongKongers4No...
4,ChiNazi,1300018124503146496,./data_AsianHate_hate/ChiNazi/tweets5/EgqXl0zU...,@breannamorello Too much hypocrites in US. It ...
...,...,...,...,...
2740,ChineseVirus,1291982019795116032,./data_AsianHate_hate/ChineseVirus/tweets2/Ee4...,every time china launches new virus.\n#coronav...
2741,ChineseVirus,1292018403037896704,./data_AsianHate_hate/ChineseVirus/tweets2/Ee4...,#COVID__19 tried and failed! Is there anythin...
2742,ChineseVirus,1292469767694168065,./data_AsianHate_hate/ChineseVirus/tweets2/Ee_...,@Acosta The great minds in the world are tryin...
2743,ChineseVirus,1292217084626243587,./data_AsianHate_hate/ChineseVirus/tweets2/Ee7...,"2 simalir events.\nOne pro America, police\nOt..."


## Extracting text from memes

In [1]:
# specify the topic name here
topic_name = 'Vaccine_hate'

# entry folder for the data
# should be the form of 'data_{topic_name}/'
entry_folder = entry_folder = 'data_' + topic_name + '/'

# csv data folder name
csv_folder_name = 'csv_data_hate'

In [2]:
import pandas as pd

csv_path = './'+ csv_folder_name + '/data_' + topic_name + '_wtext_nodups_wobj.csv'
df_csv = pd.read_csv(csv_path, index_col=0)

In [3]:
from tesseract_ocr import raw_ocr
from tqdm.notebook import tqdm
pd.options.mode.chained_assignment = None  # default='warn'
tqdm.pandas()

df_csv['text_with_OCR'] = df_csv['image_path'].progress_apply(raw_ocr)

  0%|          | 0/2745 [00:00<?, ?it/s]

In [5]:
result_path = './'+ csv_folder_name + '/data_' + topic_name + '_localOCR.csv'
df_csv.to_csv(result_path)

### Cleaning the OCR results

In [1]:
# specify the topic name here
topic_name = 'Vaccine_hate'

# csv data folder name
csv_folder_name = 'csv_data_hate'

In [2]:
import re
import nltk
import spacy
from tqdm.notebook import tqdm
import pandas as pd

tqdm.pandas()

csv_path = './'+ csv_folder_name + '/data_' + topic_name + '_localOCR.csv'
df_csv = pd.read_csv(csv_path, index_col=0)
# df_csv.drop(columns=['image_text_cleaned'], inplace=True)
# df_csv.reset_index(drop=True, inplace=True)
# df_csv.to_csv(csv_path)

In [4]:
# nltk.download('words')
words = set(nltk.corpus.words.words())
def remove_non_english(sent):
    return ' '.join(w for w in nltk.wordpunct_tokenize(sent) \
     if w.lower() in words or not w.isalpha())

In [3]:
nlp = spacy.load('en_core_web_md',disable = ['parser','ner']) # remove three tasks so as to speed up the precess 
nlp.max_length = 1100000

def nlp_preprocess(caption):
    doc = nlp(caption)
    lemmanized_list = []
    lemmanized_phrase = ""
    for token in doc:
        if not token.is_punct and not token.is_stop and not token.is_oov: # check is token is not punctutation stop word and in the nlp vocab
            lemmanized_list.append(token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.text) 
    lemmanized_phrase = ' '.join(lemmanized_list)
    return lemmanized_phrase

In [4]:
# remove line breakers
df_csv['text_with_OCR'] = df_csv['text_with_OCR'].apply(lambda x: x.replace('\n', ' '))
# remove all special characters
# df_csv['text_with_OCR'] = df_csv['text_with_OCR'].apply(lambda x: re.sub(r'[^A-Za-z0-9 ]+', '', x))
# tokenize, remove words that are shorted than two characters ,lemmatizer, removing stopwords, and stemming
df_csv['text_with_OCR'] = df_csv['text_with_OCR'].progress_apply(nlp_preprocess)
# remove any single character
df_csv['text_with_OCR'] = df_csv['text_with_OCR'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>1]))
# remove non-English words
# df_csv['text_with_OCR'] = df_csv['text_with_OCR'].apply(remove_non_english)
# remove words that have digits in them
# df_csv['text_with_OCR'] = df_csv['text_with_OCR'].apply(lambda c: ' '.join(w for w in c.split() if not any(x.isdigit() for x in w)))

  0%|          | 0/3301 [00:00<?, ?it/s]

In [5]:
df_csv['word_count'] = df_csv['text_with_OCR'].apply(lambda x: len(re.findall(r'\w+', x)))
df_csv = df_csv[df_csv.word_count.lt(21)]

In [6]:
def is_nonsense_detection(text):
    total_words = len(re.findall(r'\w+', text))
    # total words equal to 0 may means a failure detection - keep it
    if total_words == 0: return 1
    # if a detectiion contains long word(s), consider it as a valid detection
    long_words = len([word for word in text.split() if len(word)>=5])
    if long_words > 0: return 0
    # if nonsense words dominates the detection, discard it
    invalid_words = len([word for word in text.split() if len(word)<=3])
    invalid_prop = invalid_words / total_words
    if invalid_prop > 0.75 and total_words > 0: return 1
    else: return 0

In [7]:
df_csv['is_invalid'] = df_csv['text_with_OCR'].apply(is_nonsense_detection)
df_csv

Unnamed: 0,hashtag,tweet_id,image_path,body_text,text_with_OCR,word_count,is_invalid
0,antivaxx,1299806915418038272,./data_Vaccine_hate/antivaxx/tweets5/EgnXgEJXg...,“Our landlord just parked this out front... ti...,eta yi vaccine te share story org ww ee ee,10,0
1,antivaxx,1299778565723086848,./data_Vaccine_hate/antivaxx/tweets5/Egm9Rt2Ws...,Inside the .@metpoliceuk cordon at the entranc...,ny te pont oe se ae ea os sy hey mney stock photo,13,0
2,antivaxx,1299781802790457344,./data_Vaccine_hate/antivaxx/tweets5/EgnASaTWk...,Anti-lockdown protest in #London #coronavirus ...,bg po mat eae ga se ps nd,8,1
4,antivaxx,1299781325155643393,./data_Vaccine_hate/antivaxx/tweets5/Egm986JWk...,Anti-lockdown protest in #London #coronavirus ...,fe ae ew masks wile new fascism yo fear ax ss ...,16,0
6,antivaxx,1299889942731071488,./data_Vaccine_hate/antivaxx/tweets5/EgojA81WA...,“My family on insta. Thanks for the advice ......,people announces congratulation solera va baby...,10,0
...,...,...,...,...,...,...,...
3291,FauciLiedPeopleDied,1297670692813635584,./data_Vaccine_hate/FauciLiedPeopleDied/tweets...,@zev_dr #FauciLiedPeopleDied \nThe institution...,ls ad oe lins op,5,1
3292,FauciLiedPeopleDied,1297833285075251202,./data_Vaccine_hate/FauciLiedPeopleDied/tweets...,"Dr. Fauci allowed 17,000 aids patients to die ...",photo ad 1981 know bs yd io na pr fav une,11,0
3294,FauciLiedPeopleDied,1294643614732222464,./data_Vaccine_hate/FauciLiedPeopleDied/tweets...,"Who do you trust more, 🤔 \nDr. Pepper or Dr. ...",sy ee ion,3,1
3295,FauciLiedPeopleDied,1294603353251549184,./data_Vaccine_hate/FauciLiedPeopleDied/tweets...,"He has destroyed many professional, like Dr Ju...",learn wy blau example vis muu ll cht az ee ss ...,12,0


In [8]:
df_csv_cleaned = df_csv[df_csv.is_invalid.eq(0)]
df_csv_cleaned.drop(columns=['word_count', 'is_invalid'], inplace=True)
df_csv_cleaned.reset_index(drop=True, inplace=True)
df_csv_cleaned

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,hashtag,tweet_id,image_path,body_text,text_with_OCR
0,antivaxx,1299806915418038272,./data_Vaccine_hate/antivaxx/tweets5/EgnXgEJXg...,“Our landlord just parked this out front... ti...,eta yi vaccine te share story org ww ee ee
1,antivaxx,1299778565723086848,./data_Vaccine_hate/antivaxx/tweets5/Egm9Rt2Ws...,Inside the .@metpoliceuk cordon at the entranc...,ny te pont oe se ae ea os sy hey mney stock photo
2,antivaxx,1299781325155643393,./data_Vaccine_hate/antivaxx/tweets5/Egm986JWk...,Anti-lockdown protest in #London #coronavirus ...,fe ae ew masks wile new fascism yo fear ax ss ...
3,antivaxx,1299889942731071488,./data_Vaccine_hate/antivaxx/tweets5/EgojA81WA...,“My family on insta. Thanks for the advice ......,people announces congratulation solera va baby...
4,antivaxx,1299782536013508609,./data_Vaccine_hate/antivaxx/tweets5/EgnAydOWs...,Anti-lockdown protest in #London #coronavirus ...,save execute ae ow yes iii sy
...,...,...,...,...,...
1164,FauciLiedPeopleDied,1290234311472537605,./data_Vaccine_hate/FauciLiedPeopleDied/tweets...,@DrAnthonyF knew #Hydroxychloroquneworks but w...,ift lift quarantine oe le ve sa ak trump rally...
1165,FauciLiedPeopleDied,1290648550901264384,./data_Vaccine_hate/FauciLiedPeopleDied/tweets...,WoW! 😡 🤬 Will any sane person (key word sane) ...,1500000 deaths es
1166,FauciLiedPeopleDied,1291389647839531010,./data_Vaccine_hate/FauciLiedPeopleDied/tweets...,"@kenedisco Try google search ""Dr Zelenko Fauci...",lou se ce eh 3h ir le es americas frontline do...
1167,FauciLiedPeopleDied,1297833285075251202,./data_Vaccine_hate/FauciLiedPeopleDied/tweets...,"Dr. Fauci allowed 17,000 aids patients to die ...",photo ad 1981 know bs yd io na pr fav une


In [9]:
result_path = './'+ csv_folder_name + '/data_' + topic_name + '_final.csv'
df_csv_cleaned.to_csv(result_path)

In [10]:
df_csv_cleaned['hashtag'].value_counts()

covidiots                711
Plandemic                203
CovidHoax                 82
FauciTheFraud             47
antivaxx                  46
BillGatesBioTerrorist     27
BillGatesIsNotADoctor     19
antivax                   14
CoronaHoax                11
FauciLiedPeopleDied        6
Scandemic                  2
medicalkidnap              1
Name: hashtag, dtype: int64

## Text Detection using Google Cloud Vision API

In [2]:
result_cleaned_path = './csv_data/sample_data.csv'
df_csv_cleaned = pd.read_csv(result_cleaned_path, index_col=0)

In [3]:
df_csv_cleaned

Unnamed: 0_level_0,hashtag,tweet_id,image_path,sample_path,label,body_text,image_text
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Mask,masksoff,1290868734429560833,./data_Mask/masksoff/tweets1/EeoWRcMUcAkQCPv.jpg,./annotation_data/Mask/EeoWRcMUcAkQCPv.jpg,,@realDonaldTrump Is this the correct way to we...,
Vaccine,CovidHoax,1298333011654533121,./data_Vaccine/CovidHoax/tweets4/EgSYL8-UEAAvm...,./annotation_data/Vaccine/EgSYL8-UEAAvmqv.jpg,,#covidHOAX #PLANDEMIC \n\nWhy FORCE vaccines?\...,
AsianHate,ChinaVirus,1298416220341809153,./data_AsianHate/ChinaVirus/tweets4/EgTmq8KVoA...,./annotation_data/AsianHate/EgTmq8KVoAAhT7M.jpg,,"In a “wartime state” of lockdown, residents in...",
Vaccine,COVID19Vaccine,1297957738069110784,./data_Vaccine/COVID19Vaccine/tweets4/EgNFrqDU...,./annotation_data/Vaccine/EgNFrqDU4AAnVOF.jpg,,@briantylercohen @realDonaldTrump needs Russia...,
Mask,NoMasks,1295274000311099392,./data_Mask/NoMasks/tweets3/Efm8eAkWkAAdDdq.jpg,./annotation_data/Mask/Efm8eAkWkAAdDdq.jpg,,@Uber ...you won't be getting my business from...,
...,...,...,...,...,...,...,...
Vaccine,CovidHoax,1289652181981786114,./data_Vaccine/CovidHoax/tweets1/EeXD00OXsAAHT...,./annotation_data/Vaccine/EeXD00OXsAAHTF9.jpg,,As Unemployment Benefits End Today Trump Admin...,
AsianHate,ChinaVirus,1299044347480928257,./data_AsianHate/ChinaVirus/tweets4/Egch8qDVAA...,./annotation_data/AsianHate/Egch8qDVAAEznVX.jpg,,@TheBrandonMorse @shiroihamusan Told you 😀 \n\...,
AsianHate,CCPVirus,1290292534346883077,./data_AsianHate/CCPVirus/tweets1/EegKN1sXsAEv...,./annotation_data/AsianHate/EegKN1sXsAEvi96.jpg,,@realDonaldTrump Chinese communist party won’t...,
Boomer,trumpliesamericansdie,1299009244696645632,./data_Boomer/trumpliesamericansdie/tweets4/Eg...,./annotation_data/Boomer/EgcCBfHWsAA4WM5.jpg,,.@vp Pence should really stop plagiarizing the...,


In [4]:
# !export GOOGLE_APPLICATION_CREDENTIALS="/Users/anooshaseelam/Downloads/TextDetection-4569c75d7f0e.json"
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/Users/anooshaseelam/Downloads/TextDetection-4569c75d7f0e.json"

In [5]:
from test import detect_text
from tqdm import tqdm
pd.options.mode.chained_assignment = None  # default='warn'
tqdm.pandas()

df_csv_cleaned['image_text'] = df_csv_cleaned['sample_path'].progress_apply(detect_text)

100%|██████████| 2400/2400 [43:48<00:00,  1.10s/it] 


In [6]:
df_csv_cleaned

Unnamed: 0_level_0,hashtag,tweet_id,image_path,sample_path,label,body_text,image_text
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Mask,masksoff,1290868734429560833,./data_Mask/masksoff/tweets1/EeoWRcMUcAkQCPv.jpg,./annotation_data/Mask/EeoWRcMUcAkQCPv.jpg,,@realDonaldTrump Is this the correct way to we...,"\n""10:33\nHG\n12K\n17 8,619\n38.4K\nTweet your..."
Vaccine,CovidHoax,1298333011654533121,./data_Vaccine/CovidHoax/tweets4/EgSYL8-UEAAvm...,./annotation_data/Vaccine/EgSYL8-UEAAvmqv.jpg,,#covidHOAX #PLANDEMIC \n\nWhy FORCE vaccines?\...,"\n""What's\nin vaccines?\nWHY DON'T YOU\nOR YOU..."
AsianHate,ChinaVirus,1298416220341809153,./data_AsianHate/ChinaVirus/tweets4/EgTmq8KVoA...,./annotation_data/AsianHate/EgTmq8KVoAAhT7M.jpg,,"In a “wartime state” of lockdown, residents in...","\n""4\n占口\n南1法站口\nBRE\n扫码查询\n候车信息\n汉口站\n"""
Vaccine,COVID19Vaccine,1297957738069110784,./data_Vaccine/COVID19Vaccine/tweets4/EgNFrqDU...,./annotation_data/Vaccine/EgNFrqDU4AAnVOF.jpg,,@briantylercohen @realDonaldTrump needs Russia...,"\n""SERKO\n"""
Mask,NoMasks,1295274000311099392,./data_Mask/NoMasks/tweets3/Efm8eAkWkAAdDdq.jpg,./annotation_data/Mask/Efm8eAkWkAAdDdq.jpg,,@Uber ...you won't be getting my business from...,"\n""No mask, no ride!\nInbox x\nUber <uber@uber..."
...,...,...,...,...,...,...,...
Vaccine,CovidHoax,1289652181981786114,./data_Vaccine/CovidHoax/tweets1/EeXD00OXsAAHT...,./annotation_data/Vaccine/EeXD00OXsAAHTF9.jpg,,As Unemployment Benefits End Today Trump Admin...,"\n""PC LICE\n35\n"""
AsianHate,ChinaVirus,1299044347480928257,./data_AsianHate/ChinaVirus/tweets4/Egch8qDVAA...,./annotation_data/AsianHate/Egch8qDVAAEznVX.jpg,,@TheBrandonMorse @shiroihamusan Told you 😀 \n\...,"\n""BOYCOTT\nMADE IN CHINA\nUNITED NATIONS FOR ..."
AsianHate,CCPVirus,1290292534346883077,./data_AsianHate/CCPVirus/tweets1/EegKN1sXsAEv...,./annotation_data/AsianHate/EegKN1sXsAEvi96.jpg,,@realDonaldTrump Chinese communist party won’t...,"\n""What CCP did to buy time for the world\nNov..."
Boomer,trumpliesamericansdie,1299009244696645632,./data_Boomer/trumpliesamericansdie/tweets4/Eg...,./annotation_data/Boomer/EgcCBfHWsAA4WM5.jpg,,.@vp Pence should really stop plagiarizing the...,"\n""If You Tell A Lie Big Enough And Keep Repea..."


In [7]:
result_cleaned_path = './csv_data/sample_path_copy.csv'
df_csv_cleaned.to_csv(result_cleaned_path)

### Cleaning of the results from Google API

In [1]:
import re
import spacy
import pandas as pd

csv_path = './csv_data/sample_data_copy.csv'
df_csv = pd.read_csv(csv_path, index_col=0)

In [2]:
nlp = spacy.load('en_core_web_md',disable = ['parser','ner']) # remove three tasks so as to speed up the precess 
nlp.max_length = 1100000

def nlp_preprocess(caption):
    doc = nlp(caption)
    lemmanized_list = []
    lemmanized_phrase = ""
    for token in doc:
        if not token.is_punct and not token.is_stop and not token.is_oov: # check is token is not punctutation stop word and in the nlp vocab
            lemmanized_list.append(token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.text) 
    lemmanized_phrase = ' '.join(lemmanized_list)
    return lemmanized_phrase

In [3]:
df_csv['image_text'] = df_csv['image_text'].astype(str)
# remove line breakers
df_csv['image_text'] = df_csv['image_text'].apply(lambda x: x.replace('\n', ' '))
# remove first and last double quotes
df_csv['image_text'] = df_csv['image_text'].apply(lambda x: if not np.isnan(x): x[1:-1])
# remove non-English words
# df_csv['image_text'] = df_csv['image_text'].apply(remove_non_english)

SyntaxError: invalid syntax (<ipython-input-3-d3a84f94baca>, line 5)

In [None]:
# remove line breakers
df_csv['text_with_OCR'] = df_csv['text_with_OCR'].apply(lambda x: x.replace('\n', ' '))
# remove all special characters
# df_csv['text_with_OCR'] = df_csv['text_with_OCR'].apply(lambda x: re.sub(r'[^A-Za-z0-9 ]+', '', x))
# tokenize, remove words that are shorted than two characters ,lemmatizer, removing stopwords, and stemming
df_csv['text_with_OCR'] = df_csv['text_with_OCR'].progress_apply(nlp_preprocess)
# remove any single character
df_csv['text_with_OCR'] = df_csv['text_with_OCR'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>1]))