In [28]:
# specify the topic name here
topic_name = 'Political'

# entry folder for the data, always of the form 'data_{topic_name}/'.
# Derived from topic_name so the two settings cannot drift apart when the
# topic is changed.
entry_folder = 'data_{}/'.format(topic_name)

In [1]:
import os
import glob
import pandas as pd
from tqdm import tqdm

# Turn off pandas' chained-assignment warning for the whole notebook: later
# cells assign to / drop columns on frames obtained by boolean filtering.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
def get_image_path(image_url, csv_path):
    """Build the local path of a downloaded tweet image.

    The file name is the last '/'-separated segment of ``image_url``; the
    folder is ``csv_path`` with its final four characters (the '.csv'
    suffix of the globbed files) removed.

    Returns a string of the form './<csv_path minus suffix>/<file name>'.
    """
    file_name = image_url.rsplit('/', 1)[-1]
    image_folder = csv_path[:-4]
    return '/'.join(('.', image_folder, file_name))

In [None]:
# Gather every per-hashtag tweet CSV below `entry_folder` into a single frame
# with the columns: hashtag, tweet_id, image_path, body_text.
leading_cols = ['hashtag', 'tweet_id']
frames = []

# os.listdir() / glob.glob() return entries in arbitrary order; sorting keeps
# the row order (and every index derived from it) reproducible across runs.
entries = sorted(os.listdir(entry_folder))
for hashtag in entries:
    if hashtag.startswith('.'):
        # skip hidden entries such as .DS_Store
        continue
    list_files = sorted(glob.glob(entry_folder + hashtag + '/*.csv'))
    tweets_count = 0
    for csv_path in list_files:
        temp_df = (
            pd.read_csv(csv_path)
            .drop(columns=['tweet_url'])
            .rename(columns={'text_data': 'body_text', 'media': 'image_path'})
            .assign(hashtag=hashtag)
        )
        tweets_count += len(temp_df.index)
        # Put hashtag and tweet_id first by *name*, so the result does not
        # depend on the column order inside the CSV; remaining columns keep
        # their original relative order.
        cols = leading_cols + [c for c in temp_df.columns if c not in leading_cols]
        temp_df = temp_df[cols].assign(
            # the images of a CSV live in a folder named after that CSV
            image_path=lambda d: d['image_path'].apply(get_image_path, csv_path=csv_path),
            tweet_id=lambda d: d['tweet_id'].astype('int64'),
        )
        frames.append(temp_df)
    print('{:>24s}: {:>5}'.format(hashtag, tweets_count))

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copies the accumulated frame on every iteration.
if frames:
    df_csv = pd.concat(frames, ignore_index=True)
else:
    # No CSV found: keep the empty, correctly typed frame later cells expect.
    df_csv = pd.DataFrame(columns=['hashtag', 'tweet_id', 'image_path', 'body_text'])
    df_csv['tweet_id'] = df_csv['tweet_id'].astype('int64')
print('Topic: {} - Total tweets: {:>5}'.format(topic_name, len(df_csv.index)))

In [None]:
# Keep only the tweets whose image file is actually present on disk.
path_exists = df_csv['image_path'].apply(os.path.isfile).astype(bool)
df_csv = df_csv[path_exists]

In [None]:
# Renumber the rows 0..n-1 (the old index is discarded), then display the frame.
df_csv = df_csv.reset_index(drop=True)
df_csv

### Filter out images without text

In [None]:
from opencv_text_detector import TextDetector

# Register Series.progress_apply so the per-image pass shows a progress bar.
tqdm.pandas()
detector = TextDetector()

# Store the detector's result for every image in a new `has_text` column.
# NOTE(review): a later cell keeps rows where has_text equals 1 - confirm that
# detect_text returns 1/0 (or True/False).
df_csv = df_csv.assign(
    has_text=df_csv['image_path'].progress_apply(detector.detect_text)
)

df_csv

In [None]:
# Write the current frame to the per-topic CSV under ./csv_data/.
result_path = './csv_data/data_{}.csv'.format(topic_name)
df_csv.to_csv(path_or_buf=result_path)

In [None]:
# Reload the per-topic CSV and keep only the rows flagged as containing text.
result_path = './csv_data/data_{}.csv'.format(topic_name)
df_csv = pd.read_csv(result_path, index_col=0)

df_csv_cleaned = (
    df_csv[df_csv['has_text'].eq(1)]
    .drop(columns=['has_text'])   # the flag has served its purpose
    .reset_index(drop=True)
)

# Persist the filtered frame next to the unfiltered one.
result_cleaned_path = './csv_data/data_{}_cleaned.csv'.format(topic_name)
df_csv_cleaned.to_csv(result_cleaned_path)

In [None]:
# Display the text-filtered frame as the cell output for a visual check.
df_csv_cleaned

In [None]:
# Reload the text-filtered CSV; its first column is the saved row index.
result_cleaned_path = './csv_data/data_{}_cleaned.csv'.format(topic_name)
df_csv_cleaned = pd.read_csv(filepath_or_buffer=result_cleaned_path, index_col=0)

In [None]:
# Number of remaining tweets per hashtag after the text filter.
df_csv_cleaned.hashtag.value_counts()

### Filter out duplicate images

In [None]:
# Reload the text-filtered CSV and collect its image paths as a plain list.
result_cleaned_path = './csv_data/data_{}_cleaned.csv'.format(topic_name)
cleaned_data = pd.read_csv(result_cleaned_path, index_col=0)
image_paths = cleaned_data.image_path.tolist()

In [None]:
from remove_duplicates import duplicate_detector
# Run the project's duplicate detector over the collected image paths.
# NOTE(review): presumably returns the collection of paths to keep - the next
# cell only tests membership (`x in no_duplicate_paths`); confirm the type.
no_duplicate_paths = duplicate_detector(image_paths)

In [None]:
tqdm.pandas()

# Membership is tested once per row; a set makes each lookup O(1) instead of
# scanning the detector's result for every image (quadratic for a list).
kept_paths = set(no_duplicate_paths)

# 1 = image survived duplicate removal, 0 = it was dropped as a duplicate.
cleaned_data['no_dups'] = cleaned_data['image_path'].isin(kept_paths).astype(int)
df_csv_cleaned = (
    cleaned_data[cleaned_data.no_dups.eq(1)]
    .drop(columns=['no_dups'])   # helper flag is not part of the saved data
    .reset_index(drop=True)
)

df_csv_cleaned

In [None]:
# Save the de-duplicated frame under its own file name.
result_cleaned_path = './csv_data/data_{}_cleaned_nodups.csv'.format(topic_name)
df_csv_cleaned.to_csv(path_or_buf=result_cleaned_path)

In [None]:
# Tweets per hashtag in the current cleaned frame.
df_csv_cleaned.hashtag.value_counts()

### Filter out images with no objects

In [None]:
# Reload the de-duplicated CSV as the input of the object-detection stage.
csv_path = './csv_data/data_{}_cleaned_nodups.csv'.format(topic_name)
df_read = pd.read_csv(filepath_or_buffer=csv_path, index_col=0)

In [None]:
from yolo_object_detector import ObjectDetector

# Register Series.progress_apply so the per-image pass shows a progress bar.
tqdm.pandas()
detector = ObjectDetector()

# Store the detector's result for every image in a new `has_object` column.
# NOTE(review): the next cell keeps rows where has_object equals 1 - confirm
# that detect_object returns 1/0 (or True/False).
df_read = df_read.assign(
    has_object=df_read['image_path'].progress_apply(detector.detect_object)
)

df_read

In [None]:
# Keep only the rows flagged as containing an object, drop the flag column
# and renumber the rows.
df_with_obj = (
    df_read[df_read['has_object'].eq(1)]
    .drop(columns=['has_object'])
    .reset_index(drop=True)
)

result_cleaned_path = './csv_data/data_{}_nodups_wobj.csv'.format(topic_name)
df_with_obj.to_csv(result_cleaned_path)

In [None]:
# Number of remaining tweets per hashtag after the object filter.
df_with_obj.hashtag.value_counts()

In [None]:
# Display the object-filtered frame as the cell output for a visual check.
df_with_obj

### Extracting text from memes

In [None]:
import pandas as pd
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import cv2
import pytesseract
# Tesseract options: OCR engine mode 3, page segmentation mode 11, and a
# character whitelist of upper-case letters plus space.
# The whitelist value must follow `=` directly; with a space in between the
# value is split off into a separate argument and the whitelist stays empty.
custom_config = r"--oem 3 --psm 11 -c tessedit_char_whitelist='ABCDEFGHIJKLMNOPQRSTUVWXYZ '"
# Tell pytesseract which tesseract executable to run.
pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract'


# Reload the object-filtered CSV as the input of the OCR stage.
result_cleaned_path = './csv_data/data_{}_nodups_wobj.csv'.format(topic_name)
df_csv_cleaned = pd.read_csv(filepath_or_buffer=result_cleaned_path, index_col=0)

In [None]:
# image_paths = df_csv_cleaned['image_path'].to_list()

In [None]:
from tesseract_ocr import TextRecognition
from tqdm import tqdm

# Silence the chained-assignment warning and enable Series.progress_apply.
pd.options.mode.chained_assignment = None  # default='warn'
tqdm.pandas()

# Run the project's OCR helper on every image path and keep its result in a
# new `text_with_OCR` column.
df_csv_cleaned = df_csv_cleaned.assign(
    text_with_OCR=df_csv_cleaned['image_path'].progress_apply(TextRecognition)
)

In [None]:
# Save the frame, including the OCR column, under its own file name.
result_cleaned_path = './csv_data/data_{}_nodups_wobj_textOCR.csv'.format(topic_name)
df_csv_cleaned.to_csv(path_or_buf=result_cleaned_path)

### Text Extraction from Images

In [3]:
import pandas as pd
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import cv2
import pytesseract
# Tesseract options: OCR engine mode 3, page segmentation mode 11, and a
# character whitelist of upper-case letters plus space.
# The whitelist value must follow `=` directly; with a space in between the
# value is split off into a separate argument and the whitelist stays empty.
custom_config = r"--oem 3 --psm 11 -c tessedit_char_whitelist='ABCDEFGHIJKLMNOPQRSTUVWXYZ '"
# Tell pytesseract which tesseract executable to run.
pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract'
# Reload the object-filtered CSV as the input of the OCR stage.
result_cleaned_path = './csv_data/data_{}_nodups_wobj.csv'.format(topic_name)
df_csv_cleaned = pd.read_csv(filepath_or_buffer=result_cleaned_path, index_col=0)

In [4]:
# Display the frame that is about to be passed through OCR.
df_csv_cleaned

Unnamed: 0,hashtag,tweet_id,image_path,body_text
0,WearAMask,1291524286168014849,./data_Mask/WearAMask/tweets1/EexqAi4UMAA8-pY.jpg,Important to keep a brighter future in mind du...
1,WearAMask,1291516154238967809,./data_Mask/WearAMask/tweets1/EexjF6cWAAMZrVW.jpg,I unanimously declare this is the 2020 uniform...
2,WearAMask,1291515784657698816,./data_Mask/WearAMask/tweets1/EexiweaUEAE_sTp.jpg,"As seen in Sunnyside, Calgary. #WearAMask #yyc..."
3,WearAMask,1291512744366268417,./data_Mask/WearAMask/tweets1/EexerTzX0AAjMdm.jpg,Actor and entertainer @KnoxSkyy sporting a Adi...
4,WearAMask,1291518463144583170,./data_Mask/WearAMask/tweets1/EexlMcWWkAAMTUs.jpg,Public health peeps- let’s make sure we are sh...
...,...,...,...,...
20144,masks,1299496989294563328,./data_Mask/masks/tweets5/Egg3URXU8AEHaBf.jpg,Masks Do Protect the Wearer: Breathing in Less...
20145,masks,1299515063167062016,./data_Mask/masks/tweets5/EgjOC4WWAAMsvDN.jpg,All masks and phones #beijingsubway #subway #M...
20146,masks,1299525164841996289,./data_Mask/masks/tweets5/EgjXPsXWoAEHd8G.jpg,@TIME We're not going to count the number of m...
20147,masks,1299521504661639168,./data_Mask/masks/tweets5/EgjSl_5WsAERsCf.jpg,"Mr. Erin O'Toole @ErinOTooleMP, where are the ..."


In [5]:
from tesseract_ocr import TextRecognition
from tqdm import tqdm

# Silence the chained-assignment warning and enable Series.progress_apply.
pd.options.mode.chained_assignment = None  # default='warn'
tqdm.pandas()

# Run the project's OCR helper on every image path ...
df_csv_cleaned = df_csv_cleaned.assign(
    text_with_OCR=df_csv_cleaned['image_path'].progress_apply(TextRecognition)
)
# ... then turn every newline in the recognised text into a single space.
df_csv_cleaned['text_with_OCR'] = df_csv_cleaned['text_with_OCR'].map(
    lambda ocr_text: ' '.join(ocr_text.split('\n'))
)

100%|██████████| 20149/20149 [2:42:31<00:00,  2.07it/s]  


In [6]:
# Save the frame, including the OCR column, under its own file name.
result_cleaned_path = './csv_data/data_{}_nodups_wobj_textOCR.csv'.format(topic_name)
df_csv_cleaned.to_csv(path_or_buf=result_cleaned_path)

In [7]:
# Display the frame with the new `text_with_OCR` column.
df_csv_cleaned

Unnamed: 0,hashtag,tweet_id,image_path,body_text,text_with_OCR
0,WearAMask,1291524286168014849,./data_Mask/WearAMask/tweets1/EexqAi4UMAA8-pY.jpg,Important to keep a brighter future in mind du...,"aS Pa, AlumGrow (ect e:) WN Be i “Look forw..."
1,WearAMask,1291516154238967809,./data_Mask/WearAMask/tweets1/EexjF6cWAAMZrVW.jpg,I unanimously declare this is the 2020 uniform...,pee fp Le . i eaiiaad fp i / Fa
2,WearAMask,1291515784657698816,./data_Mask/WearAMask/tweets1/EexiweaUEAE_sTp.jpg,"As seen in Sunnyside, Calgary. #WearAMask #yyc...",‘ \S \ ALL HEROES a aa \ kaihero. Protec...
3,WearAMask,1291512744366268417,./data_Mask/WearAMask/tweets1/EexerTzX0AAjMdm.jpg,Actor and entertainer @KnoxSkyy sporting a Adi...,\ sg > nba | adidas 4 i
4,WearAMask,1291518463144583170,./data_Mask/WearAMask/tweets1/EexlMcWWkAAMTUs.jpg,Public health peeps- let’s make sure we are sh...,se | - fl tove y
...,...,...,...,...,...
20144,masks,1299496989294563328,./data_Mask/masks/tweets5/Egg3URXU8AEHaBf.jpg,Masks Do Protect the Wearer: Breathing in Less...,fo} j Py |
20145,masks,1299515063167062016,./data_Mask/masks/tweets5/EgjOC4WWAAMsvDN.jpg,All masks and phones #beijingsubway #subway #M...,—— . yy a a Sy G Uy ag a cor bs ty...
20146,masks,1299525164841996289,./data_Mask/masks/tweets5/EgjXPsXWoAEHd8G.jpg,@TIME We're not going to count the number of m...,ti S FINE
20147,masks,1299521504661639168,./data_Mask/masks/tweets5/EgjSl_5WsAERsCf.jpg,"Mr. Erin O'Toole @ErinOTooleMP, where are the ...","“y= omg. > we J SANS MASQU SANS eh i"" ..."


### Text cleaning

In [30]:
# Reload the OCR results from disk; the first CSV column is the saved index.
result_cleaned_path = './csv_data/data_{}_nodups_wobj_textOCR.csv'.format(topic_name)
df_csv_cleaned = pd.read_csv(filepath_or_buffer=result_cleaned_path, index_col=0)

In [31]:
import re

# One or more consecutive characters that are not ASCII letters or digits.
non_alnum_run = re.compile('[^A-Za-z0-9]+')

# Use pandas' string methods instead of str-method lambdas: an empty OCR
# result is read back from the CSV as NaN, and a NaN float has no `.replace`.
# The `.str` methods leave missing values as missing instead of raising.
ocr_text = df_csv_cleaned['text_with_OCR'].str.replace('\n', ' ', regex=False)
df_csv_cleaned['text_with_OCR'] = ocr_text

# Collapse every run of non-alphanumeric characters into a single space.
df_csv_cleaned['image_text_cleaned'] = ocr_text.str.replace(non_alnum_run, ' ', regex=True)

In [32]:
# Display the frame with the raw and the cleaned OCR text side by side.
df_csv_cleaned

Unnamed: 0,hashtag,tweet_id,image_path,body_text,text_with_OCR,image_text_cleaned
0,trumpliesaboutcoronavirus,1290340052913000448,./data_Political/trumpliesaboutcoronavirus/twe...,"FUCK TRUMP! FUCKING CLOWN, IDIOT, MURDERER!!! ...",* 3 a ae et Ri Ni a mM ~ AFOOT:LIAR ...,3 a ae et Ri Ni a mM AFOOT LIAR imgflip com A...
1,trumpliesaboutcoronavirus,1291377414530985991,./data_Political/trumpliesaboutcoronavirus/twe...,Just followed you! #TheResistance #TrumpIsNotW...,"isn't the real disgrace, that a major American...",isn t the real disgrace that a major American ...
2,trumpliesaboutcoronavirus,1291125570651095046,./data_Political/trumpliesaboutcoronavirus/twe...,RT for RT\nhttps://t.co/hwYlWiaYyB\n98% OFF wi...,WORK AT HOME 7 JOHN CRES NI,WORK AT HOME 7 JOHN CRES NI
3,trumpliesaboutcoronavirus,1292117802241859584,./data_Political/trumpliesaboutcoronavirus/twe...,Then why is he in court trying to destroy the ...,FAKE PRESIDENT ¥ ; r REAL ASSHULE Hard to ...,FAKE PRESIDENT r REAL ASSHULE Hard to argue wi...
4,trumpliesaboutcoronavirus,1292118364463267840,./data_Political/trumpliesaboutcoronavirus/twe...,It's big news if the Administration DROPS ITS ...,"Lon after , thi ron Te eal ea rus eg om :...",Lon after thi ron Te eal ea rus eg om t ie my ...
...,...,...,...,...,...,...
29658,trumpvirus,1299530434863394819,./data_Political/trumpvirus/tweets5/EgjcCw-WkA...,@realDonaldTrump @realDonaldTrump FAILED AMERI...,iL i A,iL i A
29659,trumpvirus,1299527446987898888,./data_Political/trumpvirus/tweets5/EgjZJ85U4A...,@DonaldJTrumpJr @RandPaul It was supposed to b...,DONALD TRUMP a Stochastic Ter. St:,DONALD TRUMP a Stochastic Ter St
29660,trumpvirus,1299498840299626498,./data_Political/trumpvirus/tweets5/Egi_TOEWAA...,Everything about #TrumpsAmerica #RNCConvention...,4 « Happy Han res!,4 Happy Han res
29661,trumpvirus,1299512686607101960,./data_Political/trumpvirus/tweets5/EgjLxF5UwA...,Celebrity Deadpool #RNC2020Convention #TrumpVi...,“% ten * Lo > OS * ‘> y ao “B® ae oN...,ten Lo OS y ao B ae oN 7 x o Xx KA ye a 10 10...


In [33]:
# Write the frame back to the OCR CSV (same file name it was loaded from),
# now including the `image_text_cleaned` column.
result_cleaned_path = './csv_data/data_{}_nodups_wobj_textOCR.csv'.format(topic_name)
df_csv_cleaned.to_csv(path_or_buf=result_cleaned_path)

### Text Detection using Google Cloud Vision API

In [2]:
# Load the sample set; its first CSV column becomes the index.
result_cleaned_path = './csv_data/sample_data.csv'
df_csv_cleaned = pd.read_csv(filepath_or_buffer=result_cleaned_path, index_col=0)

In [3]:
# Display the loaded sample frame as the cell output.
df_csv_cleaned

Unnamed: 0_level_0,hashtag,tweet_id,image_path,sample_path,label,body_text,image_text
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Mask,masksoff,1290868734429560833,./data_Mask/masksoff/tweets1/EeoWRcMUcAkQCPv.jpg,./annotation_data/Mask/EeoWRcMUcAkQCPv.jpg,,@realDonaldTrump Is this the correct way to we...,
Vaccine,CovidHoax,1298333011654533121,./data_Vaccine/CovidHoax/tweets4/EgSYL8-UEAAvm...,./annotation_data/Vaccine/EgSYL8-UEAAvmqv.jpg,,#covidHOAX #PLANDEMIC \n\nWhy FORCE vaccines?\...,
AsianHate,ChinaVirus,1298416220341809153,./data_AsianHate/ChinaVirus/tweets4/EgTmq8KVoA...,./annotation_data/AsianHate/EgTmq8KVoAAhT7M.jpg,,"In a “wartime state” of lockdown, residents in...",
Vaccine,COVID19Vaccine,1297957738069110784,./data_Vaccine/COVID19Vaccine/tweets4/EgNFrqDU...,./annotation_data/Vaccine/EgNFrqDU4AAnVOF.jpg,,@briantylercohen @realDonaldTrump needs Russia...,
Mask,NoMasks,1295274000311099392,./data_Mask/NoMasks/tweets3/Efm8eAkWkAAdDdq.jpg,./annotation_data/Mask/Efm8eAkWkAAdDdq.jpg,,@Uber ...you won't be getting my business from...,
...,...,...,...,...,...,...,...
Vaccine,CovidHoax,1289652181981786114,./data_Vaccine/CovidHoax/tweets1/EeXD00OXsAAHT...,./annotation_data/Vaccine/EeXD00OXsAAHTF9.jpg,,As Unemployment Benefits End Today Trump Admin...,
AsianHate,ChinaVirus,1299044347480928257,./data_AsianHate/ChinaVirus/tweets4/Egch8qDVAA...,./annotation_data/AsianHate/Egch8qDVAAEznVX.jpg,,@TheBrandonMorse @shiroihamusan Told you 😀 \n\...,
AsianHate,CCPVirus,1290292534346883077,./data_AsianHate/CCPVirus/tweets1/EegKN1sXsAEv...,./annotation_data/AsianHate/EegKN1sXsAEvi96.jpg,,@realDonaldTrump Chinese communist party won’t...,
Boomer,trumpliesamericansdie,1299009244696645632,./data_Boomer/trumpliesamericansdie/tweets4/Eg...,./annotation_data/Boomer/EgcCBfHWsAA4WM5.jpg,,.@vp Pence should really stop plagiarizing the...,


In [4]:
# !export GOOGLE_APPLICATION_CREDENTIALS="/Users/anooshaseelam/Downloads/TextDetection-4569c75d7f0e.json"
import os
# Point the process at the service-account key file. setdefault() keeps a value
# that is already configured in the environment and only falls back to the
# original machine-specific path when the variable is unset.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/anooshaseelam/Downloads/TextDetection-4569c75d7f0e.json",
)

In [5]:
# NOTE(review): `test` is also the name of a standard-library package; confirm
# that the project's own test.py is the module being imported here.
from test import detect_text
from tqdm import tqdm

# Silence the chained-assignment warning and enable Series.progress_apply.
pd.options.mode.chained_assignment = None  # default='warn'
tqdm.pandas()

# Run text detection on every sample image and store the result in the
# `image_text` column.
df_csv_cleaned = df_csv_cleaned.assign(
    image_text=df_csv_cleaned['sample_path'].progress_apply(detect_text)
)

100%|██████████| 2400/2400 [43:48<00:00,  1.10s/it] 


In [6]:
# Display the sample frame with the filled `image_text` column.
df_csv_cleaned

Unnamed: 0_level_0,hashtag,tweet_id,image_path,sample_path,label,body_text,image_text
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Mask,masksoff,1290868734429560833,./data_Mask/masksoff/tweets1/EeoWRcMUcAkQCPv.jpg,./annotation_data/Mask/EeoWRcMUcAkQCPv.jpg,,@realDonaldTrump Is this the correct way to we...,"\n""10:33\nHG\n12K\n17 8,619\n38.4K\nTweet your..."
Vaccine,CovidHoax,1298333011654533121,./data_Vaccine/CovidHoax/tweets4/EgSYL8-UEAAvm...,./annotation_data/Vaccine/EgSYL8-UEAAvmqv.jpg,,#covidHOAX #PLANDEMIC \n\nWhy FORCE vaccines?\...,"\n""What's\nin vaccines?\nWHY DON'T YOU\nOR YOU..."
AsianHate,ChinaVirus,1298416220341809153,./data_AsianHate/ChinaVirus/tweets4/EgTmq8KVoA...,./annotation_data/AsianHate/EgTmq8KVoAAhT7M.jpg,,"In a “wartime state” of lockdown, residents in...","\n""4\n占口\n南1法站口\nBRE\n扫码查询\n候车信息\n汉口站\n"""
Vaccine,COVID19Vaccine,1297957738069110784,./data_Vaccine/COVID19Vaccine/tweets4/EgNFrqDU...,./annotation_data/Vaccine/EgNFrqDU4AAnVOF.jpg,,@briantylercohen @realDonaldTrump needs Russia...,"\n""SERKO\n"""
Mask,NoMasks,1295274000311099392,./data_Mask/NoMasks/tweets3/Efm8eAkWkAAdDdq.jpg,./annotation_data/Mask/Efm8eAkWkAAdDdq.jpg,,@Uber ...you won't be getting my business from...,"\n""No mask, no ride!\nInbox x\nUber <uber@uber..."
...,...,...,...,...,...,...,...
Vaccine,CovidHoax,1289652181981786114,./data_Vaccine/CovidHoax/tweets1/EeXD00OXsAAHT...,./annotation_data/Vaccine/EeXD00OXsAAHTF9.jpg,,As Unemployment Benefits End Today Trump Admin...,"\n""PC LICE\n35\n"""
AsianHate,ChinaVirus,1299044347480928257,./data_AsianHate/ChinaVirus/tweets4/Egch8qDVAA...,./annotation_data/AsianHate/Egch8qDVAAEznVX.jpg,,@TheBrandonMorse @shiroihamusan Told you 😀 \n\...,"\n""BOYCOTT\nMADE IN CHINA\nUNITED NATIONS FOR ..."
AsianHate,CCPVirus,1290292534346883077,./data_AsianHate/CCPVirus/tweets1/EegKN1sXsAEv...,./annotation_data/AsianHate/EegKN1sXsAEvi96.jpg,,@realDonaldTrump Chinese communist party won’t...,"\n""What CCP did to buy time for the world\nNov..."
Boomer,trumpliesamericansdie,1299009244696645632,./data_Boomer/trumpliesamericansdie/tweets4/Eg...,./annotation_data/Boomer/EgcCBfHWsAA4WM5.jpg,,.@vp Pence should really stop plagiarizing the...,"\n""If You Tell A Lie Big Enough And Keep Repea..."


In [7]:
# Save the annotated sample frame to a separate file, leaving the loaded
# sample CSV untouched.
result_cleaned_path = './csv_data/sample_path_copy.csv'
df_csv_cleaned.to_csv(path_or_buf=result_cleaned_path)