In [1]:
import pytesseract
pytesseract.pytesseract.tesseract_cmd=r"C:\Program Files\Tesseract-OCR\tesseract.exe"

In [6]:
import cv2
from PIL import Image
import os
import pandas as pd
import matplotlib.pyplot as plt
from transformers import pipeline
import re,string

### Extracting text for one meme

In [7]:
image = Image.open(r"memes/2.png")

text=pytesseract.image_to_string(image)
text

"A42sky42 said\n\nMy English teacher says we shouldn't\nrefer to authors by their first names\nbecause they aren't our friends. Will you\nconfirm our friendship and let me call\nyou Neil on my American Gods book\nreport?\n\n(2) neil-gaiman\n\nAbsolutely.\n"

## Functions

The first function, fetch_memes(), takes a directory path as an input and reads in all the images in that directory using the cv2 library. It then extracts text from each image using the pytesseract library and appends the extracted text to a list. Finally, it returns a Pandas DataFrame containing the list of extracted texts.

The second function, standardization(), takes the DataFrame returned by fetch_memes and performs several standardization steps on the text. First, it converts all text to lowercase using the lower() method. Next, it removes all numbers using a regular expression (re.sub(r'\d+', '', x)). It then replaces all newline and tab characters with a space. It also removes any occurrences of .com from the start or end of the text string. Finally, it removes all punctuation using the translate() method and returns the standardized DataFrame.

The third function, label_memes(), takes the standardized DataFrame returned by standardization and adds a new column called labels. For each meme in the DataFrame, it uses a pre-trained sentiment analysis model (jayanta/ProsusAI-finbert-sentiment-finetuned-memes) to classify the meme as either positive or negative. It then appends the label to a list and adds that list as a new column to the DataFrame. The function returns the labeled DataFrame.

In [13]:
def fetch_memes(directory_in_str):
    directory = os.fsencode(directory_in_str)
    list_of_texts=[]
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        img=cv2.imread(os.path.join(os.fsdecode(directory),filename))
        text=pytesseract.image_to_string(img)
        list_of_texts.append(text)

    data = pd.DataFrame(list_of_texts)
    
    return data

        
def standardization(data):
    data = data[0]
    data = data.apply(lambda x: x.lower()) #converting to lowercase
    data = data.apply(lambda x: re.sub(r'\d+', '', x))
    data = data.apply(lambda x: re.sub(r'\n', ' ', x))
    data = data.apply(lambda x: re.sub(r'\t', ' ', x))
    data = data.apply(lambda x: re.sub(r'.com', '', x, flags=re.MULTILINE)) #removing .coms from the start or end of the text string. 
    # flags=re.MULTILINE is a flag that allows the ^ and $ characters to match the start and end of each line
    data = data.apply(lambda x: x.translate(str.maketrans('', '', string.punctuation))) #removes all punctuations
    data = pd.DataFrame(data)
    return data

def label_memes(data):
    labels=[]
    classifier = pipeline('sentiment-analysis',model="jayanta/ProsusAI-finbert-sentiment-finetuned-memes")
    for meme in data[0]:
        l=classifier(meme)[0]['label']
        if l=='0':
            labels.append('Positive')
        else:
            labels.append('Negative')
    
    data['labels']=labels
    return data


### HuggingFace functions

In [None]:
def standard(text):
    x = text.lower()
    x = re.sub(r'\d+', '', x)
    x = re.sub(r'\n', ' ', x)
    x = re.sub(r'\t', ' ', x)
    x = re.sub(r'.com', '', x, flags=re.MULTILINE)
    x = x.translate(str.maketrans('', '', string.punctuation))
    
    return x
    
def predict(text):
    
    output=dict()
    classifier = pipeline('sentiment-analysis',model="jayanta/ProsusAI-finbert-sentiment-finetuned-memes")
    l=classifier(text)[0]['label']
    p=classifier(text)[0]['score']
    
    if l=='0':
        output['Positive']=p
    else:
        output['Negative']=p

    return output

In [14]:
def OCR(directory_in_str):
    data = fetch_memes(directory_in_str)
    df = standardization(data)
    labels = label_memes(df)
    return labels

In [15]:
Result=OCR(r"memes")

In [16]:
classifier = pipeline('sentiment-analysis',model="jayanta/ProsusAI-finbert-sentiment-finetuned-memes")
classifier(text)

[{'label': '0', 'score': 0.9999703168869019}]

## Output

In [17]:
Result

Unnamed: 0,0,labels
0,ratatouille cook me dinner you fucking useles...,Negative
1,asky said my english teacher says we shouldnt...,Positive
2,here is a new challenge for all you bored teen...,Positive
3,took my year old to a classmate’s birthday pa...,Negative
4,my grandpa who knows nothing about video games...,Negative
5,go to the same gas station everyday after wo...,Positive
6,myatose fathers feeding tube before ramadan,Positive
7,javell tm jvizzle d how is it “get over slave...,Negative
8,enough about my racist past,Negative
9,found my clone selling nuts in istanbul we c...,Negative


In [None]:
# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [None]:
# encodings=tokenizer(Xp,truncation=True,padding=True)