In [112]:
import spacy
import sqlite3
import pandas as pd
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
import os
from google.cloud import vision
import io
import string
from nltk.corpus import stopwords
from difflib import SequenceMatcher
from textblob import TextBlob
from fuzzywuzzy import process, fuzz
import re
from skimage.feature import graycomatrix, graycoprops
import pytesseract
import cv2

In [96]:
# Build the stop-word list: NLTK's English stop words followed by the extra
# entries read from selected_words.txt (comma-separated words; surrounding
# whitespace and any double quotes are removed from each entry).
stop = stopwords.words('english')
with open('selected_words.txt', 'r') as f:
    stop_words = [
        entry.strip().replace('"', '')
        for row in f
        for entry in row.split(',')
    ]
stop.extend(stop_words)

In [249]:
# Text detection and text-cleaning helpers
def lower_case(df):
    """Lower-case every entry of the ``text`` column.

    The frame is modified in place and is also returned, so the call can be
    used in an assignment chain. Every value in ``df['text']`` must be a
    ``str``: ``str.lower`` is applied directly and raises ``TypeError`` for
    anything else.
    """
    lowered = df['text'].map(str.lower)
    df['text'] = lowered
    return df

def remove_punctuations(df):
    """Delete ASCII punctuation from every entry of the ``text`` column.

    Each entry is split on whitespace, every character listed in
    ``string.punctuation`` is removed from each token, and the tokens are
    re-joined with single spaces. A token that consisted only of punctuation
    becomes an empty string, so it can leave a doubled space inside the
    result; only leading and trailing whitespace is stripped.

    The frame is modified in place and is also returned. Entries must be
    ``str``. A tqdm progress bar is displayed while the rows are processed.
    """
    # The translation table is identical for every row, so build it once
    # instead of once per iteration.
    table = str.maketrans('', '', string.punctuation)

    cleaned_text = []
    # Iterate over the values directly rather than looking each row up by
    # position; the progress bar still counts one step per row.
    for text in tqdm(df['text'], total=df.shape[0]):
        stripped = [token.translate(table) for token in text.split()]
        cleaned_text.append(" ".join(stripped).strip())

    # A plain list is assigned positionally, exactly like the array was,
    # without first materialising a fixed-width NumPy string array.
    df['text'] = cleaned_text
    return df

def remove_null(df):
    """Drop incomplete rows when the ``text`` column has a missing value.

    Only ``text`` is inspected, but once a missing value is found there,
    ``dropna`` is called without a ``subset``, so every row that has a
    missing value in *any* column is removed. When ``text`` has no missing
    values the frame is left untouched, even if other columns do.

    The frame is modified in place and is also returned.
    """
    text_has_missing = df['text'].isnull().any()
    if text_has_missing:
        df.dropna(inplace=True)
    return df

def detect_text(path):
    """Run Google Cloud Vision text detection on the image file at ``path``.

    Returns a list with one string per text annotation in the response; each
    string is a newline followed by the annotation's description wrapped in
    double quotes. Raises ``Exception`` when the response carries an error
    message.
    """
    client = vision.ImageAnnotatorClient()

    with io.open(path, 'rb') as image_file:
        image = vision.Image(content=image_file.read())

    response = client.text_detection(image=image)

    error_message = response.error.message
    if error_message:
        raise Exception(
            '{}\nFor more info on error messages, check: '
            'https://cloud.google.com/apis/design/errors'.format(
                error_message))

    return ['\n"{}"'.format(annotation.description)
            for annotation in response.text_annotations]

def removeStop(temp_df, stopwords=stop):
    """Remove stop words from every entry of the ``text`` column.

    Each entry is split on whitespace, the words found in ``stopwords`` are
    discarded, and the remaining words are re-joined with single spaces.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Frame with a ``text`` column of strings; modified in place.
    stopwords : iterable of str, optional
        Words to remove. Defaults to the module-level ``stop`` list as it was
        bound when this function was defined.

    Returns
    -------
    pandas.DataFrame
        ``temp_df`` itself, so the call can be chained like the other
        cleaning helpers. Earlier code that ignored the return value keeps
        working because the frame is still updated in place.
    """
    # Honour the argument (the previous body always read the global ``stop``)
    # and use a set so each membership test is O(1).
    excluded = frozenset(stopwords)
    temp_df['text'] = temp_df['text'].apply(
        lambda x: ' '.join(word for word in x.split() if word not in excluded))
    return temp_df

def cleanDataframeEnglish(df, column_name):
    """Keep only the ASCII alphanumeric entries of ``df[column_name]``.

    An entry is kept when it is a non-empty string made solely of ASCII
    letters and digits. Empty strings, whitespace, entries containing
    punctuation or non-ASCII characters, and non-string cells (for example
    ``None``/NaN) are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    column_name : str
        Name of the column holding the candidate words.

    Returns
    -------
    pandas.DataFrame
        A new frame with a single ``words`` column and a fresh 0..n-1 index,
        listing the kept entries in their original order.
    """
    # ``isalnum`` is already False for empty and whitespace-only strings, so
    # no separate ``isspace`` test is needed.
    kept_words = [
        word for word in df[column_name]
        if isinstance(word, str) and word.isascii() and word.isalnum()
    ]
    # Build the frame once; concatenating one row per iteration is quadratic.
    return pd.DataFrame({'words': pd.Series(kept_words, dtype=object)},
                        columns=['words'])

def executeText(path):
    """Extract a cleaned list of words from the image at ``path``.

    Pipeline: OCR via ``detect_text`` -> whitespace tokenisation ->
    punctuation removal -> lower-casing -> null removal -> stop-word removal
    -> keep only ASCII alphanumeric tokens.

    Returns
    -------
    dict
        ``{'Word List': [...]}`` with the surviving tokens in reading order.
        The list is empty when no text annotation was returned for the image.
    """
    image_text = detect_text(path)
    if not image_text:
        # Nothing was detected; indexing image_text[0] would raise IndexError.
        return {'Word List': []}

    # Only the first annotation is used.
    # NOTE(review): presumably the Vision API puts the complete detected text
    # in the first annotation - confirm against the API documentation.
    tokens = image_text[0].replace("\n", " ").split()
    temp_df = pd.DataFrame({'text': tokens})

    temp_df = remove_punctuations(temp_df)
    temp_df = lower_case(temp_df)
    temp_df = remove_null(temp_df)
    # removeStop updates the frame in place; pass the current ``stop`` list
    # explicitly so the words in effect at call time are the ones removed.
    removeStop(temp_df, stop)

    new_df = cleanDataframeEnglish(temp_df, 'text')
    return {'Word List': new_df['words'].tolist()}

In [250]:
# Run the text pipeline on the sample image. Despite its name, ``word_list``
# holds the dict returned by executeText ({'Word List': [...]}), not a list.
word_list = executeText("Accept-SP_Tablet2 (1).jpg")
word_list


100%|██████████| 84/84 [00:00<00:00, 57587.70it/s]


{'Word List': ['acceptsp', '20000']}

In [199]:

def color_moments(image_path):
    """Compute normalised raw image moments for each channel of an image.

    For every channel produced by ``cv2.split`` the raw moments
    ``m00, m01, m02, m10, m11, m20`` (all ``m{i}{j}`` with ``i + j <= 2``)
    are read from ``cv2.moments`` and divided by that channel's ``m00``, so
    the first value of each channel is 1.0 whenever ``m00`` is non-zero.

    NOTE(review): these are the moments reported by ``cv2.moments`` rather
    than per-channel mean/variance/skewness statistics - confirm this is the
    intended meaning of "colour moments".

    Parameters
    ----------
    image_path : str
        Path of the image file to load with ``cv2.imread``.

    Returns
    -------
    dict
        ``{'Colour Features': [...]}`` with six values per channel, channels
        in the order returned by ``cv2.split``.

    Raises
    ------
    OSError
        If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError('Could not read image: {!r}'.format(image_path))

    moment_keys = ['m{}{}'.format(i, j)
                   for i in range(3)
                   for j in range(3)
                   if i + j <= 2]

    colour_features = []
    for channel in cv2.split(img):
        moments = cv2.moments(channel)
        m00 = moments['m00']
        for key in moment_keys:
            # A zero m00 would otherwise raise ZeroDivisionError; report 0.0
            # for every moment of such a channel.
            colour_features.append(moments[key] / m00 if m00 else 0.0)

    returnDict = {'Colour Features': colour_features}
    return returnDict


def texture_features(image_path):
    """Compute grey-level co-occurrence matrix (GLCM) texture descriptors.

    The image is loaded, converted to greyscale, and a symmetric, normalised
    GLCM is built for distance 1 and angle 0. Five ``graycoprops`` properties
    are then read from it.

    Parameters
    ----------
    image_path : str
        Path of the image file to load with ``cv2.imread``.

    Returns
    -------
    dict
        Keys ``'Contrast'``, ``'Dissimilarity'``, ``'Homogeneity'``,
        ``'Energy'`` and ``'Coorelation'``, each mapped to a scalar.

    Raises
    ------
    OSError
        If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError('Could not read image: {!r}'.format(image_path))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    glcm = graycomatrix(gray, distances=[1], angles=[0], symmetric=True, normed=True)

    # (output key, graycoprops property name). 'Coorelation' is misspelled but
    # is kept unchanged because the key is part of the returned dict.
    properties = [
        ('Contrast', 'contrast'),
        ('Dissimilarity', 'dissimilarity'),
        ('Homogeneity', 'homogeneity'),
        ('Energy', 'energy'),
        ('Coorelation', 'correlation'),
    ]

    # One distance and one angle were requested, so the first flattened
    # element is the only value graycoprops returns for each property.
    returnDict = {label: graycoprops(glcm, prop).flatten()[0]
                  for label, prop in properties}
    return returnDict

def shape_features(image_path):
    """Summarise the external contours of an image's thresholded version.

    The image is converted to greyscale, binarised with
    ``THRESH_BINARY + THRESH_OTSU``, and its external contours are extracted.
    For every contour the area, perimeter, bounding-box aspect ratio
    (width / height) and bounding-box centre are measured; the mean and
    standard deviation of each measurement over all contours are returned.

    Note that the "centroid" values are the centres of the bounding
    rectangles (``x + w/2``, ``y + h/2``).

    Parameters
    ----------
    image_path : str
        Path of the image file to load with ``cv2.imread``.

    Returns
    -------
    dict
        ``'Mean <X>'`` and ``'Std <X>'`` for ``<X>`` in Area, Perimeter,
        Aspect Ratio, Centroid X, Centroid Y. Every value is 0.0 when no
        contour is found.

    Raises
    ------
    OSError
        If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError('Could not read image: {!r}'.format(image_path))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Threshold the image to create a binary image
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # findContours returns (contours, hierarchy) in OpenCV 4 but
    # (image, contours, hierarchy) in OpenCV 3; the contours are the
    # second-to-last item in both layouts.
    contours = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

    # Insertion order fixes the key order of the returned dict.
    measurements = {
        'Area': [],
        'Perimeter': [],
        'Aspect Ratio': [],
        'Centroid X': [],
        'Centroid Y': [],
    }
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        measurements['Area'].append(cv2.contourArea(contour))
        measurements['Perimeter'].append(cv2.arcLength(contour, True))
        measurements['Aspect Ratio'].append(w / h)
        measurements['Centroid X'].append(x + w / 2)
        measurements['Centroid Y'].append(y + h / 2)

    returnDict = {}
    for label, values in measurements.items():
        # np.mean/np.std of an empty list yield NaN plus a RuntimeWarning, so
        # fall back to 0.0 when no contour was found.
        returnDict['Mean ' + label] = np.mean(values) if values else 0.0
        returnDict['Std ' + label] = np.std(values) if values else 0.0

    # Return the computed features
    return returnDict

def pattern_features(image_path):
    """Compute the Shannon entropy (in bits) of an image's grey histogram.

    The image is converted to greyscale, a 256-bin histogram over [0, 256)
    is built and normalised to sum to 1, and ``-sum(p * log2(p + eps))`` is
    evaluated, where ``eps`` is the float64 machine epsilon that keeps empty
    bins from producing ``log2(0)``.

    Parameters
    ----------
    image_path : str
        Path of the image file to load with ``cv2.imread``.

    Returns
    -------
    dict
        ``{'Entropy': value}`` with ``value`` a Python ``float``.

    Raises
    ------
    OSError
        If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError('Could not read image: {!r}'.format(image_path))
    # Convert image to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Compute the normalized histogram of the image. The counts are promoted
    # to float64 first so the result's precision and dtype do not depend on
    # how NumPy promotes the histogram array against the float64 ``eps``.
    hist = cv2.calcHist([gray], [0], None, [256], [0, 256])
    counts = hist.astype(np.float64).ravel()
    hist_norm = counts / np.sum(counts)

    # Compute the entropy of the histogram
    eps = np.finfo(float).eps
    entropy = -np.sum(hist_norm * np.log2(hist_norm + eps))

    returnDict = {'Entropy': float(entropy)}

    # Return the computed feature
    return returnDict





In [196]:
# Compute the GLCM texture descriptors for the sample image and print the
# resulting dict.
texture = texture_features("Accept-SP_Tablet2 (1).jpg")
print(texture)

{'Contrast': 615.0416337025316, 'Dissimilarity': 9.433023470464136, 'Homogeneity': 0.5019732088818553, 'Energy': 0.13262236915429243, 'Coorelation': 0.839205042376685}


In [251]:
def executeFeatures(image_path):
    """Gather every feature group for one image into a single dict.

    The extractors are called in this order: ``executeText``,
    ``color_moments``, ``texture_features``, ``shape_features`` and
    ``pattern_features``. Each receives ``image_path`` and its result dict is
    stored under the keys ``'Text'``, ``'Color Moments'``, ``'Texture'``,
    ``'Shape'`` and ``'Pattern'`` respectively.
    """
    # Dict displays evaluate their values left to right, so the extractors
    # run in the order listed.
    return {
        'Text': executeText(image_path),
        'Color Moments': color_moments(image_path),
        'Texture': texture_features(image_path),
        'Shape': shape_features(image_path),
        'Pattern': pattern_features(image_path),
    }

In [252]:
# Extract all feature groups (text, colour moments, texture, shape, pattern)
# for the sample image and display the combined dict.
executeFeatures("Accept-SP_Tablet2 (1).jpg")

100%|██████████| 84/84 [00:00<00:00, 55431.33it/s]


{'Text': {'Word List': ['acceptsp', '20000']},
 'Color Moments': {'Colour Features': [1.0,
   241.82269660016618,
   77670.99784300446,
   158.35815420887445,
   38165.005853283306,
   34035.683194369456,
   1.0,
   239.73730339309432,
   76878.92557922703,
   158.2292454910792,
   37815.985687643224,
   33985.5850936557,
   1.0,
   242.4240522970985,
   78014.78238623713,
   158.32791008105244,
   38241.26631223746,
   34017.41576882488]},
 'Texture': {'Contrast': 615.0416337025316,
  'Dissimilarity': 9.433023470464136,
  'Homogeneity': 0.5019732088818553,
  'Energy': 0.13262236915429243,
  'Coorelation': 0.839205042376685},
 'Shape': {'Mean Area': 151364.0,
  'Std Area': 0.0,
  'Mean Perimeter': 1590.0,
  'Std Perimeter': 0.0,
  'Mean Aspect Ratio': 0.6604166666666667,
  'Std Aspect Ratio': 0.0,
  'Mean Centroid X': 158.5,
  'Std Centroid X': 0.0,
  'Mean Centroid Y': 240.0,
  'Std Centroid Y': 0.0},
 'Pattern': {'Entropy': 5.4311476}}