## Create a dict containing A-file data with URLs

In [None]:
import boto3
import pandas as pd
import requests

# import cv2
import urllib.request
from urllib.request import urlretrieve
import matplotlib.pyplot as plt

'''
Goals  - Done
1. iterate through index.json
  1. just iterate through the keys
  2. (full url contains the whole thing)
2. check image downloads and see if you're able to visualize the images
3. try running a prediction run to see if its working properly

'''

# Fetch the collection index. The code below iterates `resp` and reads the
# 'id' and 'full_url' fields of every entry.
index_url = "https://nyu-dss.github.io/aperitiiif-batch-migrants-state/index.json"
index_response = requests.get(index_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
index_response.raise_for_status()
resp = index_response.json()


'''
lets create a structure:
  a-file id
    page
      full_url
      pageno
'''
# Group the index entries by A-file id:
#   aline_dict[afile_id] = {'pages': [{'pno': ..., 'full_url': ...}, ...], 'redacted': bool}
aline_dict = dict()
# Page numbers already recorded per A-file, so a repeated page is detected directly.
seen_pnos = dict()

for page in resp:
  # The id is split on '_': the first part is used as the A-file id, the last part as
  # the page number, and a three-part id whose middle part is 'redacted' marks the
  # page as redacted.
  info = page['id'].split('_')
  afile_id = info[0]
  is_redacted = len(info) == 3 and info[1] == 'redacted'
  if afile_id not in aline_dict:
    aline_dict[afile_id] = dict(pages=[], redacted=is_redacted)
    seen_pnos[afile_id] = set()
  elif aline_dict[afile_id]['redacted'] != is_redacted:
    # The redacted flag is fixed by the first page seen for the A-file; a later page
    # that disagrees means the index holds both versions of this document.
    print("looks like pdf has redacted and non redacted",info)
  pno = int(info[-1])
  # Membership must be tested against the recorded page numbers, not against the
  # entry dict itself (whose keys are only 'pages' and 'redacted').
  if pno in seen_pnos[afile_id]:
    print("Pno present",info)
  else:
    seen_pnos[afile_id].add(pno)
    aline_dict[afile_id]['pages'].append({'pno' : pno, 'full_url':page['full_url']})

# Flatten the per-A-file grouping into one record per page. Later cells address
# pages by their position in index_list.
index_list = [
  {'redacted': entry['redacted'], 'pno': pg['pno'], 'full_url': pg['full_url'], 'afile': afile_id}
  for afile_id, entry in aline_dict.items()
  for pg in entry['pages']
]

len(index_list)



### OCR Writing Code with Textract

In [None]:
# S3 bucket the helper functions upload page images to (and delete them from)
bucket_name = 'mats-datadump'

# AWS clients shared by the helper functions below
s3 = boto3.client('s3')
textract = boto3.client('textract', region_name='us-east-1')


In [None]:
import json
import os 

def upload_image_to_s3(image_url):
    """Download the image at ``image_url`` and store it in the S3 bucket ``bucket_name``.

    The object key is the sixth '/'-separated component of the URL (index 5),
    the same component ``dump_json`` uses to name its output file.

    Returns:
        The S3 object key the image was stored under.

    Raises:
        requests.HTTPError: the download returned an error status. Raising here
            keeps an error page from being uploaded in place of the image.
    """
    # A timeout keeps one stalled download from blocking a worker forever.
    response = requests.get(image_url, timeout=30)
    response.raise_for_status()
    image_key = image_url.split('/')[5]

    s3.put_object(Bucket=bucket_name, Key=image_key, Body=response.content)
    return image_key

def delete_image_from_s3(bucket_name, object_key):
    """Delete ``object_key`` from the S3 bucket ``bucket_name``.

    Uses the module-level ``s3`` client, as ``upload_image_to_s3`` does, rather
    than constructing a new boto3 client on every call.
    """
    s3.delete_object(Bucket=bucket_name, Key=object_key)

def dump_json(file_url, json_file):
    """Write ``json_file`` as JSON to ``<cwd>/ocr_dump/<id>_textract.json``.

    ``<id>`` is the sixth '/'-separated component of ``file_url`` (index 5).
    The ``ocr_dump`` directory is created when it does not exist yet.

    Returns:
        The path of the file that was written.
    """
    dump_dir = os.path.join(os.getcwd(), "ocr_dump")
    # exist_ok makes this safe to call repeatedly (and from several threads).
    os.makedirs(dump_dir, exist_ok=True)
    savepath = os.path.join(dump_dir, f"{file_url.split('/')[5]}_textract.json")
    with open(savepath, 'w') as outfile:
        json.dump(json_file, outfile)
    return savepath


def invoke_textract(image_key):
    """Run Textract text detection on ``image_key`` in ``bucket_name``; return the raw response."""
    # Only detect_document_text is used. An analyze_document call with
    # FeatureTypes TABLES / FORMS / SIGNATURES / LAYOUT is a possible
    # alternative that is not used here.
    s3_document = {'S3Object': {'Bucket': bucket_name, 'Name': image_key}}
    return textract.detect_document_text(Document=s3_document)


def detect_text_only(response):
    """Return the text of every LINE block in ``response['Blocks']``, joined by single spaces."""
    return ' '.join(
        blk['Text'] for blk in response['Blocks'] if blk['BlockType'] == 'LINE'
    )


def detect_text_from_s3(image_key, text_only = False):
    """OCR an S3 object through ``invoke_textract`` and collect its LINE blocks.

    Returns a 3-tuple:
        * the text of all LINE blocks joined by spaces,
        * a list of ``{'Text', 'BoundingBox', 'Polygon'}`` dicts, one per LINE
          block (left empty when ``text_only`` is true),
        * the unmodified Textract response.
    """
    response = invoke_textract(image_key = image_key)

    line_blocks = [blk for blk in response['Blocks'] if blk['BlockType'] == 'LINE']
    detected_text = [blk['Text'] for blk in line_blocks]

    text_data = []
    if not text_only:
        for blk in line_blocks:
            # Geometry is optional; fall back to an empty box / polygon.
            geometry = blk.get('Geometry', {})
            text_data.append({
                'Text': blk['Text'],
                'BoundingBox': geometry.get('BoundingBox', {}),
                'Polygon': geometry.get('Polygon', []),
            })

    return ' '.join(detected_text), text_data, response

def invoke_textract_on_url(image_url):
    '''
    1. uploads the image behind image_url to S3
    2. invokes Textract on the uploaded object
    3. deletes the object from S3 again (for storage concerns), even when
       Textract raised

    Returns (detected_text, text_coords, response) as produced by
    detect_text_from_s3. Saving the response locally is left to the caller.
    '''

    image_key = upload_image_to_s3(image_url)
    try:
        detected_text, text_coords, response = detect_text_from_s3(image_key=image_key, text_only=False)
    finally:
        # Clean up in every case so a failed OCR call does not leave the image behind.
        delete_image_from_s3(bucket_name, image_key)
    return detected_text, text_coords, response



In [None]:
import time 
from tqdm import tqdm

from joblib import Parallel, delayed
import time
# df = list()

# for i in tqdm(range(20)):
#     url = index_list[i]['full_url']
#     detected_text, text_coords, response = invoke_textract_on_url(url)
#     savepath = dump_json(url, response)
#     df.append([index_list[i]['afile'], index_list[i]['pno'], index_list[i]['redacted'], index_list[i]['full_url'], detected_text, savepath])

# df = pd.DataFrame(df, columns = ['Afile', 'Page', 'Is_Redacted', 'url', 'Detected Text', 'Ocr_Dump_path'])

# print(f"Time taken {time.time() - st}")


def invoke_ocr_serial(i):
    """OCR page ``i`` of ``index_list`` and return its result row.

    The row is ``[i, afile, pno, redacted, full_url, detected_text, savepath]``.
    Any exception is printed and the last two fields are returned as None, so
    one bad page does not abort the surrounding batch.
    """
    def build_row(text, dump_path):
        meta = index_list[i]
        return [i, meta['afile'], meta['pno'], meta['redacted'], meta['full_url'], text, dump_path]

    try:
        # Short pause before every request.
        time.sleep(0.3)
        page_url = index_list[i]['full_url']
        ocr_text, _coords, raw_response = invoke_textract_on_url(page_url)
        return build_row(ocr_text, dump_json(page_url, raw_response))
    except Exception as e:
        print(i, "Skipped over error",e)
        return build_row(None, None)
    
# Run OCR over index_list in chunks of 1000 pages, 20 threads per chunk, and
# write every chunk to its own ocr_dump_<st>_<ed>.csv.
# NOTE(review): the range starts at 1000, so entries 0-999 are not processed by
# this loop -- presumably they were handled in an earlier run; confirm.
for st in range(1000, len(index_list), 1000):
    start = time.time()
    ed = min(st + 1000, len(index_list))
    print("Running OCR FOR:",st,ed)
    jobs = (delayed(invoke_ocr_serial)(i) for i in range(st, ed))
    results = Parallel(n_jobs=20, backend='threading')(jobs)
    df = pd.DataFrame(
        results,
        columns=['idx', 'Afile', 'Page', 'Is_Redacted', 'url', 'Detected Text', 'Ocr_Dump_path'],
    )

    print("Time taken",time.time() - start)
    df.to_csv(f"ocr_dump_{st}_{ed}.csv",index=False)



Concatenating the results

In [None]:
import pandas as pd
import os
from pathlib import Path


import re
def extract_number(filename):
    """Return the first integer following ``ocr_dump_`` in ``filename``, or None if absent.

    The digits must be followed by an underscore, e.g. ``ocr_dump_1000_2000.csv`` -> 1000.
    """
    found = re.search(r'ocr_dump_(\d+)_', filename)
    if not found:
        return None
    return int(found.group(1))


# Collect the per-chunk CSVs (ocr_dump_<start>_<end>.csv) from the working
# directory. Only names that end in .csv AND carry a chunk start number are
# kept: a bare 'ocr_dump_' substring test also admits stray files whose sort
# key is None, which makes sorted() raise TypeError.
csv_files = []
for f in os.listdir():
    if f.endswith('.csv') and extract_number(f) is not None:
        csv_files.append(os.path.join(os.getcwd(), f))


# Sort by chunk start number. The key is computed on the file name only, so a
# directory in the path that happens to match the pattern cannot affect the order.
sorted_filenames = sorted(csv_files, key=lambda p: extract_number(os.path.basename(p)))
sorted_filenames


# Directory the combined CSV is written to
path_to_csv_files = Path("./")

# Read and concatenate all chunk CSVs, in chunk order, into one dataframe
combined_csv = pd.concat([pd.read_csv(f) for f in sorted_filenames])

# Save the combined CSV to a single file
combined_csv_path = path_to_csv_files / "combined_csv.csv"
combined_csv.to_csv(combined_csv_path, index=False)

# Show the path of the saved combined CSV file
combined_csv_path

### Plotting Script

In [None]:
import numpy as np
import pandas as pd
import time
from pathlib import Path
import sys
# from torchvision import transforms
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, f1_score, recall_score

import os
import itertools
import shutil
import PIL
import torch
from torch import nn
# from torchvision import transforms
# from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import PIL.Image as Image
import time
from io import BytesIO
import pickle
from tqdm import tqdm
import random
import time
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import PIL
from sklearn.neighbors import NearestNeighbors
import glob
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from matplotlib.cbook import get_sample_data
from sklearn.preprocessing import LabelEncoder
from argparse import ArgumentParser


def download_preprocess_image(url, retries=1, retry_delay=10, timeout=5):
    """Download ``url`` and open it as a PIL image.

    Args:
        url: address of the image.
        retries: how many extra attempts to make after an HTTP error status
            (e.g. a 502). The default of 1 matches the single retry made before.
        retry_delay: seconds to wait before each retry.
        timeout: per-request timeout in seconds.

    Returns:
        The opened image, or None when the download or decoding failed. A
        failing retry also yields None instead of letting the exception escape.
    """
    for attempt in range(retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx
            return Image.open(BytesIO(response.content))
        except requests.HTTPError as http_err:
            print(f"HTTP error occurred: {http_err}")
            if attempt < retries:
                print("Retrying")
                time.sleep(retry_delay)
        except Exception as err:
            # Timeouts, connection errors, undecodable image data: not retried.
            print(f"An error occurred: {err}")
            return None
    return None

def get_matrix(MODEL_PATH, DATA_PATH):
    """Return the ``(embeddings, filenames)`` pair.

    NOTE(review): neither parameter is used and both returned names must
    already exist at module level -- the body that computes them appears to be
    missing; confirm.
    """
    matrix_and_names = (embeddings, filenames)
    return matrix_and_names


class TSNE_visualiser:
    """Fit t-SNE on a list of embeddings and save scatter / image plots of the result.

    Attributes:
        feature_list: embeddings handed to ``TSNE.fit_transform``.
        filenames: one identifier per embedding. The image plots use each
            identifier as a position in the module-level ``index_list`` to
            look up the image URL.
    """

    def __init__(self, feature_list, filenames):
        """Store the embeddings and their identifiers."""
        self.feature_list = feature_list
        self.filenames = filenames

    def fit_tsne(self, perplexity=30, n_jobs=4):
        """Fit a 2-component t-SNE on ``self.feature_list``.

        Args:
            perplexity: t-SNE perplexity hyperparameter.
            n_jobs: number of jobs to run concurrently.

        Returns:
            The array returned by ``TSNE.fit_transform``.
        """
        time_start = time.time()
        # NOTE(review): newer scikit-learn releases rename ``n_iter`` to
        # ``max_iter`` -- confirm against the installed version.
        tsne_results = TSNE(n_components=2,
                            verbose=1,
                            perplexity=perplexity,
                            n_iter=1000,
                            n_jobs=n_jobs,
                            metric='euclidean').fit_transform(self.feature_list)

        print('t-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start))
        return tsne_results

    def scatter_plot(self, tsne_results):
        """Scatter-plot the two t-SNE columns and save the figure to ./TSNE_Scatter.png."""
        # pyplot.get_cmap is used because matplotlib.cm.get_cmap is no longer
        # available in current Matplotlib releases.
        color_map = plt.get_cmap('tab20b_r')
        scatter_plot = plt.scatter(tsne_results[:, 0],
                                   tsne_results[:, 1],
                                   cmap=color_map)

        plt.colorbar(scatter_plot)
        plt.title('TSNE of Embeddings');
        fname = './TSNE_Scatter.png'
        plt.savefig(fname)

    def plot_images_in_2d(self, x, y, image_vectors, axis=None, zoom=1):
        """Helper: draw one thumbnail per (x, y) position on ``axis``.

        Args:
            x, y: coordinates, one per image.
            image_vectors: positions in the module-level ``index_list``; the
                image is downloaded from that entry's 'full_url'.
            axis: target axes; defaults to the current axes.
            zoom: scale factor passed to ``OffsetImage``.

        Images whose download failed (None) are skipped with a message
        instead of raising.
        """
        if axis is None:
            axis = plt.gca()
        x, y = np.atleast_1d(x, y)
        for x0, y0, image_path in zip(x, y, image_vectors):
            image = download_preprocess_image(index_list[image_path]['full_url'])
            if image is None:
                print(f"Skipping image {image_path}: download failed")
                continue
            # Shrink in place so the figure does not hold full-size scans.
            image.thumbnail((300, 300), Image.LANCZOS)
            img = OffsetImage(image, zoom=zoom)
            anno_box = AnnotationBbox(img, (x0, y0),
                                      xycoords='data',
                                      frameon=False)
            axis.add_artist(anno_box)
        axis.update_datalim(np.column_stack([x, y]))
        axis.autoscale()

    def show_tsne(self, x, y, images):
        """Plot every image at its t-SNE position and save to ./TSNE_regplot1.png."""
        fig, axis = plt.subplots()
        fig.set_size_inches(22, 22, forward=True)
        self.plot_images_in_2d(x, y, images, zoom=0.3, axis=axis)
        fname = './TSNE_regplot1.png'
        plt.savefig(fname)

    @staticmethod
    def _rescale_to_unit(values):
        """Min-max scale ``values`` into [0, 1]; a constant input maps to all zeros."""
        values = np.asarray(values)
        low = values.min()
        span = values.max() - low
        if span == 0:
            # Avoid 0/0: every point collapses onto the first grid cell.
            return np.zeros(values.shape)
        return (values - low) / span

    def tsne_to_grid_plotter_manual(self, x, y, selected_filenames,
                                    canvas_size=2000, cell_size=100):
        """Plot images snapped to a regular grid, at most one image per cell.

        Args:
            x, y: t-SNE coordinates, one per entry of ``selected_filenames``.
            selected_filenames: image identifiers (see ``plot_images_in_2d``).
            canvas_size: side length of the square canvas the scaled
                coordinates are spread over.
            cell_size: side length of one grid cell; the first image that
                lands in a cell wins, later ones are dropped.

        The figure is saved to ./TSNE_GridPlot.png.
        """
        x = self._rescale_to_unit(x)
        y = self._rescale_to_unit(y)
        x_values = []
        y_values = []
        filename_plot = []
        occupied = set()
        for i, image_path in enumerate(selected_filenames):
            a = np.ceil(x[i] * (canvas_size - cell_size))
            b = np.ceil(y[i] * (canvas_size - cell_size))
            # Snap down to the lower-left corner of the containing cell.
            a = int(a - np.mod(a, cell_size))
            b = int(b - np.mod(b, cell_size))
            if (a, b) in occupied:
                continue
            occupied.add((a, b))
            x_values.append(a)
            y_values.append(b)
            filename_plot.append(image_path)
        fig, axis = plt.subplots()
        fig.set_size_inches(50, 50, forward=True)
        self.plot_images_in_2d(x_values, y_values, filename_plot, zoom=.58, axis=axis)
        fname = './TSNE_GridPlot.png'
        plt.savefig(fname)

Plotting TSNE

In [None]:
# Fit t-SNE on the first 1000 embeddings; each embedding is identified by its
# position (0, 1, 2, ...), which the plot methods use to index index_list.
tsne = TSNE_visualiser(embeddings[:1000], list(range(len(embeddings[:1000]))))
result = tsne.fit_tsne()

# Image plot at the raw t-SNE coordinates
tsne.show_tsne(result[:, 0], result[:, 1], tsne.filenames)

Grid Plot

In [None]:
# Draw the grid-snapped image plot from the same t-SNE coordinates; the method
# saves the figure to ./TSNE_GridPlot.png.
tsne.tsne_to_grid_plotter_manual(result[:, 0], result[:, 1], tsne.filenames)

### Image Retrieval

In [None]:
'''
output logic
  1. take an image
  2. retrieve n similar images
  3. run it through model and plot those images side by side

'''
def query_faiss(index, query_vector,k,dims):
    """Search ``index`` for the ``k`` nearest neighbours of one vector.

    ``query_vector`` is reshaped to a single row of length ``dims`` before the
    search. Both result arrays are printed and returned as
    ``(distances, indices)``.
    """
    single_row = query_vector.reshape(1,dims)
    search_result = index.search(single_row, k)
    distances, indices = search_result

    print("Indices of nearest neighbors:", indices)
    print("Distances of nearest neighbors:", distances)
    return distances, indices

def plot_images(indices,source_idx=None):
    """Show the retrieved images side by side, optionally preceded by the source image.

    Args:
        indices: FAISS-style result; ``indices[0]`` holds positions in the
            module-level ``index_list``.
        source_idx: position of the query image in ``index_list``. When given,
            it is drawn in the first panel and the retrieved images follow;
            when None, the retrieved images start at the first panel.

    An image whose download failed (None) leaves its panel blank.
    """
    retrieved = list(indices[0])
    # Retrieved images start after the source panel only when there is one.
    offset = 0 if source_idx is None else 1
    n_panels = len(retrieved) + offset
    if n_panels == 0:
        return

    # squeeze=False keeps axs indexable even for a single panel.
    fig, axs = plt.subplots(1, n_panels, figsize=(30, 30), squeeze=False)
    axs = axs[0]

    def draw(ax, list_idx, title):
        image = download_preprocess_image(index_list[list_idx]['full_url'])
        if image is not None:
            ax.imshow(image)
        ax.set_title(title)

    if source_idx is not None:
        draw(axs[0], source_idx, f"Source Image")
    for i, list_idx in enumerate(retrieved):
        draw(axs[i + offset], list_idx, f"Retrieved Image {i+1}")

    for ax in axs:
      ax.axis("off")

    plt.show()



In [None]:
vectors = embeddings
# Shape of the first embedding; every other entry is compared against it.
reference_shape = vectors[0].shape

# Indices of entries that are missing (None) or whose shape differs from the
# reference shape. Entry 0 is the reference itself, so the scan starts at 1.
different_shape_indices = [
    i
    for i, arr in enumerate(vectors[1:], start=1)
    if arr is None or arr.shape != reference_shape
]

print(different_shape_indices)

In [None]:
import faiss
import numpy as np
# Vector length passed to the index constructors and to query_faiss.
# NOTE(review): the "Plot as Images" cell recomputes dims from
# vectors[0].shape[0] -- confirm that 512 matches the embeddings.
dims = 512

def create_l2_index(dims,vectors):
    """Build a flat (brute-force) FAISS L2 index over ``vectors``.

    Args:
        dims: length of every vector.
        vectors: sequence / array of vectors, convertible to an (n, dims) matrix.

    After adding the vectors, one sanity query (k=5) is run and its result is
    printed. Row 3 is used as the probe when it exists, otherwise the last row.

    Returns:
        The populated ``faiss.IndexFlatL2``.
    """
    # FAISS works on contiguous float32 matrices; convert once up front.
    matrix = np.ascontiguousarray(vectors, dtype='float32')
    index = faiss.IndexFlatL2(dims)  # Using a flat (brute-force) index

    if len(matrix):
        index.add(matrix)

        # Clamp the probe row so small inputs do not raise IndexError.
        probe_row = min(3, len(matrix) - 1)
        query_vector = matrix[probe_row].reshape(1,dims)
        k = 5
        distances, indices = index.search(query_vector, k)

        print("Indices of nearest neighbors:", indices)
        print("Distances of nearest neighbors:", distances)

    return index


def normalize_vectors(vectors):
    """Scale every row of ``vectors`` to unit Euclidean length.

    Row norms are floored at 1e-6 before dividing, so an all-zero row stays
    zero instead of producing NaN.
    """
    row_lengths = np.linalg.norm(vectors, axis=1, keepdims=True)
    safe_lengths = np.maximum(row_lengths, 1e-6)
    return vectors / safe_lengths


def create_cosine_index(dims, vectors):
    """Build a flat FAISS inner-product index that ranks by cosine similarity.

    Inner product equals cosine similarity only for unit-length vectors, so
    the rows are L2-normalised (on a copy) before they are added. Passing
    already-normalised vectors is fine; normalising twice changes nothing.

    Args:
        dims: length of every vector.
        vectors: sequence / array of vectors, convertible to an (n, dims) matrix.

    Returns:
        The populated ``faiss.IndexFlatIP``.
    """
    index = faiss.IndexFlatIP(dims)  # 'IP' stands for Inner Product

    # np.array copies, so the caller's data is never modified.
    matrix = np.array(vectors, dtype='float32')
    if matrix.size == 0:
        return index

    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # Floor the norm so an all-zero row does not divide by zero.
    unit_rows = np.ascontiguousarray(matrix / np.maximum(norms, 1e-6), dtype='float32')

    index.add(unit_rows)
    return index



Write as CSV to visualize

In [None]:
def add_to_csv(indices, source_idx=None):
    """Return the 'full_url' of every ``index_list`` entry named in ``indices[0]``.

    ``source_idx`` is accepted but not used.
    """
    urls = []
    for list_position in indices[0]:
        urls.append(index_list[list_position]['full_url'])
    return urls

# Number of neighbours requested per query. The first hit fills the "Source"
# column and the remaining retrieval_k - 1 hits fill the "Retrieved Image" columns.
retrieval_k = 6

# One row of URLs per query, for the first 1000 normalised vectors.
outputcsv = list()
for i in range(len(vectors_cosine[:1000])):
    distances, indices = query_faiss(index_cosine, vectors_cosine[i], retrieval_k, dims)
    outputcsv.append(add_to_csv(indices))

# Column names come from retrieval_k, not from the last loop iteration's
# result, so this also works when the loop body never ran.
retrieval_columns = ["Source"] + [f"Retrieved Image {n}" for n in range(1, retrieval_k)]
outputcsv = pd.DataFrame(outputcsv, columns=retrieval_columns)
outputcsv.to_csv('textract_retrieval.csv',index=False)

Plot as Images

In [None]:
dims = vectors[0].shape[0]
vectors_cosine = normalize_vectors(vectors)
# Build the index from the normalised vectors: inner product equals cosine
# similarity only when both the stored and the query vectors have unit length.
index_cosine = create_cosine_index(dims, vectors_cosine)
# Plot the retrieval result for the first 51 queries (indices 0-50).
for i in range(min(51, len(vectors_cosine))):
    distances, indices = query_faiss(index_cosine, vectors_cosine[i], 6, dims)
    plot_images(indices, i)