# Imports

In [None]:
import requests
import networkx as nx
import time
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import PyPDF2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from PIL import Image
import fitz  # PyMuPDF
import torch
import torchvision.transforms as transforms
from torchvision.models import resnet50
import io
import hashlib


# Data loading and preprocessing

In [None]:
def extract_text_from_pdf(pdf_path):
    """
    Extracts the plain text of every page of a PDF file using PyMuPDF (fitz).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, or an empty string
        when the file cannot be opened or parsed (the error is printed, not
        raised).
    """
    try:
        # The context manager closes the document even when a page fails to
        # parse, so file handles do not pile up over a large batch of PDFs.
        with fitz.open(pdf_path) as pdf_document:
            return "".join(
                pdf_document.load_page(page_num).get_text()
                for page_num in range(len(pdf_document))
            )
    except Exception as e:
        # Deliberately best-effort: one unreadable PDF must not abort the run.
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""


In [None]:
def extract_images_from_pdf(pdf_path, min_size=(50, 50), min_aspect_ratio=0.5, max_aspect_ratio=2.0):
    """
    Extracts the unique, reasonably sized images embedded in a PDF file using PyMuPDF (fitz).

    Args:
        pdf_path: Path of the PDF file to read.
        min_size: Minimum (width, height) in pixels an image must have to be kept.
        min_aspect_ratio: Smallest accepted width / height ratio.
        max_aspect_ratio: Largest accepted width / height ratio.

    Returns:
        A list of RGB PIL images in document order, without duplicates.
        An empty list is returned when the PDF itself cannot be opened or
        traversed (the error is printed, not raised). An individual image
        that cannot be decoded is skipped with a message instead of
        discarding every other image of the document.
    """
    images = []
    seen_images = set()
    try:
        # The context manager guarantees the document is closed on every path.
        with fitz.open(pdf_path) as pdf_document:
            for page_num in range(len(pdf_document)):
                page = pdf_document.load_page(page_num)
                for img in page.get_images(full=True):
                    xref = img[0]
                    try:
                        base_image = pdf_document.extract_image(xref)
                        # Ensure image is in RGB mode
                        image = Image.open(io.BytesIO(base_image["image"])).convert("RGB")
                    except Exception as e:
                        print(f"Skipping unreadable image (xref {xref}) in {pdf_path}: {e}")
                        continue

                    width, height = image.size
                    # Degenerate dimensions would make the aspect ratio undefined.
                    if width <= 0 or height <= 0:
                        continue
                    if width < min_size[0] or height < min_size[1]:
                        continue
                    if not (min_aspect_ratio <= width / height <= max_aspect_ratio):
                        continue

                    # Hash only images that passed the cheap filters. The size
                    # is part of the key because identical raw bytes can
                    # describe images of different dimensions.
                    image_key = (image.size, hashlib.md5(image.tobytes()).hexdigest())
                    if image_key in seen_images:
                        continue
                    seen_images.add(image_key)
                    images.append(image)
        return images
    except Exception as e:
        # Deliberately best-effort: one unreadable PDF must not abort the run.
        print(f"Error extracting images from {pdf_path}: {e}")
        return []

In [None]:
def extract_features_from_images(images, model, transform, feature_dim=2048):
    """
    Computes one averaged feature vector for a collection of images.

    Each image is passed through ``transform``, given a leading batch
    dimension, fed to ``model`` with gradients disabled, and the squeezed
    outputs are averaged element-wise.

    Args:
        images: Iterable of images accepted by ``transform``.
        model: Callable mapping a batched tensor to a feature tensor.
        transform: Callable mapping one image to a tensor.
        feature_dim: Length of the all-zero vector returned when there are
            no images. The default of 2048 matches the ResNet-50 backbone
            configured elsewhere in this file.

    Returns:
        A NumPy array holding the mean feature vector, or
        ``np.zeros((feature_dim,))`` when ``images`` is empty.
    """
    features = []
    # One no_grad context around the whole loop instead of one per image.
    with torch.no_grad():
        for image in images:
            batch = transform(image).unsqueeze(0)
            feature = model(batch)
            # .cpu() is a no-op for CPU tensors and makes .numpy() work when
            # the model lives on an accelerator.
            features.append(feature.squeeze().cpu().numpy())
    if features:
        return np.mean(features, axis=0)
    return np.zeros((feature_dim,))

In [None]:
def process_pdf_files(base_path, model, transform, num_folders=4984):
    """
    Builds the paper/slide dataframe from numbered dataset folders.

    Folders ``base_path/0`` ... ``base_path/(num_folders - 1)`` are visited.
    In each folder, a PDF whose name contains "slide" (case-insensitive) is
    treated as the slide deck and any other PDF as the paper. Text and
    averaged image features are extracted for both documents.

    Args:
        base_path: Directory that contains the numbered folders.
        model: Feature extractor passed on to ``extract_features_from_images``.
        transform: Image preprocessing passed on to ``extract_features_from_images``.
        num_folders: Number of numbered folders to visit (defaults to the
            size of the original dataset).

    Returns:
        A DataFrame indexed by folder number with the columns ``paper_text``,
        ``slide_text``, ``paper_image_features``, ``slide_image_features``,
        ``paper_name`` and ``slide_name``. Folders that are missing, or that
        lack either the paper or the slide PDF, are skipped with a message.
    """
    columns = ['paper_text', 'slide_text', 'paper_image_features', 'slide_image_features', 'paper_name', 'slide_name']
    rows = []
    folder_ids = []

    for folder_num in tqdm(range(num_folders)):
        folder_path = os.path.join(base_path, str(folder_num))
        if not os.path.isdir(folder_path):
            print(f"Skipping missing folder: {folder_path}")
            continue

        slide_name = None
        paper_name = None

        # Sorted so the choice is deterministic when a folder holds several
        # candidate PDFs (os.listdir order is arbitrary).
        for file_name in sorted(os.listdir(folder_path)):
            lowered = file_name.lower()
            if not lowered.endswith(".pdf"):
                continue
            if "slide" in lowered:
                slide_name = file_name
            else:
                paper_name = file_name

        # Joining a path with None raises TypeError, so incomplete pairs are
        # skipped explicitly.
        if slide_name is None or paper_name is None:
            print(f"Skipping folder without both a paper and a slide PDF: {folder_path}")
            continue

        slide_pdf_path = os.path.join(folder_path, slide_name)
        paper_pdf_path = os.path.join(folder_path, paper_name)

        slide_text = extract_text_from_pdf(slide_pdf_path)
        paper_text = extract_text_from_pdf(paper_pdf_path)

        slide_images = extract_images_from_pdf(slide_pdf_path)
        paper_images = extract_images_from_pdf(paper_pdf_path)

        slide_image_features = extract_features_from_images(slide_images, model, transform)
        paper_image_features = extract_features_from_images(paper_images, model, transform)

        rows.append({
            'paper_text': paper_text,
            'slide_text': slide_text,
            'paper_image_features': paper_image_features,
            'slide_image_features': slide_image_features,
            'paper_name': paper_name,
            'slide_name': slide_name,
        })
        folder_ids.append(folder_num)

    # Building the frame once avoids growing a DataFrame one row at a time.
    return pd.DataFrame(rows, columns=columns, index=folder_ids)


In [None]:
# Root directory of the dataset (one numbered sub-folder per paper/slide pair)
base_path = "dataset"

# Pretrained ResNet-50 with its last child module (the classification layer)
# dropped, switched to inference mode. Module.eval() returns the module itself.
# NOTE(review): newer torchvision releases prefer `weights=` over `pretrained=` -
# confirm against the installed version.
model = torch.nn.Sequential(*list(resnet50(pretrained=True).children())[:-1]).eval()

# Preprocessing: fixed 224x224 resize, tensor conversion, per-channel normalisation
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Walk the dataset and assemble the dataframe of texts and image features
df = process_pdf_files(base_path, model, transform)

In [None]:
def clean(text):
    """Return ``text`` lower-cased with every newline replaced by a space."""
    # An empty input already comes out as "", so no special case is needed.
    return text.replace("\n", " ").lower()

# Normalise both text columns before they are vectorised
df["paper_text"] = df["paper_text"].apply(clean)
df["slide_text"] = df["slide_text"].apply(clean)

In [None]:
# Show the assembled dataframe. `display` is not imported in this file, so this
# presumably relies on the IPython/Jupyter built-in - confirm before running as a script.
display(df)

# Textual Similarity Matrix

In [None]:
# Fit a single TF-IDF vocabulary over papers and slides together so both
# document sets share the same feature space
combined_content = [*df["paper_text"], *df["slide_text"]]

vectorizer = TfidfVectorizer(max_features=5000)  # Adjust the number of features as needed
X = vectorizer.fit_transform(combined_content)

# Papers occupy the first len(df) rows of the matrix, slides the remainder
X_papers, X_slides = X[:len(df)], X[len(df):]

# Entry [i, j] is the cosine similarity between paper i and slide j
text_similarity_matrix = cosine_similarity(X_papers, X_slides)

# Image Similarity Matrix

In [None]:
# Stack the per-document feature vectors into one 2-D array per document type
image_features_papers = np.stack(df["paper_image_features"].tolist())
image_features_slides = np.stack(df["slide_image_features"].tolist())

# Entry [i, j] is the cosine similarity between the image features of paper i and slide j
image_similarity_matrix = cosine_similarity(image_features_papers, image_features_slides)

# Calculate Combined (and Individual) Accuracies

In [None]:
def calculate_accuracy(predictions, actual):
    """
    Returns the percentage of predictions that equal the expected values.

    Args:
        predictions: Sequence of predicted labels.
        actual: Sequence of expected labels, aligned with ``predictions``.

    Returns:
        Accuracy as a float in the range [0, 100]. Empty input yields 0.0
        rather than NaN.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    predictions = np.asarray(predictions)
    actual = np.asarray(actual)

    # Without this check a length-1 `actual` would broadcast against a longer
    # `predictions` and could report an accuracy above 100 %.
    if predictions.shape != actual.shape:
        raise ValueError(
            f"predictions and actual must have the same shape, "
            f"got {predictions.shape} and {actual.shape}"
        )

    # Avoid 0 / 0 (NaN plus a runtime warning) when nothing is evaluated.
    if actual.size == 0:
        return 0.0

    correct_predictions = np.count_nonzero(predictions == actual)
    return float(correct_predictions / actual.size * 100)

In [None]:
# Sweep the text/image blend from image-only (text weight 0.0) to text-only
# (text weight 1.0) in steps of 0.1 and record the matching accuracy of each blend.
result_rows = []

for i in range(11):
    text_weight = i / 10.0
    image_weight = 1 - text_weight

    combined_similarity_matrix = (text_weight * text_similarity_matrix) + (image_weight * image_similarity_matrix)

    # Rows are papers and columns are slides, so argmax over axis 0 picks the
    # best-matching paper for every slide.
    slide_to_paper_mapping = np.argmax(combined_similarity_matrix, axis=0)

    predicted = pd.DataFrame({
        'slides': df['slide_name'],
        'PredictedPaperDocname': df['paper_name'].iloc[slide_to_paper_mapping].values
    })

    accuracy = calculate_accuracy(predicted['PredictedPaperDocname'], df['paper_name'])

    result_rows.append({
        'Text Weight': text_weight,
        'Image Weight': image_weight,
        'Accuracy': accuracy
    })

# DataFrame.append was removed in pandas 2.0, so the table is built once from
# the collected rows.
results_df = pd.DataFrame(result_rows, columns=['Text Weight', 'Image Weight', 'Accuracy'])

print(results_df)

# Repeat the evaluation, restricted to papers with usable image features

In [None]:
# Same weight sweep as the unfiltered evaluation, but accuracy is computed only
# over rows whose paper has a non-zero image feature vector (an all-zero
# vector means no usable image was extracted from the paper).
filtered_result_rows = []

# The mask and the ground truth do not depend on the weights, so they are
# computed once outside the loop.
usable_images_filter = df['paper_image_features'].apply(lambda x: not np.all(x == 0))
actual = pd.DataFrame({
    'slides': df['slide_name'],
    'PredictedPaperDocname': df['paper_name']
})
filtered_actual = actual[usable_images_filter]

for i in range(11):
    text_weight = i / 10.0
    image_weight = 1 - text_weight

    combined_similarity_matrix = (text_weight * text_similarity_matrix) + (image_weight * image_similarity_matrix)

    # Rows are papers and columns are slides, so argmax over axis 0 picks the
    # best-matching paper for every slide.
    slide_to_paper_mapping = np.argmax(combined_similarity_matrix, axis=0)
    predictions = pd.DataFrame({
        'slides': df['slide_name'],
        'PredictedPaperDocname': df['paper_name'].iloc[slide_to_paper_mapping].values
    })
    filtered_predictions = predictions[usable_images_filter]

    accuracy = calculate_accuracy(filtered_predictions['PredictedPaperDocname'], filtered_actual['PredictedPaperDocname'])

    filtered_result_rows.append({
        'Text Weight': text_weight,
        'Image Weight': image_weight,
        'Accuracy': accuracy
    })

# Store the rows in results_df_filtered (not results_df), so the table printed
# below is populated and the unfiltered results are left untouched.
# DataFrame.append was removed in pandas 2.0, hence the one-shot construction.
results_df_filtered = pd.DataFrame(filtered_result_rows, columns=['Text Weight', 'Image Weight', 'Accuracy'])


print(results_df_filtered)