# Imports

In [19]:
import requests
import networkx as nx
import time
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import PyPDF2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from PIL import Image
import fitz  # PyMuPDF
import torch
import torchvision.transforms as transforms
from torchvision.models import resnet50
import io
import matplotlib.pyplot as plt

# Data Preprocessing and Loading

In [20]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Extraction is best-effort: if the file cannot be opened or read for any
    reason, an empty string is returned instead of raising, so that a single
    unreadable PDF does not abort a long batch run.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined in page order, or "" on any failure.
    """
    try:
        # The context manager closes the document even when a page fails to
        # parse, so file handles do not accumulate across many PDFs.
        with fitz.open(pdf_path) as pdf_document:
            return "".join(page.get_text() for page in pdf_document)
    except Exception:
        # Deliberate best-effort: callers treat "" as "no text available".
        return ""


In [21]:
def extract_largest_text_block(pdf_path):
    """Return the text set in the largest font on the first page of a PDF.

    Only spans longer than 10 characters are considered. Font sizes are
    truncated to whole numbers before comparison, and every qualifying span
    that shares the largest size is joined with single spaces, in reading
    order. The caller uses the result as a guess at the document's title.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        The joined text of the largest-font spans, or "" when the document
        has no pages or no span qualifies.

    Raises:
        Whatever ``fitz.open`` raises when the file cannot be opened; unlike
        ``extract_text_from_pdf`` this function does not swallow errors.
    """
    # Close the document as soon as the first page's layout has been read;
    # the returned structure is plain Python data and outlives the handle.
    with fitz.open(pdf_path) as document:
        if len(document) == 0:
            return ""
        blocks = document[0].get_text("dict")["blocks"]

    largest_font_size = 0
    largest_font_parts = []
    for block in blocks:
        # Blocks without a "lines" entry carry no text spans; skip them.
        for line in block.get("lines", ()):
            for span in line["spans"]:
                span_text = span["text"]
                # Short fragments are ignored so they cannot win on font
                # size alone.
                if len(span_text) <= 10:
                    continue
                font_size = int(span["size"])
                if font_size > largest_font_size:
                    largest_font_size = font_size
                    largest_font_parts = [span_text]
                elif font_size == largest_font_size:
                    largest_font_parts.append(span_text)
    return " ".join(largest_font_parts)


In [22]:
def get_id(title):
    """Look up a paper on Semantic Scholar by title and return its paper ID.

    Args:
        title: Title text used as the search query.

    Returns:
        The ``paperId`` of the first search hit. If the request does not
        return HTTP 200, or returns 200 without any hit, the HTTP status code
        (an int) is returned instead; callers re-query rows that hold 429.
    """
    # Fixed pause before every request (presumably to respect the API's
    # rate limit).
    time.sleep(2)
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    # Let requests build the query string so that titles containing '&',
    # '#', '+' or non-ASCII characters are percent-encoded instead of
    # silently truncating or corrupting the query.
    response = requests.get(
        url,
        params={"query": title, "fields": "paperId,title"},
    )
    if response.status_code == 200:
        results = response.json().get("data")
        # An empty result list is treated like a missing "data" key rather
        # than raising IndexError on results[0].
        if results:
            return results[0]["paperId"]
    return response.status_code

In [25]:
def process_pdf_files(base_path, num_folders=4984):
    """Build a DataFrame of paper/slide text for a numbered folder dataset.

    Folders ``base_path/0`` .. ``base_path/<num_folders - 1>`` are visited in
    order. In each folder the PDF whose name contains "slide" or "Slide" is
    taken as the slide deck and the other PDF as the paper. If several files
    match a role, the last one listed wins.

    Args:
        base_path: Directory that contains the numbered sub-folders.
        num_folders: How many numbered folders to visit, starting at 0.

    Returns:
        A DataFrame indexed by folder number with the columns ``paper_text``,
        ``slide_text``, ``paper_name``, ``slide_name``, ``title`` and
        ``paper_id``. Folders that do not exist, or that lack either a slide
        PDF or a paper PDF, are reported and skipped, so they have no row.
    """
    df = pd.DataFrame(columns=['paper_text', 'slide_text', "paper_name", "slide_name", "title", "paper_id"])

    for folder_num in tqdm(range(num_folders)):
        folder_path = os.path.join(base_path, str(folder_num))
        if not os.path.isdir(folder_path):
            print(f"Skipping {folder_path}: folder not found")
            continue

        slide_name = None
        paper_name = None
        for file_name in os.listdir(folder_path):
            if not file_name.endswith(".pdf"):
                continue
            if "slide" in file_name or "Slide" in file_name:
                slide_name = file_name
            else:
                paper_name = file_name

        # Without both files os.path.join(folder_path, None) would raise a
        # TypeError and abort the whole run; skip the folder instead.
        if slide_name is None or paper_name is None:
            print(f"Skipping {folder_path}: expected one slide PDF and one paper PDF")
            continue

        slide_pdf_path = os.path.join(folder_path, slide_name)
        paper_pdf_path = os.path.join(folder_path, paper_name)

        slide_text = extract_text_from_pdf(slide_pdf_path)
        paper_text = extract_text_from_pdf(paper_pdf_path)

        # The largest-font text on the paper's first page serves as its
        # title, which is then used to look up the paper's ID.
        title = extract_largest_text_block(paper_pdf_path)
        paper_id = get_id(title)

        df.loc[folder_num] = [paper_text, slide_text, paper_name, slide_name, title, paper_id]

    # get_id returns the HTTP status code when a lookup fails; re-query the
    # rows that were rate-limited (429) until none remain.
    rate_limited = df['paper_id'] == 429
    while rate_limited.any():
        print(rate_limited.sum())
        df.loc[rate_limited, 'paper_id'] = df.loc[rate_limited, 'title'].apply(get_id)
        rate_limited = df['paper_id'] == 429

    return df


In [None]:
# Root directory of the dataset; process_pdf_files looks for numbered
# sub-folders ("0", "1", ...) directly beneath it.
base_path = "dataset"

# Extract paper/slide text for every folder and look up each paper's ID.
# NOTE: slow by design — every lookup pauses before calling the remote API.
df = process_pdf_files(base_path)







  0%|                                                                                         | 0/4984 [00:00<?, ?it/s][A[A[A[A[A




  0%|                                                                               | 1/4984 [00:02<3:51:40,  2.79s/it][A[A[A[A[A




  0%|                                                                               | 2/4984 [00:05<3:50:43,  2.78s/it][A[A[A[A[A




  0%|                                                                               | 3/4984 [00:07<3:38:33,  2.63s/it][A[A[A[A[A




  0%|                                                                               | 4/4984 [00:10<3:41:01,  2.66s/it][A[A[A[A[A




  0%|                                                                               | 5/4984 [00:13<3:39:45,  2.65s/it][A[A[A[A[A




  0%|                                                                               | 6/4984 [00:15<3:39:27,  2.65s/it][A[A[A[A[A




  0%|                 

In [None]:
def clean(text):
    """Normalise a value to lower-case, single-line text.

    The value is converted with ``str`` first (so non-string inputs are
    replaced by their string form), every newline becomes a space, and the
    result is lower-cased. An empty input yields an empty string.
    """
    single_line = str(text).replace("\n", " ")
    return single_line.lower()

# Normalise both text columns in place (lower-case, newlines removed).
df["paper_text"] = df["paper_text"].apply(clean)
df["slide_text"] = df["slide_text"].apply(clean)

In [None]:
# Show the cleaned DataFrame. `display` is not imported in this file —
# presumably the built-in provided by the IPython/Jupyter session.
display(df)

# text similarity

In [None]:
# Fit a single TF-IDF vocabulary over papers and slides together so that
# both sets of vectors share one feature space.
combined_content = [*df.paper_text.tolist(), *df.slide_text.tolist()]

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(combined_content)

# Rows were stacked papers-first, so split the matrix back at len(df).
X_papers, X_slides = X[:len(df)], X[len(df):]

# Entry [i, j] compares paper i with slide deck j.
text_similarity_matrix = cosine_similarity(X_papers, X_slides)

# get references

In [None]:
def get_references(paper_id):
    """Fetch the IDs of the papers cited by a Semantic Scholar paper.

    Args:
        paper_id: Semantic Scholar paper ID whose references are requested.

    Returns:
        A list with the ``paperId`` of every cited paper that has one. If
        the request does not return HTTP 200, the HTTP status code (an int)
        is returned instead; callers re-query rows that hold 429.
    """
    # Fixed pause before every request (presumably to respect the API's
    # rate limit).
    time.sleep(2)
    url = f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}/references'
    # Request the largest page the endpoint accepts. Without an explicit
    # limit only the default first page comes back, which silently drops
    # references of papers with long bibliographies.
    # NOTE(review): papers with more references than one page holds would
    # still need offset-based paging.
    response = requests.get(url, params={'limit': 1000})
    if response.status_code != 200:
        return response.status_code

    entries = response.json().get('data') or []
    references = []
    for entry in entries:
        # Tolerate entries whose cited paper is missing or null.
        cited_paper = entry.get('citedPaper') or {}
        cited_id = cited_paper.get('paperId')
        if cited_id is not None:
            references.append(cited_id)
    return references

In [None]:
# Fetch the reference list of every paper (one API call, plus a pause, per row).
df['references'] = df['paper_id'].apply(get_references)

# get_references returns the HTTP status code instead of a list when a call
# fails; keep re-querying the rows that came back as 429 until none remain.
while (df['references'] == 429).any():
    print((df['references'] == 429).sum())  # number of rows still to retry
    df.loc[df['references'] == 429, 'references'] = df.loc[df['references'] == 429, 'paper_id'].apply(get_references)
    
display(df)

In [None]:
# IDs of the papers in the dataset; filter_references keeps only references
# that appear in this set.
# NOTE(review): paper_id holds an integer HTTP status code for rows whose
# lookup failed (see get_id), so those values end up in this set as well.
valid_ids = set(df['paper_id'])

def filter_references(ref_list):
    """Keep only the references that point at papers inside the dataset.

    Any value that is not a list (for example an integer status code left
    behind by a failed lookup) yields an empty list.
    """
    if not isinstance(ref_list, list):
        return []
    return [cited for cited in ref_list if cited in valid_ids]

# Per paper: the subset of its references that are themselves in the dataset.
df['filtered_references'] = df['references'].apply(filter_references)

# Network Creation

In [None]:
# Citation network: an edge (a, b) means paper a cites paper b.
G = nx.DiGraph()

# One edge per in-dataset reference of every paper.
edges = [
    (citing_id, cited_id)
    for citing_id, cited_ids in zip(df['paper_id'], df['filtered_references'])
    for cited_id in cited_ids
]

# Only papers that take part in at least one citation become nodes.
nodes_with_edges = set([source for source, _ in edges] + [target for _, target in edges])
G.add_nodes_from(nodes_with_edges)
G.add_edges_from(edges)

In [None]:
# Report the size of the citation network.
print("Nodes:", G.number_of_nodes())
print("Edges:", G.number_of_edges())

# Pearson R for paper and slide similarity

In [None]:
from scipy.stats import pearsonr

# TF-IDF over the paper texts only, then the paper-vs-paper cosine
# similarity matrix. NOTE: this refits the shared `vectorizer`, so its
# vocabulary is replaced by one learned from the papers alone.
tfidf_paper = vectorizer.fit_transform(df['paper_text'])
cosine_sim_paper = cosine_similarity(tfidf_paper)

# Same for the slide texts; the vectorizer is refitted once more, so paper
# and slide vectors are built from different vocabularies here.
tfidf_slide = vectorizer.fit_transform(df['slide_text'])
cosine_sim_slide = cosine_similarity(tfidf_slide)

# Take the entries strictly above the diagonal (k=1) so that every
# unordered pair is counted once and self-similarities are left out.
# The same index set is applied to both matrices so that entries line up.
indices = np.triu_indices_from(cosine_sim_paper, k=1)
cosine_sim_paper_flat = cosine_sim_paper[indices]
cosine_sim_slide_flat = cosine_sim_slide[indices]

# Pearson correlation between paper-pair and slide-pair similarities.
correlation, p_value = pearsonr(cosine_sim_paper_flat, cosine_sim_slide_flat)

# Bare expression: in a notebook this shows the result as the cell's output.
correlation, p_value

# Paper Similarity Analysis

In [None]:
from scipy.stats import ttest_ind

# Citation links from the network, held in a set so that each membership
# test in the loop below is O(1) instead of a scan over every edge.
linked_pairs = set(G.edges())

# Paper IDs by row position, fetched once instead of via .iloc per pair.
paper_ids = df['paper_id'].tolist()

# Every unordered pair of rows (i < j).
all_pairs = [(i, j) for i in range(len(df)) for j in range(i + 1, len(df))]

# Paper-text similarities, split by whether the two papers are linked.
p_linked_similarities = []
p_non_linked_similarities = []

for i, j in tqdm(all_pairs):
    # This is the paper analysis, so read the paper-text similarity matrix.
    similarity = cosine_sim_paper[i, j]
    pair = (paper_ids[i], paper_ids[j])
    # A citation in either direction counts as a link.
    if pair in linked_pairs or pair[::-1] in linked_pairs:
        p_linked_similarities.append(similarity)
    else:
        p_non_linked_similarities.append(similarity)



In [None]:
# Mean and standard deviation of the similarity scores in each group.
p_linked_average, p_linked_std = np.mean(p_linked_similarities), np.std(p_linked_similarities)
p_non_linked_average, p_non_linked_std = (
    np.mean(p_non_linked_similarities),
    np.std(p_non_linked_similarities),
)

print("Linked Similarities - Average:", p_linked_average, "Standard Deviation:", p_linked_std)
print("Non-Linked Similarities - Average:", p_non_linked_average, "Standard Deviation:", p_non_linked_std)

In [None]:
# Two-sample t-test comparing linked and non-linked similarity scores.
# NOTE(review): no equal_var argument is passed, so the library default
# applies — confirm whether an unequal-variance (Welch) test is wanted,
# given how different the two group sizes are likely to be.
t_stat, p_value = ttest_ind(p_linked_similarities, p_non_linked_similarities)

# Bare expression: in a notebook this shows the result as the cell's output.
t_stat, p_value

In [None]:
# Overlay both distributions on one x-axis, each with its own y-axis scale.
fig, ax1 = plt.subplots()

# Left axis (red): pairs without a citation link.
ax1.hist(p_non_linked_similarities, bins=30, alpha=0.5, color='red', label='Non-Linked Similarities')
ax1.tick_params(axis='y', labelcolor='red')
ax1.set_ylabel('Frequency (Non-Linked)', color='red')
ax1.set_xlabel('Similarity')

# Right axis (blue): citation-linked pairs, drawn on a twin of the x-axis.
ax2 = ax1.twinx()
ax2.hist(p_linked_similarities, bins=30, alpha=0.5, color='blue', label='Linked Similarities')
ax2.tick_params(axis='y', labelcolor='blue')
ax2.set_ylabel('Frequency (Linked)', color='blue')

plt.show()


# Slides Similarity Network Analysis

In [None]:
# Citation links from the network, held in a set so that each membership
# test in the loop below is O(1) instead of a scan over every edge.
linked_pairs = set(G.edges())

# Paper IDs by row position, fetched once instead of via .iloc per pair.
paper_ids = df['paper_id'].tolist()

# Every unordered pair of rows (i < j).
all_pairs = [(i, j) for i in range(len(df)) for j in range(i + 1, len(df))]

# Slide-text similarities, split by whether the two papers are linked.
s_linked_similarities = []
s_non_linked_similarities = []

for i, j in tqdm(all_pairs):
    # This is the slide analysis, so read the slide-text similarity matrix.
    similarity = cosine_sim_slide[i, j]
    pair = (paper_ids[i], paper_ids[j])
    # A citation in either direction counts as a link.
    if pair in linked_pairs or pair[::-1] in linked_pairs:
        s_linked_similarities.append(similarity)
    else:
        s_non_linked_similarities.append(similarity)



In [None]:
# Mean and standard deviation of the similarity scores in each group.
s_linked_average, s_linked_std = np.mean(s_linked_similarities), np.std(s_linked_similarities)
s_non_linked_average, s_non_linked_std = (
    np.mean(s_non_linked_similarities),
    np.std(s_non_linked_similarities),
)

print("Linked Similarities - Average:", s_linked_average, "Standard Deviation:", s_linked_std)
print("Non-Linked Similarities - Average:", s_non_linked_average, "Standard Deviation:", s_non_linked_std)

In [None]:
# Two-sample t-test comparing linked and non-linked similarity scores.
# NOTE(review): no equal_var argument is passed, so the library default
# applies — confirm whether an unequal-variance (Welch) test is wanted,
# given how different the two group sizes are likely to be.
t_stat, p_value = ttest_ind(s_linked_similarities, s_non_linked_similarities)

# Bare expression: in a notebook this shows the result as the cell's output.
t_stat, p_value

In [None]:
# Overlay both distributions on one x-axis, each with its own y-axis scale.
fig, ax1 = plt.subplots()

# Left axis (red): pairs without a citation link.
ax1.hist(s_non_linked_similarities, bins=30, alpha=0.5, color='red', label='Non-Linked Similarities')
ax1.tick_params(axis='y', labelcolor='red')
ax1.set_ylabel('Frequency (Non-Linked)', color='red')
ax1.set_xlabel('Similarity')

# Right axis (blue): citation-linked pairs, drawn on a twin of the x-axis.
ax2 = ax1.twinx()
ax2.hist(s_linked_similarities, bins=30, alpha=0.5, color='blue', label='Linked Similarities')
ax2.tick_params(axis='y', labelcolor='blue')
ax2.set_ylabel('Frequency (Linked)', color='blue')

plt.show()

# Draw Network

In [None]:
# Use a large canvas; nodes are drawn small and unlabelled with thin edges.
plt.figure(figsize=(15, 15))
nx.draw(G, node_size=10, width=0.1, with_labels=False)
plt.title("Network Visualization")
plt.show()