In [None]:
import json
import os
import shutil
import requests
from difflib import SequenceMatcher
import fnmatch
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

class NotebookProcessor:
    """Clean, export, and compare Jupyter notebooks stored as ``.ipynb`` JSON.

    An instance wraps a single notebook file: its JSON is loaded into
    ``notebook_data`` on construction and can then be cleaned, saved, or
    exported to a ``.py`` file. The static and class methods work directly on
    notebook paths and do not need an instance.
    """

    # Directory name that is never searched for notebooks.
    _CHECKPOINT_DIR = '.ipynb_checkpoints'

    def __init__(self, notebook_path):
        """Store *notebook_path* and load the notebook it points to.

        Args:
            notebook_path: Path of the ``.ipynb`` file to wrap.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file does not contain valid JSON.
        """
        self.notebook_path = notebook_path
        self.notebook_data = None
        self.urls = []
        self.downloaded_files = []  # Paths fetched by download_files()
        self.load_notebook()

    def load_notebook(self):
        """Load the notebook JSON from ``self.notebook_path`` into memory."""
        # Explicit UTF-8 so loading does not depend on the platform's default
        # encoding (compare_notebooks reads notebooks the same way).
        with open(self.notebook_path, 'r', encoding='utf-8') as file:
            self.notebook_data = json.load(file)

    def save_notebook(self, save_path=None):
        """Write the in-memory notebook JSON to disk.

        Args:
            save_path: Destination file. Defaults to the path the notebook
                was loaded from, i.e. the original file is overwritten.
        """
        if save_path is None:
            save_path = self.notebook_path
        with open(save_path, 'w', encoding='utf-8') as file:
            json.dump(self.notebook_data, file, indent=2)

    def clean_downloaded_files(self):
        """Delete every tracked downloaded file and stop tracking removed ones.

        Files that fail to delete for a reason other than "already gone" stay
        in ``self.downloaded_files`` so that a later call can retry them.
        """
        still_present = []
        for file_path in self.downloaded_files:
            try:
                os.remove(file_path)
            except FileNotFoundError as e:
                # Nothing left to delete: report it, but drop the stale entry
                # so repeated cleanups do not keep failing on it.
                print(f"Error deleting {file_path}: {e}")
            except OSError as e:
                print(f"Error deleting {file_path}: {e}")
                still_present.append(file_path)
            else:
                print(f"Successfully deleted: {file_path}")
        self.downloaded_files = still_present

    @staticmethod
    def _empty_directory(directory):
        """Remove everything inside *directory*, keeping the directory itself."""
        # isdir (not exists) so a regular file passed by mistake is skipped
        # instead of making os.listdir raise.
        if not os.path.isdir(directory):
            return
        for filename in os.listdir(directory):
            file_path = os.path.join(directory, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
                print(f"Removed: {file_path}")
            except OSError as e:
                print(f'Failed to delete {file_path}. Reason: {e}')

    def clean(self, directories_to_clean=None):
        """Strip code-cell outputs, delete downloads, and empty directories.

        The cleaned notebook is written back to ``self.notebook_path``.

        Args:
            directories_to_clean: Optional iterable of directory paths whose
                contents are deleted (the directories themselves are kept).
                Paths that are not existing directories are ignored.

        Returns:
            NotebookProcessor: ``self``, on every path.
        """
        if self.notebook_data is None:
            print("Notebook data is not loaded.")
            return self

        # Clean code cell outputs
        for cell in self.notebook_data['cells']:
            if cell['cell_type'] == 'code':
                cell['outputs'] = []
                cell['execution_count'] = None

        self.clean_downloaded_files()

        # Clear specified directories
        for directory in directories_to_clean or ():
            self._empty_directory(directory)

        self.save_notebook()
        return self

    def download_files(self, urls, directory):
        """Download each URL in *urls* into *directory*.

        The local file name is the last ``/``-separated component of the URL.
        Failures are reported with ``print`` and do not stop the remaining
        downloads. Successfully downloaded paths are added to
        ``self.downloaded_files`` so that ``clean_downloaded_files`` can
        remove them later.

        Args:
            urls: Iterable of URL strings.
            directory: Target directory; created if it does not exist.

        Returns:
            list[str]: Local paths of the files this call downloaded
            successfully.
        """
        self.urls = urls
        os.makedirs(directory, exist_ok=True)  # Ensure the directory exists
        downloaded = []
        for url in self.urls:
            try:
                local_filename = url.split('/')[-1]
                local_filepath = os.path.join(directory, local_filename)
                with requests.get(url, stream=True) as r:
                    r.raise_for_status()
                    with open(local_filepath, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=8192):
                            f.write(chunk)
            except Exception as e:
                # Deliberately broad: one bad URL must not abort the batch.
                print(f"Error downloading {url}: {e}")
                continue
            # Record only after the download has completed.
            downloaded.append(local_filepath)
            if local_filepath not in self.downloaded_files:
                self.downloaded_files.append(local_filepath)
            print(f"Downloaded and saved: {local_filepath}")
        # Return the list; assigning it to self.download_files would replace
        # this method with a list on the instance.
        return downloaded

    def export_to_py(self, output_path):
        """Write the source of every code cell to a Python file.

        Each cell's source is followed by a blank line. Markdown and other
        non-code cells are skipped.

        Args:
            output_path: Path of the ``.py`` file to create or overwrite.
        """
        with open(output_path, 'w', encoding='utf-8') as py_file:
            for cell in self.notebook_data['cells']:
                if cell['cell_type'] == 'code':
                    py_file.write(''.join(cell['source']) + '\n\n')
        print(f"Python file created at: {output_path}")

    @staticmethod
    def find_notebooks(directory, recursive=False):
        """Return the sorted paths of the ``.ipynb`` files in *directory*.

        Args:
            directory: Directory to search.
            recursive: When true, descend into sub-directories as well,
                skipping any directory named ``.ipynb_checkpoints``. The
                default only looks at *directory* itself.

        Returns:
            list[str]: Notebook paths in sorted order, so that results do not
            depend on the order in which the OS lists the directory.
        """
        checkpoint_dir = NotebookProcessor._CHECKPOINT_DIR
        notebooks = []
        if recursive:
            for root, dirs, files in os.walk(directory):
                # Prune in place so os.walk never enters checkpoint folders.
                dirs[:] = [d for d in dirs if d != checkpoint_dir]
                notebooks.extend(
                    os.path.join(root, name)
                    for name in files
                    if name.endswith('.ipynb')
                )
        else:
            for name in os.listdir(directory):
                filepath = os.path.join(directory, name)
                if name.endswith('.ipynb') and checkpoint_dir not in filepath:
                    notebooks.append(filepath)
        return sorted(notebooks)

    @staticmethod
    def cell_similarity(cell_a, cell_b):
        """Return the ``SequenceMatcher`` ratio (0.0 to 1.0) of two cell sources."""
        return SequenceMatcher(None, cell_a, cell_b).ratio()

    @staticmethod
    def _code_cell_texts(notebook_path):
        """Return the stripped, non-empty source of each code cell in a file."""
        with open(notebook_path, 'r', encoding='utf-8') as file:
            notebook = json.load(file)
        texts = (
            ''.join(cell['source']).strip()
            for cell in notebook['cells']
            if cell['cell_type'] == 'code'
        )
        return [text for text in texts if text]

    @classmethod
    def compare_notebooks(cls, notebook_path_a, notebook_path_b, min_similarity=0.0):
        """Classify the code cells of notebook A by their best match in B.

        Every non-empty code cell of A is compared with every non-empty code
        cell of B and classified by its best similarity ratio. The counts
        describe A's cells, so swapping the arguments can give a different
        result.

        Args:
            notebook_path_a: Notebook whose cells are classified.
            notebook_path_b: Notebook searched for matches.
            min_similarity: A cell counts as "similar" when its best ratio is
                strictly greater than this value (and below 1). The default
                of 0.0 counts any overlap at all as similar.

        Returns:
            tuple[int, int, int]: ``(identical, similar, distinct)``.
        """
        cells_a = cls._code_cell_texts(notebook_path_a)
        if notebook_path_a == notebook_path_b:
            # Self-comparison: every non-empty cell matches itself, so the
            # pairwise comparison can be skipped.
            return len(cells_a), 0, 0
        cells_b = cls._code_cell_texts(notebook_path_b)
        exact_b = set(cells_b)

        identical, similar, distinct = 0, 0, 0
        for text_a in cells_a:
            if text_a in exact_b:
                # Equal strings have ratio 1; skip the expensive matcher.
                best_match = 1.0
            else:
                best_match = max(
                    (cls.cell_similarity(text_a, text_b) for text_b in cells_b),
                    default=0.0,
                )
            if best_match == 1:
                identical += 1
            elif best_match > min_similarity:
                similar += 1
            else:
                distinct += 1
        return identical, similar, distinct

    @classmethod
    def generate_comparison_matrix(cls, directory, min_similarity=0.0):
        """Compare every unordered pair of notebooks found in *directory*.

        Args:
            directory: Directory passed to ``find_notebooks`` (non-recursive).
            min_similarity: Forwarded to ``compare_notebooks``.

        Returns:
            list[tuple]: One ``(notebook_a, notebook_b, identical, similar,
            distinct)`` tuple per pair, where ``notebook_a`` comes before
            ``notebook_b`` in ``find_notebooks`` order.
        """
        notebooks = cls.find_notebooks(directory)
        comparison_results = []

        for i, notebook_a in enumerate(notebooks):
            for notebook_b in notebooks[i + 1:]:
                identical, similar, distinct = cls.compare_notebooks(
                    notebook_a, notebook_b, min_similarity
                )
                comparison_results.append((notebook_a, notebook_b, identical, similar, distinct))

        return comparison_results

    @classmethod
    def save_comparisons_to_txt(cls, comparison_results, output_path):
        """Write one line per comparison tuple to a text file.

        Args:
            comparison_results: Tuples as returned by
                ``generate_comparison_matrix``.
            output_path: Text file to create or overwrite.
        """
        with open(output_path, 'w', encoding='utf-8') as f:
            for result in comparison_results:
                f.write(f"{result[0]}, {result[1]}, Identical: {result[2]}, Similar: {result[3]}, Distinct: {result[4]}\n")

    @classmethod
    def save_comparisons_to_excel(cls, comparison_results, output_path):
        """Write the comparison tuples to an Excel file via pandas.

        Args:
            comparison_results: Tuples as returned by
                ``generate_comparison_matrix``.
            output_path: Excel file to create or overwrite.
        """
        df = pd.DataFrame(comparison_results, columns=['Notebook 1', 'Notebook 2', 'Identical Cells', 'Similar Cells', 'Distinct Cells'])
        df.to_excel(output_path, index=False)
        print(f"Comparison results saved to {output_path}")

    @classmethod
    def generate_graph(cls, comparison_results, similarity_threshold):
        """Draw a graph linking notebooks that share enough matching cells.

        Args:
            comparison_results: Tuples as returned by
                ``generate_comparison_matrix``.
            similarity_threshold: Minimum ``identical + similar`` count for a
                pair of notebooks to be connected by an edge.
        """
        G = nx.Graph()

        # Add edges for notebook pairs that meet the similarity threshold
        for nb1, nb2, identical, similar, distinct in comparison_results:
            if identical + similar >= similarity_threshold:
                G.add_node(nb1, label=os.path.basename(nb1))
                G.add_node(nb2, label=os.path.basename(nb2))
                G.add_edge(nb1, nb2, weight=identical + similar)

        # Draw the graph, captioning nodes with the stored base-name labels
        # rather than the full paths used as node keys.
        pos = nx.spring_layout(G, k=0.5, iterations=20)
        node_labels = nx.get_node_attributes(G, 'label')
        nx.draw(G, pos, with_labels=True, labels=node_labels, node_size=2000, node_color="skyblue", font_size=10, font_weight="bold")
        edge_labels = nx.get_edge_attributes(G, 'weight')
        nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
        plt.show()


In [None]:
# Shared settings for the (currently commented-out) download/clean example
# at the bottom of this cell.

# Directories handed to NotebookProcessor.clean(directories_to_clean=...)
dirs = [
    "./img",
    "./img1",
    "./universeorthoviewdual",
    "./universeorthoview_48_48",
    "./PG_data",
    "./universeorthoview",
]

# Folder that receives the downloaded files
directory = './DataSupernovaLBLgov'

# Remote files to fetch
urls = [
    'https://irsa.ipac.caltech.edu/data/Planck/release_3/ancillary-data/cosmoparams/COM_PowerSpect_CMB-base-plikHM-TTTEEE-lowl-lowE-lensing-minimum-theory_R3.01.txt',
    'https://irsa.ipac.caltech.edu/data/Planck/release_2/all-sky-maps/maps/component-maps/cmb/COM_CMB_IQU-smica_1024_R2.02_full.fits',
]

# Example (disabled):
# notebook1 = NotebookProcessor("./AAAA1_AWS_UniverseMap-GoldenCopy.ipynb")
# notebook1.download_files(urls=urls, directory=directory)
# notebook1.clean_downloaded_files()
# notebook1.clean(directories_to_clean=dirs)

In [None]:
# notebook1 = NotebookProcessor("./AAA_Final_CMB_Modeling_UniverseMap.ipynb")
# notebook1.export_to_py("./AAA_Final_CMB_Modeling_UniverseMap.py")

In [None]:
# Compare every pair of notebooks found in the current directory.
directory = "./"
comparison_results = NotebookProcessor.generate_comparison_matrix(directory)

# The results can then be exported or plotted:
# NotebookProcessor.save_comparisons_to_txt(comparison_results, output_path)
# NotebookProcessor.generate_graph(comparison_results, similarity_threshold)

In [None]:
# Write the comparison results to a plain-text report, one line per pair.
output_path_txt = "./comparison_results.txt"
NotebookProcessor.save_comparisons_to_txt(
    comparison_results,
    output_path_txt,
)

In [None]:
# Write the same comparison results to an Excel workbook.
output_path_xls = "./comparison_results.xlsx"
NotebookProcessor.save_comparisons_to_excel(
    comparison_results,
    output_path_xls,
)

In [None]:
# Threshold passed to generate_graph for deciding which notebook pairs
# are connected in the plot.
similarity_threshold = 115
NotebookProcessor.generate_graph(
    comparison_results,
    similarity_threshold,
)

In [None]:
# Sanity check: compare a notebook with itself and report the three counts.
notebook_path = "./AAAA1_AWS_UniverseMap-GoldenCopy.ipynb"
self_comparison = NotebookProcessor.compare_notebooks(notebook_path, notebook_path)
identical, similar, distinct = self_comparison

print(f"Identical: {identical}, Similar: {similar}, Distinct: {distinct}")


In [None]:
# Export the code cells of `notebook1` to a Python script.
# NOTE(review): in the visible source `notebook1` is only assigned in
# commented-out lines, so this call raises NameError unless one of those
# lines has been run earlier in the session. TODO: confirm which notebook is
# meant to be exported here and create the NotebookProcessor explicitly.
notebook1.export_to_py("./CMB_HU_latest_to_git.py")

In [None]:


# generate_comparison_matrix returns 5-tuples of the form
#   (notebook_a, notebook_b, identical, similar, distinct)
# Example: [('nb1.ipynb', 'nb2.ipynb', 5, 2, 1), ...]
directory = "./"
comparison_results = NotebookProcessor.generate_comparison_matrix(directory)
# print(comparison_results)

G = nx.Graph()

# Connect two notebooks when the third tuple field (the identical-cell count)
# is positive, and use that count as the edge weight.
for record in comparison_results:
    nb1, nb2, similarity, s2, s3 = record
    if similarity <= 0:
        continue
    G.add_edge(nb1, nb2, weight=similarity)

# Lay out the nodes, then draw nodes, edges and labels as separate layers.
pos = nx.spring_layout(G)

nx.draw_networkx_nodes(G, pos, node_size=700)

# Edge thickness follows the edge weight.
weights = nx.get_edge_attributes(G, 'weight')
edge_widths = list(weights.values())
nx.draw_networkx_edges(G, pos, width=edge_widths)

nx.draw_networkx_labels(G, pos, font_size=10)

plt.axis('off')
plt.show()


In [None]:
# Dump the raw comparison tuples for manual inspection.
print(comparison_results)

In [None]:
import nbformat

def print_first_cells(notebook_path, num_cells=3):
    """Print the source of the leading code cells of a notebook.

    The notebook is read with nbformat (as version 4). The source of each of
    the first ``num_cells`` code cells is printed, followed by a ``---`` line.
    """
    with open(notebook_path, 'r', encoding='utf-8') as handle:
        notebook = nbformat.read(handle, as_version=4)

    only_code = [entry for entry in notebook.cells if entry.cell_type == 'code']
    leading = only_code[:num_cells]
    for code_cell in leading:
        print(''.join(code_cell.source))
        print('---')  # Separator between cells

# Example usage: show the first 10 code cells of the golden-copy notebook.
print_first_cells('./AAAA1_AWS_UniverseMap-GoldenCopy.ipynb',10)


In [None]:
def compare_specific_cells(notebook_path_a, notebook_path_b, cell_index):
    """Print the cell at ``cell_index`` from two notebooks for manual comparison.

    Both notebooks are read with nbformat (as version 4). The index counts
    all cells, not just code cells. When a notebook has no cell at that
    index, the text "Cell index out of range" is printed in its place.
    """
    def _read(path):
        # Parse one notebook file into an nbformat node.
        with open(path, 'r', encoding='utf-8') as handle:
            return nbformat.read(handle, as_version=4)

    # Read both files before indexing into either of them.
    first_nb = _read(notebook_path_a)
    second_nb = _read(notebook_path_b)

    if cell_index < len(first_nb.cells):
        cell_a = first_nb.cells[cell_index].source
    else:
        cell_a = "Cell index out of range"

    if cell_index < len(second_nb.cells):
        cell_b = second_nb.cells[cell_index].source
    else:
        cell_b = "Cell index out of range"

    # Show the two sources one after the other.
    print("Notebook A cell content:")
    print(cell_a)
    print("\nNotebook B cell content:")
    print(cell_b)

# Example usage: print the cell at index 10 of each notebook (the index
# counts all cells, not only code cells).
compare_specific_cells('./AAAA_qutip_A.ipynb', './AAAA1_AWS_UniverseMap-GoldenCopy.ipynb', 10)
