<a href="https://colab.research.google.com/github/ort-eila/git_kundaje_annotations/blob/main/step_1_kb_from_wikipedia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Build a knowledge base (KB) from the text of Wikipedia articles

In [None]:
# Mount Google Drive at /content/drive. When the drive is already mounted the
# call only reports that fact and does not remount.
from google.colab import drive
drive.mount('/content/drive')

# Folder inside the mounted Drive that later receives a copy of the generated
# TSV file (used as the destination of shutil.copy further down).
destination_folder = '/content/drive/MyDrive/bio-llm/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import requests
import bs4
from bs4 import BeautifulSoup
import os
import pandas as pd
import string  # Import the string module

In [None]:
def extract_text_from_wikipedia(url, txt_filename):
    """Download a web page and save the text of its paragraphs as one line.

    The text of every ``<p>`` element is collected, the paragraphs are joined
    with a single space, and every run of whitespace (newlines, tabs, repeated
    or non-breaking spaces) is collapsed to one space before writing.

    Args:
        url: Address of the page to download.
        txt_filename: Path of the UTF-8 text file to write.

    Returns:
        None. Failures are reported with a printed message and are not raised,
        so callers have to check for the output file themselves.
    """
    try:
        # A timeout keeps one unresponsive request from hanging the whole run.
        res = requests.get(url, timeout=30)
        res.raise_for_status()
        wiki = bs4.BeautifulSoup(res.text, "html.parser")

        # Extract all paragraphs into a list
        paragraphs = [paragraph.getText() for paragraph in wiki.select('p')]

        # Join with a space and collapse whitespace instead of deleting the
        # newlines: deleting them glued the last word of one paragraph to the
        # first word of the next ("...end.Next paragraph").
        text = ' '.join(' '.join(paragraphs).split())

        # Write the flattened text to the requested file
        with open(txt_filename, "w", encoding="utf-8") as f:
            f.write(text)

        print(f"Extracted text from '{url}' and saved to '{txt_filename}'.")
    except Exception as e:
        # Deliberately best-effort: one bad page must not abort a batch run.
        print(f"Error extracting text: {str(e)}")


In [None]:
def update_csv_with_wikipedia_details(url_list, csv_filename, remove_wiki_page_list=None):
    """Add missing Wikipedia pages to a tab-separated ``title``/``text`` file.

    For every title in ``url_list`` that is not yet in the file, the page
    ``https://en.wikipedia.org/wiki/<title>`` is fetched through
    ``extract_text_from_wikipedia`` into ``<title>.txt`` and one row is
    appended to ``csv_filename``. Afterwards, rows whose title appears in
    ``remove_wiki_page_list`` are deleted from the file.

    Args:
        url_list: Iterable of Wikipedia page titles to ensure are present.
        csv_filename: Path of the tab-separated file (header ``title``, ``text``).
            It is created when it does not exist.
        remove_wiki_page_list: Optional collection of titles to delete from the
            file. ``None`` or an empty collection removes nothing.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    columns = ['title', 'text']

    # The file is written tab-separated below, so it has to be read the same
    # way; with the default comma separator the 'title' column is never found.
    # A zero-byte file is treated like a missing one (read_csv cannot parse it).
    file_has_data = os.path.exists(csv_filename) and os.path.getsize(csv_filename) > 0
    if file_has_data:
        existing_df = pd.read_csv(csv_filename, sep="\t")
    else:
        existing_df = pd.DataFrame(columns=columns)

    # Titles already stored, plus the ones added during this call, so a title
    # repeated inside url_list is downloaded only once.
    known_titles = set(existing_df['title'])

    # One dict per newly downloaded page; turned into a DataFrame once at the end.
    new_rows = []

    for wiki_page in url_list:
        if wiki_page in known_titles:
            print(f"Skipping '{wiki_page}' as it already exists in the CSV.")
            continue

        # Define the Wikipedia page URL
        url = f'https://en.wikipedia.org/wiki/{wiki_page}'

        # Titles such as "DLD/NP1" contain a path separator; replace it so the
        # scratch file is created in the working directory instead of failing.
        safe_name = wiki_page.replace('/', '_').replace(os.sep, '_')
        txt_filename = f"{safe_name}.txt"

        # Success is detected by the existence of the scratch file, so a file
        # left over from an earlier run must not survive a failed download.
        if os.path.exists(txt_filename):
            os.remove(txt_filename)

        extract_text_from_wikipedia(url, txt_filename)

        if os.path.exists(txt_filename):
            with open(txt_filename, 'r', encoding='utf-8') as txt_file:
                text = txt_file.read()

            new_rows.append({'title': wiki_page, 'text': text})
            known_titles.add(wiki_page)

            # The scratch file is intentionally kept on disk for inspection.
            print(f"Downloaded '{wiki_page}' and created DataFrame.")
        else:
            print(f"Failed to download '{wiki_page}'.")

    new_df = pd.DataFrame(new_rows, columns=columns)

    if new_rows:
        # Append below the existing rows, or create the file with a header.
        new_df.to_csv(
            csv_filename,
            mode='a' if file_has_data else 'w',
            sep="\t",
            header=not file_has_data,
            index=False,
        )
        print(f"Updated CSV file '{csv_filename}' with missing Wikipedia page details.")
    else:
        print("No new data to update in the CSV.")

    # Remove entries from the file based on remove_wiki_page_list
    if remove_wiki_page_list:
        # Filter existing AND newly added rows; filtering only the rows loaded
        # at the start would silently drop everything appended above.
        frames = [frame for frame in (existing_df, new_df) if not frame.empty]
        if frames:
            combined_df = pd.concat(frames, ignore_index=True)
        else:
            combined_df = pd.DataFrame(columns=columns)

        kept_df = combined_df[~combined_df['title'].isin(remove_wiki_page_list)]
        # Keep the tab separator so the file stays readable by the next call.
        kept_df.to_csv(csv_filename, sep="\t", index=False)

        print(f"Removed specified entries from CSV.")


In [None]:
# Wikipedia page titles (the part of the URL after /wiki/) to download.
wiki_page_list = [
    "BRCA_mutation",
    "BRCA1",
    "BRCA2",
    "Tumor_suppressor_genes",
    "Adenomatous_polyposis_coli",
    "ATOH1",
    "BCL10",
    "Cadherin-1",
    "Capicua_(protein)",
    "CDKN1B",
    "CHEK2",
    "Cyclin-dependent_kinase_inhibitor_1C",
    "DHX15",
    "DLD/NP1",
    "Protein",
    "ABL_(gene)",
    "AKT1",
    "Androgen_receptor",
    "Ataxia_telangiectasia_and_Rad3_related",
    "threonine_kinase",
    "ATF1",
    "BACH1",
    "BARD1",
    "BRCC3",
    "BRE_(gene)",
    "BRIP1",
    "Transcription_factor_Jun",
    "CHEK2",
    "CLSPN",
    "Cofactor_of_BRCA1",
    "CREB-binding_protein",
    "CSNK2B",
    "CSTF2",
    "Cyclin-dependent_kinase_2",
    "RNA_Helicase_A",
    "ELK4",
    "EP300",
    "Estrogen_receptor_alpha",
    "FANCA",
    "FANCD2",
    "FHL2",
    "H2AFX",
    "JUNB",
    "JunD",
    "LMO4",
    "MAP3K3",
    "MED1",
    "MED17",
    "MED21",
    "MED24",
    "MRE11A",
    "MSH2",
    "MSH3",
    "MSH6",
    # Add more wiki_page names as needed
]

# Drop repeated titles (e.g. "CHEK2") while keeping the order written above.
# list(set(...)) also de-duplicates, but string hashing is randomized per
# interpreter session, so the download order and the row order of the
# resulting TSV changed from one run to the next.
wiki_page_list = list(dict.fromkeys(wiki_page_list))

# Page titles to delete from the TSV; leave the list empty to remove nothing.
remove_wiki_page_list = [
    # "BRCA1",
    # "BRCA2",
    # "Tumor_suppressor_genes",
    # "Adenomatous_polyposis_coli",
    # "ATOH1",
    # "BCL10",
    # "Cadherin-1",
    # "Capicua_(protein)",
    # "CDKN1B",
    # Add more page titles to remove as needed
]



In [None]:
# Define the source file path
# Local name of the tab-separated output file built by this notebook.
source_file = 'wikipedia_details.tsv'

# Start from a clean slate: discard the output of any previous run so the
# file is rebuilt from scratch.
if os.path.exists(source_file):
    os.remove(source_file)

# Download every listed page and write the title/text table.
update_csv_with_wikipedia_details(wiki_page_list, source_file, remove_wiki_page_list)


Extracted text from 'https://en.wikipedia.org/wiki/Cadherin-1' and saved to 'Cadherin-1.txt'.
Downloaded 'Cadherin-1' and created DataFrame.
Extracted text from 'https://en.wikipedia.org/wiki/Androgen_receptor' and saved to 'Androgen_receptor.txt'.
Downloaded 'Androgen_receptor' and created DataFrame.
Extracted text from 'https://en.wikipedia.org/wiki/ATOH1' and saved to 'ATOH1.txt'.
Downloaded 'ATOH1' and created DataFrame.
Extracted text from 'https://en.wikipedia.org/wiki/Cyclin-dependent_kinase_inhibitor_1C' and saved to 'Cyclin-dependent_kinase_inhibitor_1C.txt'.
Downloaded 'Cyclin-dependent_kinase_inhibitor_1C' and created DataFrame.
Extracted text from 'https://en.wikipedia.org/wiki/CSTF2' and saved to 'CSTF2.txt'.
Downloaded 'CSTF2' and created DataFrame.
Extracted text from 'https://en.wikipedia.org/wiki/Protein' and saved to 'Protein.txt'.
Downloaded 'Protein' and created DataFrame.
Extracted text from 'https://en.wikipedia.org/wiki/EP300' and saved to 'EP300.txt'.
Downloaded

In [None]:
import shutil



# Copy the generated TSV file into the Google Drive folder. The call is the
# last expression of the cell, so its return value (the path of the copy) is
# shown as the cell output.
shutil.copy(source_file, destination_folder)


'/content/drive/MyDrive/bio-llm/wikipedia_details.tsv'

In [None]:
# Show the local file name that was copied.
source_file

'wikipedia_details.tsv'

In [None]:
# Show the Drive folder the file was copied into.
destination_folder

'/content/drive/MyDrive/bio-llm/'

In [None]:
# Debug helper: uncomment to list the working directory, newest files first.
# !ls -lt