In [1]:
%pip install selenium
%pip install pandas
%pip install -U sentence-transformers

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
from time import sleep
from urllib.parse import urljoin
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import json

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def scrape_directory(url):
    """Scrape a paginated faculty directory into a DataFrame.

    Opens ``url`` in a new Chrome session, reads every ``views-row`` entry on
    the page, then follows the "Next" pager link until it is missing, cannot
    be clicked, or clicking it no longer replaces the listing.

    Parameters
    ----------
    url : str
        Address of the first directory page. Also used as the base when
        resolving each profile link with ``urljoin``.

    Returns
    -------
    pandas.DataFrame
        One row per listing that exposed all expected elements, with the
        columns ``Name``, ``Title`` and ``url``.

    Notes
    -----
    A ``TimeoutException`` while waiting for listings is reported with a
    message and whatever was collected so far is returned. The browser is
    always closed before the function returns or raises.
    """
    driver = webdriver.Chrome()
    names = []
    bio_urls = []
    titles = []

    try:
        driver.get(url)

        while True:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "views-row"))
            )

            faculty_listings = driver.find_elements(By.CLASS_NAME, "views-row")
            for faculty in faculty_listings:
                try:
                    article_element = faculty.find_element(By.TAG_NAME, "article")
                    link_element = article_element.find_element(By.TAG_NAME, "a")

                    name = link_element.find_element(By.CSS_SELECTOR, "h2").text

                    # The title is the <div> that directly follows the profile link.
                    title_div = article_element.find_element(By.XPATH, "./a/following-sibling::div")
                    title = title_div.text

                    link = link_element.get_attribute('href')
                    bio_url = urljoin(url, link)

                    # Append only after every lookup succeeded so the three
                    # lists always stay the same length.
                    names.append(name)
                    titles.append(title)
                    bio_urls.append(bio_url)
                    print(f"Name: {name}, Title: {title}, URL: {bio_url}")
                except NoSuchElementException:
                    # Rows that are not faculty cards are skipped.
                    continue

            try:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, "Next"))
                )

                retries = 5
                for _ in range(retries):
                    try:
                        next_button.click()
                        break
                    except ElementClickInterceptedException:
                        # Something is overlapping the link; give the page a moment.
                        sleep(1)
                else:
                    print("Failed to click the 'Next' button after several retries.")
                    break

                # Wait until the rows that were just read are detached from the
                # DOM. Without this the next iteration can re-read the old
                # listing, and a "Next" link that does nothing on the last page
                # would keep the loop running forever. A timeout here means the
                # listing did not change, so pagination is finished.
                if faculty_listings:
                    WebDriverWait(driver, 10).until(
                        EC.staleness_of(faculty_listings[0])
                    )

            except (TimeoutException, NoSuchElementException):
                # No usable "Next" link, or the listing did not change: last page.
                break

    except TimeoutException:
        print("Timeout while waiting for page to load")
    finally:
        driver.quit()

    df = pd.DataFrame({'Name': names, 'Title': titles, 'url': bio_urls})

    return df

def extract_biographies(df):
    """Visit each profile URL in ``df`` and record its research text.

    For every row, the page at ``row['url']`` is loaded and all elements with
    the class ``field--name-field-cu-wysiwyg`` are collected. The text of the
    second and third such elements (indexes 1 and 2), when present, is joined
    with a single space and stored in the ``'Research Summary'`` column. Rows
    whose page yields no text, or whose page fails to load, keep ``None``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'url'`` column. It is modified in place: the
        ``'Research Summary'`` column is created (or reset) and filled.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    driver = webdriver.Chrome()

    # Create a new column 'Research Summary'
    df['Research Summary'] = None  # Initialize with None

    try:
        for index, row in df.iterrows():
            bio_url = row['url']
            try:
                driver.get(bio_url)

                # find_elements returns a (possibly empty) list and never raises
                # NoSuchElementException, so missing sections must be handled by
                # length. Slicing keeps whatever is available instead of raising
                # IndexError and discarding a summary that was already found.
                sections = driver.find_elements(By.CLASS_NAME, 'field--name-field-cu-wysiwyg')

                # Index 1 is the short research description, index 2 the full one.
                full_bio_text = " ".join(section.text for section in sections[1:3])

                # Assign the 'Research Summary' to the corresponding row in the DataFrame
                df.at[index, 'Research Summary'] = full_bio_text if full_bio_text else None

            except TimeoutException:
                print(f"Timeout while loading {bio_url}")
            except Exception as e:
                print(f"Failed to extract biography from {bio_url}: {e}")
    finally:
        # Close the browser even if the loop is interrupted (e.g. kernel interrupt).
        driver.quit()

    return df

# Directory listing page that seeds the scrape.
url = "https://biology.columbia.edu/content/faculty-directory"
# First collect one row per listed person (Name, Title, url), then visit each
# profile URL to fill in the 'Research Summary' column. Both steps drive a
# Chrome session, so this cell needs a working local browser/driver.
df = scrape_directory(url)
df = extract_biographies(df)

Name: Ishmail Abdus-Saboor, Title: Associate Professor of Biological Sciences (in the Mortimer B. Zuckerman Mind Brain Behavior Institute), URL: https://biology.columbia.edu/content/ishmail-abdus-saboor
Name: Peter Andolfatto, Title: Professor of Biological Sciences, URL: https://biology.columbia.edu/content/peter-andolfatto
Name: Erin L. Barnhart, Title: Assistant Professor of Biological Sciences, Dept of Biological Sciences, URL: https://biology.columbia.edu/content/erin-l-barnhart
Name: Harmen Bussemaker, Title: Professor of Biological Sciences and of Systems Biology, URL: https://biology.columbia.edu/content/harmen-bussemaker
Name: Martin Chalfie, Title: University Professor in Biological Sciences, URL: https://biology.columbia.edu/content/martin-chalfie
Name: Lars Dietrich, Title: Associate Professor of Biological Sciences, URL: https://biology.columbia.edu/content/lars-dietrich
Name: Laura Duvall, Title: Assistant Professor of Biological Sciences, URL: https://biology.columbia.ed

In [4]:
# Initialize the sentence embedder model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

# Encode each 'Research Summary' one at a time; encode() is given a one-item
# list and the single resulting vector is taken with [0]. Rows whose summary is
# falsy (None or an empty string) get None instead of an embedding.
df['Research Embedding'] = df['Research Summary'].apply(lambda x: model.encode([x])[0] if x else None)
# `gdf` is a second name for the same DataFrame object, not a copy.
gdf = df

In [6]:
# `gdf` is a second name for `df` (no copy is made), so the conversion below
# also changes `df`.
gdf = df

# json.dump cannot serialise numpy arrays, so store every embedding as a plain
# list. Values that are already lists are kept unchanged, which makes this cell
# safe to re-run: previously a second run found no ndarray left and replaced
# every embedding with None. Anything else (e.g. a missing embedding) becomes None.
gdf['Research Embedding'] = gdf['Research Embedding'].apply(
    lambda x: x.tolist() if isinstance(x, np.ndarray) else (x if isinstance(x, list) else None)
)

# Convert the DataFrame to a list of dictionaries, one per row
data_list = gdf.to_dict(orient='records')

with open('faculty_data_bio.json', 'w') as json_file:
    json.dump(data_list, json_file)

In [8]:
# Load the JSON file back into a DataFrame
with open('faculty_data_bio.json', 'r') as json_file:
    loaded_data_list = json.load(json_file)

df = pd.DataFrame(loaded_data_list)

# Now, gdf_loaded should have the 'Research Embedding' column in its original form
df.head()

Unnamed: 0,Name,Title,url,Research Summary,Research Embedding
0,Ishmail Abdus-Saboor,Associate Professor of Biological Sciences (in...,https://biology.columbia.edu/content/ishmail-a...,Genes and neural circuits for pain and touch b...,
1,Peter Andolfatto,Professor of Biological Sciences,https://biology.columbia.edu/content/peter-and...,,
2,Erin L. Barnhart,"Assistant Professor of Biological Sciences, De...",https://biology.columbia.edu/content/erin-l-ba...,,
3,Harmen Bussemaker,Professor of Biological Sciences and of System...,https://biology.columbia.edu/content/harmen-bu...,Data-driven predictive modeling of gene regula...,
4,Martin Chalfie,University Professor in Biological Sciences,https://biology.columbia.edu/content/martin-ch...,American Academy of Arts & Science Member\nNat...,


In [None]:
# Load the sentence embedder again so this cell does not rely on `model`
# still being defined from an earlier cell.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
# Free-text description of the proposal to match against faculty research.
# The surrounding newlines and indentation are part of the encoded string.
proposal = ["""
medical image machine learning
            """]

# encode() receives a one-item list; [0] selects the single resulting vector.
proposal_embedding = model.encode(proposal)[0]

In [None]:
# Calculate dot product for each row
# Score each row by the dot product of its embedding with the proposal
# embedding; rows that have no embedding get no score.
df['Dot Product'] = df['Research Embedding'].apply(
    lambda embedding: None if embedding is None else np.dot(proposal_embedding, embedding)
)

# Best-scoring rows first.
df_sorted = df.sort_values('Dot Product', ascending=False)

# Keep the five highest-ranked rows.
top_5 = df_sorted.iloc[:5]

# Show the research summary of the single best match.
print(top_5['Research Summary'].iloc[0])

top_5.head()

In [None]:
# Replace the proposal embedding with its difference from the top-ranked row's
# embedding.
# NOTE(review): `proposal_embedding` is overwritten from its own value, so
# running this cell again subtracts the current top match a second time.
residual = proposal_embedding - top_5.iloc[0]['Research Embedding']
# print(residual)
proposal_embedding = residual

In [None]:
def _unit_embedding(text):
    """Encode ``text`` with ``model`` and scale the vector to unit L2 length."""
    vector = model.encode(text)
    return vector / np.linalg.norm(vector)


# Unit-length embeddings of the four words used in the analogy.
king = _unit_embedding('king')
man = _unit_embedding('man')
woman = _unit_embedding('woman')
queen = _unit_embedding('queen')

# Classic analogy check: move "king" along the man -> woman direction, then
# re-normalise so the dot products below are cosine similarities.
queen_test = king - man + woman
norm = np.linalg.norm(queen_test)
queen_test = queen_test / norm

print('\'king - man + woman\' match to \'king\': ', np.dot(king, queen_test))
print('\'king - man + woman\' match to \'queen\': ', np.dot(queen, queen_test))

In [None]:
# Calculate dot product for each row
# Re-score every row against the current proposal embedding; rows without an
# embedding are left unscored.
df['Dot Product'] = df['Research Embedding'].apply(
    lambda embedding: None if embedding is None else np.dot(proposal_embedding, embedding)
)

# Rank rows from highest to lowest score.
df_sorted = df.sort_values('Dot Product', ascending=False)

# Five best-ranked rows.
top_5 = df_sorted.iloc[:5]

# Research summary of the best match.
print(top_5['Research Summary'].iloc[0])

In [None]:
# Remove the scratch score column before exporting. errors='ignore' keeps this
# cell re-runnable: once the column is gone, a plain drop would raise KeyError.
df = df.drop(columns=['Dot Product'], errors='ignore')
df.to_csv('faculty_data.csv', index=False)

In [None]:
import sqlite3
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException

def create_database(db_path='biographies.db'):
    """Create the ``biographies`` table if it does not exist yet.

    Parameters
    ----------
    db_path : str, optional
        SQLite database file to open (it is created when missing). Defaults to
        ``'biographies.db'``, the path this function always used before.

    The table has an integer primary key ``id``, a unique ``url`` and a
    ``biography`` text column. Calling the function again on an existing
    database leaves it unchanged.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS biographies (
                id INTEGER PRIMARY KEY,
                url TEXT UNIQUE,
                biography TEXT
            )
        ''')
        conn.commit()
    finally:
        # Release the file handle even if the statement fails.
        conn.close()

def insert_biography(url, biography, db_path='biographies.db'):
    """Store one biography in the ``biographies`` table.

    Parameters
    ----------
    url : str
        Profile URL; the table requires it to be unique.
    biography : str
        Text to store for that URL.
    db_path : str, optional
        SQLite database file to write to. Defaults to ``'biographies.db'``,
        the path this function always used before.

    If ``url`` is already present the existing row is kept and a message is
    printed; no exception is raised for that case.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("INSERT INTO biographies (url, biography) VALUES (?, ?)", (url, biography))
        conn.commit()
    except sqlite3.IntegrityError:
        print(f"URL already exists in database: {url}")
    finally:
        conn.close()

def extract_biographies(bio_urls):
    """Load each URL in ``bio_urls`` and store its biography text in SQLite.

    For every page, the text of the ``body.col-lg-7`` element (followed by a
    space) and of the
    ``col-lg-7.entity.entity-paragraphs-item.paragraphs-item-content`` element
    are concatenated; either part is skipped when its element is missing. A
    non-empty result is passed to ``insert_biography``. Pages that time out or
    fail are reported with a message and skipped.

    Parameters
    ----------
    bio_urls : iterable of str
        Profile page addresses to visit.

    Notes
    -----
    This notebook defines ``extract_biographies`` twice; once this cell has
    run, this version (taking a list of URLs) replaces the earlier one that
    takes a DataFrame.
    """
    driver = webdriver.Chrome()

    try:
        for bio_url in bio_urls:
            try:
                driver.get(bio_url)
                full_bio_text = ""

                try:
                    body_section = driver.find_element(By.CLASS_NAME, 'body.col-lg-7')
                    full_bio_text += body_section.text + " "
                except NoSuchElementException:
                    # Page has no short description block.
                    pass

                try:
                    target_section = driver.find_element(By.CLASS_NAME, 'col-lg-7.entity.entity-paragraphs-item.paragraphs-item-content')
                    full_bio_text += target_section.text
                except NoSuchElementException:
                    # Page has no long description block.
                    pass

                if full_bio_text:
                    print(f"Storing biography from {bio_url}")
                    insert_biography(bio_url, full_bio_text)

            except TimeoutException:
                print(f"Timeout while loading {bio_url}")
            except Exception as e:
                print(f"Failed to extract biography from {bio_url}: {e}")
    finally:
        # `except Exception` does not catch KeyboardInterrupt/SystemExit, so
        # without this an interrupted run would leave the browser open.
        driver.quit()

# Create the database and table
# Create the database and table
create_database()

# Assuming you have a list of bio URLs
# NOTE(review): the single entry below is a placeholder string, not a real
# address — replace it with actual profile URLs before running this cell.
bio_urls = ["your_list_of_bio_urls_here"]
extract_biographies(bio_urls)