In [1]:
%pip install selenium
%pip install pandas
%pip install -U sentence-transformers

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
from time import sleep
from urllib.parse import urljoin
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import json

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def scrape_directory(url):
    driver = webdriver.Chrome()
    names = []
    bio_urls = []
    titles = []

    try:
        driver.get(url)

        while True:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "node-biography"))
            )

            faculty_listings = driver.find_elements(By.CLASS_NAME, "node-biography")
            for faculty in faculty_listings:
                try:
                    name = faculty.find_element(By.CSS_SELECTOR, ".nodetitle a").text
                    title = faculty.find_element(By.CLASS_NAME, "biotitle").text
                    link = faculty.find_element(By.CSS_SELECTOR, ".nodetitle a").get_attribute('href')
                    bio_url = urljoin(url, link)
                    names.append(name)
                    titles.append(title)
                    bio_urls.append(bio_url)
                    # print(f"Name: {name}, Title: {title}, URL: {bio_url}")
                except NoSuchElementException:
                    continue

            try:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, "Next"))
                )

                retries = 5
                for _ in range(retries):
                    try:
                        next_button.click()
                        break
                    except ElementClickInterceptedException:
                        sleep(1)
                else:
                    print("Failed to click the 'Next' button after several retries.")
                    break

            except (TimeoutException, NoSuchElementException):
                # print("No 'Next' button found, or it's not clickable.")
                break

    except TimeoutException:
        print("Timeout while waiting for page to load")
    finally:
        driver.quit()

    df = pd.DataFrame({'Name': names, 'Title': titles, 'url': bio_urls})

    return df

def extract_biographies(df):
    driver = webdriver.Chrome()

    # Create a new column 'Research Summary'
    df['Research Summary'] = None  # Initialize with None

    for index, row in df.iterrows():
        bio_url = row['url']
        try:
            driver.get(bio_url)
            full_bio_text = ""

            # Try to find the div with class "body col-lg-7" and extract text
            try:
                body_section = driver.find_element(By.CLASS_NAME, 'body.col-lg-7')
                full_bio_text += body_section.text + " "
            except NoSuchElementException:
                # If the 'body col-lg-7' section is not found, continue without raising an error
                pass

            # Find the original target section and extract text
            try:
                target_section = driver.find_element(By.CLASS_NAME, 'col-lg-7.entity.entity-paragraphs-item.paragraphs-item-content')
                full_bio_text += target_section.text
            except NoSuchElementException:
                # If the target section is not found, continue without raising an error
                pass

            # Assign the 'Research Summary' to the corresponding row in the DataFrame
            if full_bio_text:
                df.at[index, 'Research Summary'] = full_bio_text
            else:
                df.at[index, 'Research Summary'] = None

        except TimeoutException:
            print(f"Timeout while loading {bio_url}")
        except Exception as e:
            print(f"Failed to extract biography from {bio_url}: {e}")

    driver.quit()
    return df


url = "https://www.engineering.columbia.edu/directory"
df = scrape_directory(url)
df = extract_biographies(df)

In [4]:
# Initialize the sentence embedder model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

# Assuming 'Research Summary' column contains the text data
df['Research Embedding'] = df['Research Summary'].apply(lambda x: model.encode([x])[0] if x else None)
gdf = df

In [5]:
gdf = df
# Assuming 'gdf' is your original DataFrame

# Convert the 'Research Embedding' column to a list of lists
gdf['Research Embedding'] = gdf['Research Embedding'].apply(lambda x: x.tolist() if isinstance(x, np.ndarray) else None)

# Convert the DataFrame to a list of dictionaries
data_list = gdf.to_dict(orient='records')

# Save the list of dictionaries to a JSON file
with open('faculty_data_engineering.json', 'w') as json_file:
    json.dump(data_list, json_file)

In [6]:
# Load the JSON file back into a DataFrame
with open('faculty_data_engineering.json', 'r') as json_file:
    loaded_data_list = json.load(json_file)

df = pd.DataFrame(loaded_data_list)

# Now, gdf_loaded should have the 'Research Embedding' column in its original form
df.head()

Unnamed: 0,Name,Title,url,Research Summary,Research Embedding
0,ERBIL ABACI,Assistant Professor at the Department of Derma...,https://www.engineering.columbia.edu/erbil-abaci,He has received his Ph.D. degree at the Johns ...,"[-0.11000274121761322, 0.021928245201706886, -..."
1,PABLO ABREU,Senior Program Manager,https://www.engineering.columbia.edu/pablo-abreu,,
2,ANISH AGARWAL,ASSISTANT PROFESSOR OF INDUSTRIAL ENGINEERING ...,https://www.engineering.columbia.edu/faculty/a...,Anish’s research interests are in designing an...,"[-0.04620283097028732, 0.021099157631397247, 0..."
3,AJIT AGRAWAL,Adjunct Associate Professor,https://www.engineering.columbia.edu/ajit-agrawal,"Ajit is the founder of AKAnomics Inc, a startu...","[-0.049910321831703186, -0.10780283808708191, ..."
4,SUNIL K. AGRAWAL,PROFESSOR OF MECHANICAL ENGINEERING AND PROFES...,https://www.engineering.columbia.edu/faculty/s...,Sunil K. Agrawal has developed a highly visibl...,"[-0.07710743695497513, -0.12651987373828888, 0..."


In [16]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
proposal = ["""
medical image machine learning
            """]

proposal_embedding = model.encode(proposal)[0]

In [18]:
# Calculate dot product for each row
df['Dot Product'] = df['Research Embedding'].apply(lambda x: np.dot(proposal_embedding, x) if x is not None else None)

# Sort DataFrame based on dot product scores in descending order
df_sorted = df.sort_values(by='Dot Product', ascending=False)

top_5 = df_sorted.head(5)

print(top_5.iloc[0]['Research Summary'])

top_5.head()

He has received his Ph.D. degree at the Johns Hopkins University in bioengineering, and completed his postdoctoral trainings working on tissue engineering of blood-brain-barrier and skin, as part of the NIH Microphysiological Systems consortium. Dr. Abaci’s research is supported by private foundations and federal grants including the Ines Mandl Foundation, National Institutes of Health (NIH), National Science Foundation (NSF) and Department of Defense (DoD). His research lies at the intersection of regenerative medicine, stem cells, and skin and endothelial biology and disease. Dr. Abaci’s approach is based on reconstructing the integumentary system in vitro at different levels of complexity through (i) microfluidic- based iPSC-derived organoids, and (ii) 3D-bioprinted advanced skin models and grafts with the motivation to understand, repair and regenerate the skin and its appendages, and with the dream of ultimately matching and achieving beyond the inherent capabilities of human tiss

Unnamed: 0,Name,Title,url,Research Summary,Research Embedding,Dot Product
0,ERBIL ABACI,Assistant Professor at the Department of Derma...,https://www.engineering.columbia.edu/erbil-abaci,He has received his Ph.D. degree at the Johns ...,,
1,ANISH AGARWAL,ASSISTANT PROFESSOR OF INDUSTRIAL ENGINEERING ...,https://www.engineering.columbia.edu/faculty/a...,Anish’s research interests are in designing an...,,
2,SUNIL K. AGRAWAL,PROFESSOR OF MECHANICAL ENGINEERING AND PROFES...,https://www.engineering.columbia.edu/faculty/s...,Sunil K. Agrawal has developed a highly visibl...,,
3,SHIPRA AGRAWAL,CYRUS DERMAN ASSOCIATE PROFESSOR OF INDUSTRIAL...,https://www.engineering.columbia.edu/faculty/s...,Shipra Agrawal’s research spans several areas...,,
4,JOSH ALMAN,ASSISTANT PROFESSOR OF COMPUTER SCIENCE,https://www.engineering.columbia.edu/josh-alman,Josh Alman is a theoretical computer scientist...,,


In [19]:
residual = proposal_embedding - top_5.iloc[0]['Research Embedding']
# print(residual)
proposal_embedding = residual

TypeError: unsupported operand type(s) for -: 'float' and 'NoneType'

In [None]:
king = model.encode('king')
norm = np.linalg.norm(king)
king = king / norm

man = model.encode('man')
norm = np.linalg.norm(man)
man = man / norm

woman = model.encode('woman')
norm = np.linalg.norm(woman)
woman = woman / norm

queen = model.encode('queen')
norm = np.linalg.norm(queen)
queen = queen / norm

queen_test = king - man + woman
norm = np.linalg.norm(queen_test)
queen_test = queen_test / norm

print('\'king - man + woman\' match to \'king\': ', np.dot(king, queen_test))
print('\'king - man + woman\' match to \'queen\': ', np.dot(queen, queen_test))

'king - man + woman' match to 'king':  0.5789466
'king - man + woman' match to 'queen':  0.5880687


In [None]:
# Calculate dot product for each row
df['Dot Product'] = df['Research Embedding'].apply(lambda x: np.dot(proposal_embedding, x) if x is not None else None)

# Sort DataFrame based on dot product scores in descending order
df_sorted = df.sort_values(by='Dot Product', ascending=False)

top_5 = df_sorted.head(5)

print(top_5.iloc[0]['Research Summary'])

Sunil K. Agrawal has developed a highly visible interdisciplinary program in rehabilitation robotics involving faculty from School of Engineering and Applied Sciences and College of Physician and Surgeons at Columbia University. Neural disorders, such as stroke and Parkinson’s disease, limit the ability of humans to walk and perform activities of daily living. Pediatric disorders such as cerebral palsy, spina bifida, and Down’s syndrome delay the development of children and pose many functional limitations. Old age diminishes the sensory and motor systems. Through a range of pilot and clinical studies involving human subjects, Dr. Agrawal has showed that novel training robots can help humans to relearn, restore, or improve functional movements.  Agrawal has active collaborations with faculty in the departments of Neurology, Rehabilitation Medicine, Pediatric Orthopedics, Otolaryngology, Geriatrics, and Psychiatry. A selected list of these ongoing studies are: (i) Perturbation training 

In [None]:
df = df.drop(columns=['Dot Product'])
df.to_csv('faculty_data.csv', index=False)

In [None]:
import sqlite3
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException

def create_database():
    conn = sqlite3.connect('biographies.db')
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS biographies (
            id INTEGER PRIMARY KEY,
            url TEXT UNIQUE,
            biography TEXT
        )
    ''')
    conn.commit()
    conn.close()

def insert_biography(url, biography):
    conn = sqlite3.connect('biographies.db')
    cursor = conn.cursor()
    try:
        cursor.execute("INSERT INTO biographies (url, biography) VALUES (?, ?)", (url, biography))
        conn.commit()
    except sqlite3.IntegrityError:
        print(f"URL already exists in database: {url}")
    finally:
        conn.close()

def extract_biographies(bio_urls):
    driver = webdriver.Chrome()

    for bio_url in bio_urls:
        try:
            driver.get(bio_url)
            full_bio_text = ""

            try:
                body_section = driver.find_element(By.CLASS_NAME, 'body.col-lg-7')
                full_bio_text += body_section.text + " "
            except NoSuchElementException:
                pass

            try:
                target_section = driver.find_element(By.CLASS_NAME, 'col-lg-7.entity.entity-paragraphs-item.paragraphs-item-content')
                full_bio_text += target_section.text
            except NoSuchElementException:
                pass

            if full_bio_text:
                print(f"Storing biography from {bio_url}")
                insert_biography(bio_url, full_bio_text)

        except TimeoutException:
            print(f"Timeout while loading {bio_url}")
        except Exception as e:
            print(f"Failed to extract biography from {bio_url}: {e}")

    driver.quit()

# Create the database and table
create_database()

# Assuming you have a list of bio URLs
bio_urls = ["your_list_of_bio_urls_here"]
extract_biographies(bio_urls)
