# USA University/Colleges Ranking

**Scrapes the Uni123 Rank website (https://www.4icu.org/us/)**

USNews would be preferred, but it has extensive anti-scraping measures.

* Data Collection: Requests is used for simple webscraping as this website did not have extensive blocking features at this time
* Data Extraction: Beautiful Soup is used to parse the HTML; the extracted elements are collected in lists
* Data Storage: Scraped elements are stored in a dataframe for further processing.
* Data Cleaning: Only a small cleaning step is required


In [None]:
import random
import pandas as pd
import numpy as np  
import requests
from bs4 import BeautifulSoup

In [None]:
# Scraping https://www.4icu.org/us/ Uni123 Rank website because USNews was too difficult
url = "https://www.4icu.org/us/"

# requests has no default timeout; without one a stalled connection would
# hang this cell indefinitely.
response = requests.get(url, timeout=30)

# Fail loudly on a 4xx/5xx answer instead of silently parsing an error page
# (which would simply yield zero ranking rows further down).
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# Collect every table row on the page; the ranking entries are parsed from
# the <td> cells of these rows in a later cell.
school_divs = soup.find_all('tr')  # NOTE: despite the name, these are <tr> rows, not <div> elements
print(f"Number of divs found: {len(school_divs)}")

In [None]:
# Peek at a single row (index 2) to inspect its HTML structure before
# writing the parser. Raises IndexError if fewer than three rows were found.
school_divs_test = school_divs[2]
school_divs_test

In [None]:
rank = []
school = []
city = []

# A ranking entry is a table row with at least three cells, read here as
# rank, school name and city. Rows with fewer cells are skipped.
for div in school_divs:
    tds = div.find_all('td')

    if len(tds) >= 3:
        # Strip surrounding whitespace so the values are clean; the school
        # name is later used as a merge key and must compare equal to the
        # (already stripped) names from the second scrape.
        rank.append(tds[0].text.strip())
        school.append(tds[1].text.strip())
        city.append(tds[2].text)

# Create df from dictionary
df = pd.DataFrame({'Rank': rank, 'School': school, 'City': city})

# Remove the literal " ..." marker from the city text.
# regex=False is essential: as a regular expression ' ...' means "a space
# followed by ANY three characters", which would also eat the middle of
# multi-word city names (e.g. "New York ..." -> "Newk").
df['City'] = df['City'].str.replace(' ...', '', regex=False).str.strip()

# Non-numeric ranks become NaN and are then placed after the last ranked
# school (max rank + 1) so the column can be stored as integers.
df['Rank'] = pd.to_numeric(df['Rank'], errors='coerce')
maxrank = df['Rank'].max()
df['Rank'] = df['Rank'].fillna(maxrank + 1)
df['Rank'] = df['Rank'].astype(int)

In [None]:
# Save the ranking dataframe to a CSV file.
# NOTE: the index is written as well (index=False is not passed), so reading
# this file back with pd.read_csv yields an extra 'Unnamed: 0' column.
df.to_csv('USA_Uni_Rank.csv')

# Scraping to obtain State Names along with University/College Names

**Scraping from StudyAbroad.Shiksha.com to obtain state names along with university names**

(https://studyabroad.shiksha.com/usa/universities-10)

* Data Collection: Selenium must be used here again
* Data Storage: Scraped elements are stored in a dataframe for further processing.
* Data Merging: This data is merged with the (https://www.4icu.org/us/) scraped information to get as much information about state location

In [None]:
# Scraping from StudyAbroad.Shiksha.com to obtain States names along with University names

# Extracting from this website https://studyabroad.shiksha.com/usa/universities-10

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Machine-specific path to the ChromeDriver executable; adjust when running elsewhere
webdriver_location = r'C:\Users\Lenovo V15\Downloads\chromedriver_win32.exe'

# Create a Service object with the webdriver_location
service = Service(executable_path=webdriver_location)

# Pass the Service object to the webdriver.Chrome constructor
driver = webdriver.Chrome(service=service)

scraped_pages = []
base_url = "https://studyabroad.shiksha.com/usa/universities"
num_pages = 50

try:
    for i in range(num_pages):
        # The first page has no suffix; page k (k >= 2) is "<base_url>-k"
        url = base_url if i == 0 else f'{base_url}-{i + 1}'

        #print(f"Scraping page {i + 1}: {url}")

        driver.get(url)
        uni_html = driver.page_source
        scraped_pages.append(BeautifulSoup(uni_html, 'html.parser'))
finally:
    # Always close the browser, even if a page load raises part-way through;
    # otherwise the Chrome/ChromeDriver processes are left running.
    driver.quit()

# Double check the pages scraping to confirm that all have been scraped (50)
print(f"Number of pages scraped: {len(scraped_pages)}")

In [None]:
# Store final results in a dataframe after getting all the soups

# Accumulate the values from every page in two flat lists and build the
# dataframe once at the end. This avoids calling pd.concat inside the loop
# (repeated copying, and concatenating onto an empty frame), and it uses
# loop-local names so the ranking `df` and `school_divs` from the earlier
# cells are not overwritten.
universities = []
locations = []

for scrape in scraped_pages:
    for row in scrape.find_all('tr'):
        # Find the university name
        uni_name_tag = row.find('a', class_='font-15')

        # Find the city/state information (third cell of the row)
        cells = row.find_all('td')

        # Keep only rows that have both a name link and a location cell
        if uni_name_tag and len(cells) >= 3:
            universities.append(uni_name_tag.text.strip())
            locations.append(cells[2].text.strip())

# One row per university, in page order, with a fresh 0..n-1 index
df_final = pd.DataFrame({'School': universities, 'City': locations},
                        columns=['School', 'City'])

# Show the size (rows, columns) of the final DataFrame
df_final.shape

In [None]:
# Split the "City, State" text into two columns.
# n=1 limits the split to a single comma so the result never has more than
# two parts; an unlimited split produces a third column as soon as one
# location contains more than one comma, and the two-column assignment
# below then raises "Columns must be same length as key".
# NOTE(review): splitting from the right assumes the state is the final
# comma-separated component -- confirm against the scraped data.
df_final[['City', 'State']] = df_final['City'].str.rsplit(',', n=1, expand=True)
df_final['City'] = df_final['City'].str.strip()
df_final['State'] = df_final['State'].str.strip()

In [None]:
# Save the school / city / state table to a .csv file.
# NOTE: the index is written as well (index=False is not passed), so reading
# this file back with pd.read_csv yields an extra 'Unnamed: 0' column.
df_final.to_csv('USA_Uni_City_State_Location.csv')

**Combine the two dataframes**

In [None]:
# Read in the dfs once more and merge
df_USA_Uni_Rank = pd.read_csv("USA_Uni_Rank.csv")
df_USA_Uni_Location = pd.read_csv("USA_Uni_City_State_Location.csv")

# Left join: keep every ranked school, attach location data where the
# school name matches exactly.
df_UniRank_Location = pd.merge(df_USA_Uni_Rank, df_USA_Uni_Location, left_on='School', right_on='School', how='left')

# Both CSVs were saved with their index, which comes back as an
# 'Unnamed: 0' column and is suffixed '_x' / '_y' by the merge. The '_y'
# value is unique for every location row, so comparing ALL columns would
# never detect a school that was scraped more than once in the location
# table. Ignore that column when looking for duplicates.
dedupe_columns = [col for col in df_UniRank_Location.columns if col != 'Unnamed: 0_y']
df_UniRank_Location.drop_duplicates(subset=dedupe_columns, keep='first', inplace=True)

In [None]:
# Save the merged rank + location table; the dataframe index is written as the first column.
df_UniRank_Location.to_csv("USA_Uni_Rank_City_State_Index.csv")