# Interview TW Crawler v1.1

Updated on 2024/08/07, by Xiang-Yi Huang

### Step 0: Install Required Packages

In [None]:
!pip install selenium
!pip install beautifulsoup4
!pip install fake_useragent
!pip install webdriver_manager
!pip install undetected-chromedriver

### Step 1: Input Your Google Account
You need to have a VIP account for this website.

In this version, please note that you need to manually verify your Google account before the crawl starts: after the password is submitted, the script waits 60 seconds for you to complete the verification.

In [None]:
# TODOs: Try to avoid verification.
# The credentials are read from the environment variables INTERVIEW_TW_GMAIL
# and INTERVIEW_TW_PASSWORD when they are set, so that real secrets do not
# have to be saved inside the notebook. When a variable is not set, the
# placeholder below is used and must be edited by hand.
import os

gmail = os.environ.get("INTERVIEW_TW_GMAIL", 'xxxx@gmail.com')
password = os.environ.get("INTERVIEW_TW_PASSWORD", 'xxxxxxxx')

### Step 2: Let's Crawl !
In this version, please enter the URL slug of every company you want to crawl, i.e. the part that follows `https://interview.tw/c/` in the company page address.

In [None]:
# TODOs: Get all the company URLs on the website.
# Each entry is the last path segment of a company page
# (https://interview.tw/c/<entry>). The values below are just an example.
company_url_list = [
    "A1St",
    "0noW8",
    "ECKNd",
]

In [None]:
# TODO: Use a fake user agent and other crawling techniques to avoid detection.
# TODO: Try to avoid robot detection. Two things can happen once you are detected: 1. You are simply asked whether you are a robot. 2. You are asked to solve a CAPTCHA.
# TODO: Make the crawler customizable so that you can choose which data to crawl.
import undetected_chromedriver as uc

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import time
import pandas as pd

# Other resources: https://github.com/seleniumbase/SeleniumBase
# Initialize undetected-chromedriver
# This single module-level driver is shared by google_login() and
# interview_crawler() below (both reference the global name `driver`), and it
# is closed with driver.quit() once crawling ends or login fails.
driver = uc.Chrome()
        
def google_login(gmail, password, verification_wait=60):
    """Sign in through the "Continue with Google" button of the current page.

    The module-level ``driver`` must already be showing the login page. The
    function clicks the Google button, types the e-mail and the password, and
    then pauses so that the user can complete Google's verification by hand.

    Args:
        gmail: Google account e-mail address typed into the e-mail field.
        password: Password typed into the password field.
        verification_wait: Seconds to sleep after submitting the password,
            giving the user time to finish the manual verification.

    Returns:
        True when the browser ended up on an interview.tw host, False
        otherwise. On failure the driver is quit before returning.

    Raises:
        Exception: Whatever Selenium raised when the Google button (under
            both labels), the e-mail field or the password field could not be
            found within 5 seconds.
    """
    from urllib.parse import urlparse

    # The button label depends on the site language, so try the Chinese label
    # first and fall back to the English one. `Exception` (not a bare except)
    # is caught so that Ctrl-C / kernel interrupts are not swallowed.
    last_error = None
    for label in ("使用 Google 繼續", "Continue with Google"):
        try:
            google_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, f"//span[text()='{label}']"))
            )
            google_button.click()
            break
        except Exception as exc:
            last_error = exc
    else:
        # Neither label could be clicked: surface the last Selenium error.
        raise last_error

    # Wait for Google login page to load and enter email
    email_input = WebDriverWait(driver, 5).until(
        EC.visibility_of_element_located((By.XPATH, "//input[@type='email']"))
    )
    email_input.send_keys(gmail)
    email_input.send_keys('\n')  # Press enter

    # Wait for password page to load and enter password
    password_input = WebDriverWait(driver, 5).until(
        EC.visibility_of_element_located((By.XPATH, "//input[@type='password']"))
    )
    password_input.send_keys(password)
    password_input.send_keys('\n')  # Press enter

    # The user has `verification_wait` seconds to verify the Google account
    # manually in the browser window.
    time.sleep(verification_wait)

    # Compare the host name rather than searching the whole URL: a URL on
    # another host can still contain "interview.tw" in its query string
    # (for example as a redirect target), which would be a false positive.
    host = urlparse(driver.current_url).hostname or ""
    if host == "interview.tw" or host.endswith(".interview.tw"):
        print("Successfully logged in and redirected to interview.tw page")
        return True

    print("Failed to log in or redirect to the expected page")
    print("Stop crawling")
    driver.quit()
    return False
        
def _empty_record(company_name):
    """Return one output row with every column present and all fields blank."""
    return {
        "公司名稱": company_name,
        "職位": "",
        "工作地點": "",
        "分享時間": "",
        "面試時間": "",
        "面試評價": "",
        "面試狀態": "",
        "面試難度": "",
        "面試過程": "",
        "面試問答": "",
        "面試建議": "",
        "標籤": "",
    }


def _fill_record_from_spans(record, span_texts):
    """Copy the interview fields found in ``span_texts`` into ``record`` in place.

    Fields are written one by one, so when an IndexError/ValueError is raised
    part-way through, the values stored so far are kept in ``record``.
    """
    # Ensure text list has enough elements
    if len(span_texts) < 12:
        print("Warning: span_texts length is less than expected.")
        return

    # Several fields are located relative to the "面試時間" label.
    anchor = span_texts.index("面試時間")

    record["職位"] = span_texts[0]
    record["工作地點"] = span_texts[1]
    record["分享時間"] = span_texts[3].replace(" 分享", "").strip()
    record["面試時間"] = span_texts[anchor - 1]
    record["面試評價"] = span_texts[anchor + 1]
    status = span_texts[anchor + 4]
    record["面試狀態"] = status
    record["面試難度"] = span_texts[anchor + 6]

    # Optional sections: only read them when their heading is present.
    if "面試過程" in span_texts:
        record["面試過程"] = span_texts[span_texts.index("面試過程") + 1]
    if "面試問答" in span_texts and "面試建議" in span_texts:
        record["面試問答"] = "\n".join(
            span_texts[span_texts.index("面試問答") + 1: span_texts.index("面試建議")]
        )
    # The status text is searched again from index 12 onwards; that later
    # occurrence marks the end of the advice and the start of the tags.
    if "面試建議" in span_texts and status in span_texts:
        record["面試建議"] = "\n".join(
            span_texts[span_texts.index("面試建議") + 1: span_texts.index(status, 12)]
        )
    if status in span_texts:
        record["標籤"] = ",".join(
            span_texts[span_texts.index(status, 12): span_texts.index("分享") - 2]
        )


def _element_appears(xpath, timeout=5):
    """Return True if an element matching ``xpath`` shows up within ``timeout`` seconds."""
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        )
        return True
    except Exception:
        # Not found in time (or the lookup failed): treat as "not present".
        return False


def interview_crawler(company_url, page_number, data):
    """Crawl the interview posts of one company, page by page.

    For every page the VIP unlock buttons are clicked, then one row per
    unlocked post is appended to ``data``. Crawling stops when the page cannot
    be loaded, shows a 404 or "no filter results" marker, or has no VIP
    buttons left.

    Args:
        company_url: Company identifier used in ``https://interview.tw/c/<id>``.
        page_number: Page to start from.
        data: List that receives one dict per crawled post (mutated in place).

    Returns:
        A pandas DataFrame built from ``data``. It is returned on every exit
        path, including the 404 / no-results exits.
    """
    # Get the company name
    try:
        driver.get(f"https://interview.tw/c/{company_url}")
        company_name = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, "//h1[@class='fz-headline-sm fz-tit']"))
        ).text
        print(f"Company name now: {company_name}")
    except Exception as e:
        company_name = "NONE"
        print(f"Error retrieving company name: {e}")

    while True:
        # Enter company page
        try:
            driver.get(f"https://interview.tw/c/{company_url}?page={page_number}&sort=newest")
        except Exception:
            print(f"No page {page_number}, ending crawler for {company_name}")
            break

        # Check for 404 error or no filter results
        if _element_appears("//span[text()='404']"):
            print(f"Page returned 404 error, ending crawler for {company_name}")
            break
        if _element_appears("//div[text()='沒有篩選結果']"):
            print(f"Page shows no filter results, ending crawler for {company_name}")
            break

        # Look for VIP buttons and click each of them to unlock its post
        try:
            vip_buttons = WebDriverWait(driver, 5).until(
                EC.presence_of_all_elements_located((By.XPATH, "//span[text()='使用 VIP 解鎖']"))
            )
            for i, button in enumerate(vip_buttons):
                button.click()
                time.sleep(10)

                # Usually happens after clicking 3 VIP buttons
                if i == 2:
                    try:
                        reconsider_button = WebDriverWait(driver, 5).until(
                            EC.element_to_be_clickable((By.XPATH, "//span[text()='再考慮']"))
                        )
                        reconsider_button.click()
                        time.sleep(10)
                    except Exception:
                        print("No '再考慮' button")
            total_vip_buttons = len(vip_buttons)
        except Exception:
            print(f"No more VIP buttons or VIP articles to fetch, ending crawler for {company_name}")
            break

        # Crawl the posts on this page; only the first `total_vip_buttons`
        # items are read, i.e. as many as there were unlock buttons.
        try:
            experience_items = driver.find_elements(By.XPATH, "//div[@class='iw_experience-item--body']")
            for item in experience_items[: total_vip_buttons]:
                record = _empty_record(company_name)

                # A parsing problem must not lose the row: whatever was
                # extracted before the error is still appended.
                try:
                    span_texts = [span.text for span in item.find_elements(By.TAG_NAME, "span")]
                    _fill_record_from_spans(record, span_texts)
                except IndexError as e:
                    print(f"IndexError occurred: {e}")
                except ValueError as e:
                    print(f"ValueError occurred: {e}")
                except Exception as e:
                    print(f"An unexpected error occurred: {e}")
                    print("Unable to crawl data!")

                # Add extracted data to the list
                data.append(record)

            page_number += 1
            print(f"Moving to page {page_number}")
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            print("Unable to crawl data!")

    # Build the frame here so that every `break` above returns a result.
    return pd.DataFrame(data)

# Start crawling
# The result frame is created before the try block so that `df_all` is always
# defined, even when opening the page or logging in raises.
columns = ["公司名稱", "職位", "工作地點", "分享時間", "面試時間", "面試評價", "面試狀態", "面試難度", "面試過程", "面試問答", "面試建議", "標籤"]
df_all = pd.DataFrame(columns = columns)
crawl_completed = False
try:
    # Open the specified login page
    URL = "https://interview.tw/auth/login?redirect=https%3A%2F%2Finterview.tw"
    driver.get(URL)

    # Login with Google account
    google_login(gmail, password)

    # Enter company URLs, please note that if you have already unlocked VIP articles for a company, there might be an error. Please try a new company first.
    for company_url in company_url_list:
        df = interview_crawler(company_url = company_url, page_number = 1, data = [])
        df_all = pd.concat([df_all, df], ignore_index = True)
    crawl_completed = True
finally:
    try:
        # Close the browser when done
        driver.quit()
    finally:
        # Save the results. After an interrupted run the rows collected so far
        # are still written, but an empty frame is not, so that a failed run
        # does not replace an earlier CSV with a header-only file.
        if crawl_completed or not df_all.empty:
            df_all.to_csv("Interview_Data.csv", index = False, encoding = "utf-8-sig")

In [None]:
# Show the combined results of all crawled companies as the cell output.
df_all