From 9c5f004cc2c0e027cd66dfd599aed40f5764a6f9 Mon Sep 17 00:00:00 2001
From: kzhdev
Date: Thu, 6 Nov 2025 11:05:56 -0600
Subject: [PATCH 1/4] Fix 403 Forbidden error; Remove FutureWarning:

---
 scripts/data_collector/us_index/collector.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/scripts/data_collector/us_index/collector.py b/scripts/data_collector/us_index/collector.py
index 50278d11ee..16a4082620 100644
--- a/scripts/data_collector/us_index/collector.py
+++ b/scripts/data_collector/us_index/collector.py
@@ -7,6 +7,7 @@
 from pathlib import Path
 from concurrent.futures import ThreadPoolExecutor
 from typing import List
+from io import StringIO
 
 import fire
 import requests
@@ -112,7 +113,10 @@ def calendar_list(self) -> List[pd.Timestamp]:
         return _calendar_list
 
     def _request_new_companies(self) -> requests.Response:
-        resp = requests.get(self._target_url, timeout=None)
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        resp = requests.get(self._target_url, timeout=None, headers=headers)
         if resp.status_code != 200:
             raise ValueError(f"request error: {self._target_url}")
 
@@ -128,7 +132,7 @@ def set_default_date_range(self, df: pd.DataFrame) -> pd.DataFrame:
     def get_new_companies(self):
         logger.info(f"get new companies {self.index_name} ......")
         _data = deco_retry(retry=self._request_retry, retry_sleep=self._retry_sleep)(self._request_new_companies)()
-        df_list = pd.read_html(_data.text)
+        df_list = pd.read_html(StringIO(_data.text))
         for _df in df_list:
             _df = self.filter_df(_df)
             if (_df is not None) and (not _df.empty):
@@ -226,7 +230,13 @@ def bench_start_date(self) -> pd.Timestamp:
     def get_changes(self) -> pd.DataFrame:
         logger.info(f"get sp500 history changes......")
         # NOTE: may update the index of the table
-        changes_df = pd.read_html(self.WIKISP500_CHANGES_URL)[-1]
+        # Add headers to avoid 403 Forbidden error from Wikipedia
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        response = requests.get(self.WIKISP500_CHANGES_URL, headers=headers)
+        response.raise_for_status()
+        changes_df = pd.read_html(StringIO(response.text))[-1]
         changes_df = changes_df.iloc[:, [0, 1, 3]]
         changes_df.columns = [self.DATE_FIELD_NAME, self.ADD, self.REMOVE]
         changes_df[self.DATE_FIELD_NAME] = pd.to_datetime(changes_df[self.DATE_FIELD_NAME])

From 6116fe9fe6186d1c2369e7fdb7151a1601ed8d4d Mon Sep 17 00:00:00 2001
From: kzhdev
Date: Mon, 10 Nov 2025 09:47:17 -0600
Subject: [PATCH 2/4] use fake_useragent

---
 scripts/data_collector/us_index/collector.py     | 6 ++++--
 scripts/data_collector/us_index/requirements.txt | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/scripts/data_collector/us_index/collector.py b/scripts/data_collector/us_index/collector.py
index 16a4082620..5020a30a81 100644
--- a/scripts/data_collector/us_index/collector.py
+++ b/scripts/data_collector/us_index/collector.py
@@ -14,6 +14,7 @@
 import pandas as pd
 from tqdm import tqdm
 from loguru import logger
+from fake_useragent import UserAgent
 
 
 CUR_DIR = Path(__file__).resolve().parent
@@ -52,6 +53,7 @@ def __init__(
         )
 
         self._target_url = f"{WIKI_URL}/{WIKI_INDEX_NAME_MAP[self.index_name.upper()]}"
+        self._ua = UserAgent()
 
     @property
     @abc.abstractmethod
@@ -114,7 +116,7 @@ def calendar_list(self) -> List[pd.Timestamp]:
 
     def _request_new_companies(self) -> requests.Response:
         headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            'User-Agent': self._ua.random
         }
         resp = requests.get(self._target_url, timeout=None, headers=headers)
         if resp.status_code != 200:
@@ -232,7 +234,7 @@ def get_changes(self) -> pd.DataFrame:
         # NOTE: may update the index of the table
         # Add headers to avoid 403 Forbidden error from Wikipedia
         headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            'User-Agent': self._ua.random
         }
         response = requests.get(self.WIKISP500_CHANGES_URL, headers=headers)
         response.raise_for_status()
diff --git a/scripts/data_collector/us_index/requirements.txt b/scripts/data_collector/us_index/requirements.txt
index 1d846b504f..ad2c8db226 100644
--- a/scripts/data_collector/us_index/requirements.txt
+++ b/scripts/data_collector/us_index/requirements.txt
@@ -3,3 +3,4 @@ requests
 pandas
 lxml
 loguru
+fake-useragent

From 7e2336935a0006c6226624002b75375013ee7b4a Mon Sep 17 00:00:00 2001
From: kzhdev
Date: Wed, 12 Nov 2025 10:26:17 -0600
Subject: [PATCH 3/4] Fix lint format error

---
 scripts/data_collector/us_index/collector.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/scripts/data_collector/us_index/collector.py b/scripts/data_collector/us_index/collector.py
index 5020a30a81..9c5a111fb3 100644
--- a/scripts/data_collector/us_index/collector.py
+++ b/scripts/data_collector/us_index/collector.py
@@ -115,9 +115,7 @@ def calendar_list(self) -> List[pd.Timestamp]:
         return _calendar_list
 
     def _request_new_companies(self) -> requests.Response:
-        headers = {
-            'User-Agent': self._ua.random
-        }
+        headers = {"User-Agent": self._ua.random}
         resp = requests.get(self._target_url, timeout=None, headers=headers)
         if resp.status_code != 200:
             raise ValueError(f"request error: {self._target_url}")
@@ -233,9 +231,7 @@ def get_changes(self) -> pd.DataFrame:
         logger.info(f"get sp500 history changes......")
         # NOTE: may update the index of the table
         # Add headers to avoid 403 Forbidden error from Wikipedia
-        headers = {
-            'User-Agent': self._ua.random
-        }
+        headers = {"User-Agent": self._ua.random}
         response = requests.get(self.WIKISP500_CHANGES_URL, headers=headers)
         response.raise_for_status()
         changes_df = pd.read_html(StringIO(response.text))[-1]

From b9431451a2c248abb8db68427ec6ad56aeeffbe5 Mon Sep 17 00:00:00 2001
From: kzhdev
Date: Thu, 13 Nov 2025 09:46:37 -0600
Subject: [PATCH 4/4] Add timeout to fix pylint error

---
 scripts/data_collector/us_index/collector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/data_collector/us_index/collector.py b/scripts/data_collector/us_index/collector.py
index 9c5a111fb3..800560d2e1 100644
--- a/scripts/data_collector/us_index/collector.py
+++ b/scripts/data_collector/us_index/collector.py
@@ -232,7 +232,7 @@ def get_changes(self) -> pd.DataFrame:
         # NOTE: may update the index of the table
         # Add headers to avoid 403 Forbidden error from Wikipedia
         headers = {"User-Agent": self._ua.random}
-        response = requests.get(self.WIKISP500_CHANGES_URL, headers=headers)
+        response = requests.get(self.WIKISP500_CHANGES_URL, headers=headers, timeout=None)
         response.raise_for_status()
         changes_df = pd.read_html(StringIO(response.text))[-1]
         changes_df = changes_df.iloc[:, [0, 1, 3]]
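
Reviewer note: below is a minimal standalone sketch of the fetch-and-parse pattern these four
patches converge on, assuming only that requests, pandas, and fake-useragent are installed.
The URL is an example Wikipedia page standing in for WIKISP500_CHANGES_URL / self._target_url,
and the finite timeout is an illustrative choice (the patch itself passes timeout=None); it is
not a drop-in replacement for the patched collector code.

    from io import StringIO

    import pandas as pd
    import requests
    from fake_useragent import UserAgent

    # Example target; assumed stand-in for WIKISP500_CHANGES_URL in the patched collector.
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

    # A randomized, browser-like User-Agent avoids the 403 Forbidden that Wikipedia
    # returns to default HTTP-client user agents.
    headers = {"User-Agent": UserAgent().random}

    # An explicit timeout argument satisfies pylint's missing-timeout check;
    # a finite value is used here, while the patch passes timeout=None.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    # Wrapping the HTML in StringIO avoids the pandas FutureWarning about passing
    # literal HTML to read_html, and keeps the HTTP request under our own control.
    tables = pd.read_html(StringIO(response.text))
    print(f"parsed {len(tables)} tables")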