https://medium.com/@andrejin.nagano/pair-trading-strategy-using-python-7787adb3d2e2

In [1]:
import pandas as pd
import yfinance as yf
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def get_historical_data(tickers, start=datetime(2020, 1, 1), end=datetime(2025, 1, 1)):
    """
    Iterates through the tickers list, getting all the closing prices
    for each ticker from a start date until a end date
    """

    data = pd.DataFrame()
    
    # Iterates through all the tickers
    for ticker in tickers:
        try:
            # Request the closing prices
            df = yf.Ticker(ticker).history(start=start, end=end)['Close']
            df.index = df.index.date
        except:
            pass
        
        data = pd.concat([data, df], axis=1)

    data.columns = tickers
    
    return data

# Getting a List of Companies from the Bovespa Index

As an example, I’ll be searching for pairs within the Brazilian stock market’s Bovespa Index. To get the tickers for each company, I’ll scrape the table on the Ibovespa’s Wikipedia page. We’ll also add the Bovespa index itself.

In [5]:
# pip install requests lxml
import requests
import pandas as pd

URL = "https://pt.wikipedia.org/wiki/Lista_de_companhias_citadas_no_Ibovespa"

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "pt-BR,pt;q=0.9,en;q=0.8",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

resp = requests.get(URL, headers=headers, timeout=20)
resp.raise_for_status()  # will raise if still forbidden

tables = pd.read_html(resp.text)  # parses all tables in the page

# pick the first table that contains 'Código' column
table = next(t for t in tables if any(c.strip().lower().startswith("código") for c in t.columns))

# clean and build tickers
codes = (
    table["Código"]
    .astype(str)
    .str.replace(r"\s+", "", regex=True)  # remove spaces
    .str.replace(r"[^A-Za-z0-9]", "", regex=True)  # drop footnote marks etc.
    .str.upper()
)

ibov_tickers = [f"{c}.SA" for c in codes if c and c != "NAN"]
ibov_tickers = sorted(set(ibov_tickers + ["BOVA11.SA"]))

print(len(ibov_tickers), ibov_tickers[:10])


65 ['ABEV3.SA', 'AZUL4.SA', 'B3SA3.SA', 'BBAS3.SA', 'BBDC3.SA', 'BBDC4.SA', 'BBSE3.SA', 'BEEF3.SA', 'BOVA11.SA', 'BPAC11.SA']


  tables = pd.read_html(resp.text)  # parses all tables in the page


Retrieving Historical Closing Prices from the Ibovespa Companies


In [6]:
df = get_historical_data(ibov_tickers)
df.dropna(axis=1, inplace=True)

HTTP Error 404: {"quoteSummary":{"result":null,"error":{"code":"Not Found","description":"Quote not found for symbol: BRFS3.SA"}}}
$BRFS3.SA: possibly delisted; no timezone found
$CCRO3.SA: possibly delisted; no timezone found
$CIEL3.SA: possibly delisted; no timezone found
$CRFB3.SA: possibly delisted; no timezone found
$ELET3.SA: possibly delisted; no timezone found
$ELET6.SA: possibly delisted; no timezone found
$EMBR3.SA: possibly delisted; no timezone found
$GOLL4.SA: possibly delisted; no timezone found
$JBSS3.SA: possibly delisted; no timezone found
$MRFG3.SA: possibly delisted; no timezone found
$NTCO3.SA: possibly delisted; no timezone found


show bist 100

In [8]:
# If needed in a fresh environment:
# !pip install yfinance pandas matplotlib --quiet

import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt

# --- Parameters ---
INDEX_SYMBOL = "^XU100"   # alternative page sometimes exists as "XU100.IS"

# Example 10 BIST-100 tickers (edit this list if you want different ones)
tickers_10 = [
    "THYAO.IS",  # Turkish Airlines
    "BIMAS.IS",  # BIM
    "ASELS.IS",  # Aselsan
    "TUPRS.IS",  # Tupras
    "EREGL.IS",  # Erdemir
    "AKBNK.IS",  # Akbank
    "ISCTR.IS",  # Isbank
    "GARAN.IS",  # Garanti
    "YKBNK.IS",  # Yapi Kredi
    "KRDMD.IS"   # Kardemir D
]

# Date range
period = "1y"        # last 1 year
interval = "1d"      # daily bars

# --- Download data ---
idx = yf.download(INDEX_SYMBOL, period=period, interval=interval)["Adj Close"].rename("XU100")
prices = yf.download(tickers_10, period=period, interval=interval)["Adj Close"]

# Align dates and drop rows where all series are NaN
df = pd.concat([idx, prices], axis=1).dropna(how="all")

# --- Plot 1: Raw index level ---
plt.figure(figsize=(10, 4))
plt.plot(df.index, df["XU100"])
plt.title("BIST 100 (^XU100) – Adj Close (last 1y)")
plt.xlabel("Date")
plt.ylabel("Index Level (Adj Close)")
plt.grid(True)
plt.tight_layout()
plt.show()

# --- Plot 2: 10 sample tickers normalized to 100 (to compare performance) ---
# Keep only columns that actually have data over the aligned index
norm = prices.loc[df.index].dropna(axis=1, how="all").copy()

# If the very first row has NaNs for some tickers, shift the start to the first fully-available row
first_valid = norm.apply(pd.Series.first_valid_index)
if not first_valid.isnull().all():
    # Choose the latest of first-valid indices so all selected tickers start together
    start_date = max([d for d in first_valid.dropna()])
    norm = norm.loc[start_date:]

# Drop columns still missing the start value
norm = norm.dropna(axis=1, how="any").copy()

# Normalize
norm = norm / norm.iloc[0] * 100

plt.figure(figsize=(12, 6))
for col in norm.columns[:10]:  # ensure max 10 lines
    plt.plot(norm.index, norm[col], label=col)

plt.title("Sample 10 BIST-100 Stocks – Normalized (100 = first common date)")
plt.xlabel("Date")
plt.ylabel("Normalized Adj Close")
plt.grid(True)
plt.legend(ncol=2, fontsize=9)
plt.tight_layout()
plt.show()


$^XU100: possibly delisted; no price data found  (period=1y)
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['^XU100']: possibly delisted; no price data found  (period=1y)
[*********************100%***********************]  10 of 10 completed


KeyError: 'Adj Close'