In [None]:
import yfinance as yf
import pandas as pd
import time

# Load Excel file
df = pd.read_excel("dividend_update6.11.xlsx", sheet_name="dividend_update6.11")

def clean_ticker(ticker):
    ticker = str(ticker).replace(" ", "")
    if ticker.upper().endswith("HK"):
        code = ticker[:-2].strip()
        padded_code = code.zfill(4)
        return f"{padded_code}.HK"
    elif ticker.upper().endswith("KS"):
        return f"{ticker[:-2]}.KS"
    elif ticker.upper().endswith("TT"):
        return f"{ticker[:-2]}.TW"
    elif ticker.upper().endswith("US"):
        return ticker[:-2]
    else:
        return ticker

df["Cleaned_Ticker"] = df["ticker"].apply(clean_ticker)

ticker_dividends = {}
ticker_names = {}
start_date = "2017-01-01"
end_date = "2025-06-01"

for ticker in df["Cleaned_Ticker"].unique():
    try:
        stock = yf.Ticker(ticker)
        time.sleep(1)
        # Get company name (shortName preferred, fallback to longName, else ticker)
        info = stock.info
        name = info.get("shortName") or info.get("longName") or ticker
        ticker_names[ticker] = name

        # Get dividends and filter by date
        dividends = stock.dividends
        if not dividends.empty:
            dividends = dividends[(dividends.index >= start_date) & (dividends.index <= end_date)]
            if not dividends.empty:
                sorted_dividends = sorted(
                    [(date.strftime("%Y-%m-%d"), round(amount, 4))
                     for date, amount in dividends.items()],
                    key=lambda x: x[0]
                )
                ticker_dividends[ticker] = sorted_dividends
    except Exception as e:
        print(f"Error processing {ticker}: {str(e)}")
        continue

max_dividends = max(len(d) for d in ticker_dividends.values()) if ticker_dividends else 0
columns = ["Ticker", "Name"]
for i in range(1, max_dividends + 1):
    columns.extend([f"Dividend_{i}_Date", f"Dividend_{i}_Amount"])

result_data = []
for ticker, dividends in ticker_dividends.items():
    row = [ticker, ticker_names.get(ticker, "")]
    for date, amount in dividends:
        row.extend([date, amount])
    row += [None] * (max_dividends * 2 - len(dividends) * 2)
    result_data.append(row)

if result_data:
    result_df = pd.DataFrame(result_data, columns=columns)
    result_df.to_csv("ticker_dividends_wide.csv", index=False)
    print("Successfully saved to ticker_dividends_wide.csv")
else:
    print("No dividend data found.")


$6837.HK: possibly delisted; no timezone found
$SFUN: possibly delisted; no timezone found


Error processing YY: HTTP Error 404: 
Error processing 8299.TW: HTTP Error 404: 


$WUBA: possibly delisted; no timezone found
$CTRP: possibly delisted; no timezone found


Error processing 5347.TW: HTTP Error 404: 


$FTR: possibly delisted; no timezone found


Error processing 329180..KS: HTTP Error 404: 
Successfully saved to ticker_dividends_wide.csv


In [5]:
import pandas as pd
from rapidfuzz import process, fuzz
from datetime import datetime

# ==================================================================
# 1. Load and Prepare Data
# ==================================================================
# Load security holdings
df_holdings = pd.read_csv("Greater_China_2023_Track_Record.csv", 
                         usecols=['Product', 'Currency', 'Contracts', 'Last Updated', 'Exchange Rate'],
                         parse_dates=['Last Updated'])

# Load dividend data
df_dividends = pd.read_csv("new_ticker_dividends.csv")

existing_keys = set()
for _, row in df_dividends.iterrows():
    for i in range(1, 7):
        date_col = f'Dividend_{i}_Date'
        amt_col = f'Dividend_{i}_Amount'
        if pd.notna(row[date_col]) and pd.notna(row[amt_col]):
            key = (
                row['Name'].strip().upper(),
                pd.to_datetime(row[date_col]),
                float(row[amt_col])
            )
            existing_keys.add(key)

# ==================================================================
# 2. Clean and Transform Dividend Data
# ==================================================================
# Melt dividend columns to long format
dividend_cols = [col for col in df_dividends.columns if col.startswith('Dividend_')]
df_dividends_long = pd.melt(
    df_dividends,
    id_vars=['Ticker', 'Name'],
    value_vars=dividend_cols,
    var_name='Dividend_Type',
    value_name='Value'
)

# Separate dates and amounts
df_dates = df_dividends_long[df_dividends_long['Dividend_Type'].str.contains('Date')]
df_amounts = df_dividends_long[df_dividends_long['Dividend_Type'].str.contains('Amount')]

# Merge clean dividend data
df_dividends_clean = pd.merge(
    df_dates.rename(columns={'Value': 'Ex-Date'}),
    df_amounts.rename(columns={'Value': 'Dividend'}),
    left_on=['Ticker', 'Name', df_dates.index],
    right_on=['Ticker', 'Name', df_amounts.index]
).dropna()

# Convert dates and filter
df_dividends_clean['Ex-Date'] = pd.to_datetime(df_dividends_clean['Ex-Date'])
df_dividends_clean = df_dividends_clean[df_dividends_clean['Ex-Date'] >= '2024-01-01']

# ==================================================================
# 3. Match Security Names
# ==================================================================
def fuzzy_match(name, choices):
    match, score, _ = process.extractOne(
        name, choices, 
        scorer=fuzz.token_set_ratio,
        score_cutoff=85
    )
    return match if score >= 85 else None

security_names = df_holdings['Product'].unique()
df_dividends_clean['Matched Security'] = df_dividends_clean['Name'].apply(
    lambda x: fuzzy_match(x, security_names)
)

# ==================================================================
# 4. Calculate Holdings at Dividend Dates
# ==================================================================
# Create holding timeline for each security
def get_holdings_at_date(group):
    return group.set_index('Last Updated')['Contracts'].resample('D').ffill()

holdings_timeline = df_holdings.groupby('Product').apply(get_holdings_at_date).reset_index()

df_dividends_clean = df_dividends_clean.rename(columns={'Matched Security': 'Product'})

# Then proceed with the merge_asof
merged = pd.merge_asof(
    df_dividends_clean.sort_values('Ex-Date'),
    holdings_timeline.sort_values('Last Updated').rename(columns={'Last Updated': 'Holding Date'}),
    left_on='Ex-Date',
    right_on='Holding Date',
    by='Product',  # Now both DataFrames have 'Product'
    direction='backward'
)

merged['Key'] = merged.apply(
    lambda x: (
        str(x['Name']).strip().upper(),
        pd.to_datetime(x['Ex-Date']),
        float(x['Dividend'])
    ),
    axis=1
)
merged['Exists'] = merged['Key'].isin(existing_keys)

# ==================================================================
# 5. Currency Conversion and Dividend Calculation
# ==================================================================
# Get currency info from holdings
currency_map = df_holdings.drop_duplicates('Product').set_index('Product')['Currency']
exchange_map = df_holdings.drop_duplicates('Product').set_index('Product')['Exchange Rate']

currency_df = df_holdings[['Product', 'Currency']].drop_duplicates()
exchange_df = df_holdings[['Product', 'Exchange Rate']].rename(columns={'Exchange Rate': 'FX Rate'}).drop_duplicates()

# Merge using columns
merged = merged.merge(currency_df, on='Product', how='left').merge(exchange_df, on='Product', how='left')

# Convert dividends to holding currency
merged['Converted Dividend'] = merged['Dividend'] * merged['FX Rate']
merged.loc[merged['Currency'] == 'USD', 'Converted Dividend'] = merged['Dividend']

# Calculate dividend amount
merged['Dividend Payment'] = merged['Contracts'] * merged['Converted Dividend']

# ==================================================================
# 6. Identify Missing Entries
# ==================================================================
# Create composite key for comparison
merged['Key'] = merged.apply(lambda x: f"{x['Product']}|{x['Ex-Date']}|{x['Dividend']}", axis=1)

# Get existing entries from original data
existing_keys = set()
for _, row in df_dividends.iterrows():
    for i in range(1,7):
        date_col = f'Dividend_{i}_Date'
        amt_col = f'Dividend_{i}_Amount'
        if pd.notna(row[date_col]) and pd.notna(row[amt_col]):
            key = f"{row['Name']}|{row[date_col]}|{row[amt_col]}"
            existing_keys.add(key)

merged['Exists'] = merged['Key'].isin(existing_keys)

# ==================================================================
# 7. Generate Final Output
# ==================================================================
output_cols = [
    'Product', 'Ex-Date', 'Dividend', 'Currency',
    'Converted Dividend', 'Contracts', 'Dividend Payment'
]

missing_dividends = merged[~merged['Exists']][output_cols]
missing_dividends.to_csv('missing_dividend_entries.csv', index=False)

print(f"Found {len(missing_dividends)} missing dividend entries")
print("Output saved to missing_dividend_entries.csv")


  holdings_timeline = df_holdings.groupby('Product').apply(get_holdings_at_date).reset_index()


Found 0 missing dividend entries
Output saved to missing_dividend_entries.csv
