In [2]:
#10_Ks only... about 6 minutes with filedate metadata
import os
import zipfile
import pandas as pd
import shutil
from collections import defaultdict
from tqdm import tqdm

# --- Setup ---
zip_folder = "."  # directory that is scanned for input .zip archives
output_root = "sample_10k_data3"  # extraction target folder
sp500_file = "sp500_companies.csv"  # local cache of the S&P 500 constituents table

# Make sure the extraction target exists before anything is written into it
os.makedirs(output_root, exist_ok=True)

# Fetch the constituents table (first HTML table of a pinned Wikipedia
# revision) only when no cached CSV is present
if not os.path.exists(sp500_file):
    url = 'https://en.wikipedia.org/w/index.php?title=List_of_S%26P_500_companies&oldid=1130173030'
    sp500_table = pd.read_html(url)[0]
    sp500_table.to_csv(sp500_file, index=False)

firms = pd.read_csv(sp500_file)

# Upper-cased ticker symbols of the constituents (tickers, not CIKs)
tickers_sp500 = set(firms['Symbol'].str.upper())

# --- Parameters ---
# Accumulators filled by the extraction loop further down
cik_files = defaultdict(list)  # CIK -> list of extracted file paths
metadata = []  # one (CIK, Year, Quarter, Filing_Date, Filename) tuple per filing

# Helper function to extract CIK from filename
def extract_cik_from_member(member):
    """Return the CIK embedded in an archive member name, or None.

    The CIK is taken to be the text between the first ``edgar_data_`` marker
    and the next underscore (bounded by the following marker, if any).

    Args:
        member: Archive member name (string).

    Returns:
        The CIK as a string of digits, or None when the marker is absent or
        the text following it is not purely numeric.
    """
    # Drop whatever precedes the first marker; `after_marker` holds the rest.
    _, *after_marker = member.split("edgar_data_")
    if not after_marker:
        return None

    candidate = after_marker[0].partition("_")[0]
    if candidate.isdigit():
        return candidate
    return None

# --- Start Processing ---
# The final archive (f"{output_root}.zip") is written to the working directory,
# which is also where input archives are looked up when zip_folder is ".".
# Exclude it from the inputs: otherwise a re-run would re-ingest the previous
# run's output (or fail on it if archive creation was interrupted).
# Sorting makes the processing order independent of os.listdir ordering.
output_archive_path = os.path.abspath(f"{output_root}.zip")
zip_files = sorted(
    f for f in os.listdir(zip_folder)
    if f.endswith(".zip")
    and os.path.abspath(os.path.join(zip_folder, f)) != output_archive_path
)

# Destinations already recorded during this run, so the same filing reached
# through two archives is logged only once.
recorded_paths = set()

for zip_filename in tqdm(zip_files, desc="Processing zip files"):
    zip_path = os.path.join(zip_folder, zip_filename)

    with zipfile.ZipFile(zip_path, 'r') as zipf:
        for member in zipf.namelist():
            # Keep only .txt members whose path mentions "10-K"
            if not member.endswith(".txt"):
                continue
            if "10-K" not in member.upper():
                continue

            # Extract CIK (returns None when the "edgar_data_" marker is
            # missing or is not followed by digits)
            cik = extract_cik_from_member(member)
            if cik is None:
                continue

            # The first two path components of the member are used as
            # year and quarter; members with a shallower path are skipped
            path_parts = os.path.normpath(member).split(os.sep)
            if len(path_parts) < 3:
                continue
            year, quarter = path_parts[0], path_parts[1]

            # Destination directory
            dest_dir = os.path.join(output_root, cik, year, quarter)
            os.makedirs(dest_dir, exist_ok=True)

            filename_only = os.path.basename(member)
            dest_path = os.path.join(dest_dir, filename_only)

            if dest_path in recorded_paths:
                continue

            # Copy only when the file is not on disk yet. The copy goes to a
            # temporary name and is renamed on success, so an interrupted run
            # never leaves a truncated file that later runs would treat as
            # complete.
            if not os.path.exists(dest_path):
                tmp_path = dest_path + ".part"
                try:
                    with zipf.open(member) as src_file, open(tmp_path, "wb") as dest_file:
                        shutil.copyfileobj(src_file, dest_file)
                    os.replace(tmp_path, dest_path)
                finally:
                    if os.path.exists(tmp_path):
                        os.remove(tmp_path)

            # Record the filing whether it was copied now or by an earlier
            # run; otherwise re-running would overwrite the metadata CSV with
            # only the newly copied files.
            recorded_paths.add(dest_path)
            cik_files[cik].append(dest_path)

            # Metadata: CIK, Year, Quarter, Filing Date, Filename.
            # The filing date is the filename text before the first underscore.
            filing_date = filename_only.split("_")[0]
            metadata.append((cik, year, quarter, filing_date, filename_only))

print(f"✅ Done! Extracted filings into '{output_root}'.")

# --- Save Metadata ---
meta_df = pd.DataFrame(metadata, columns=["CIK", "Year", "Quarter", "Filing_Date", "Filename"])
meta_df.to_csv("extracted_metadata.csv", index=False)

print(f"✅ Metadata saved to 'extracted_metadata.csv'.")

# --- Zip the entire output folder ---
zip_filename = f"{output_root}.zip"

print(f"📦 Creating archive '{zip_filename}'...")

with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for foldername, subfolders, filenames in os.walk(output_root):
        for filename in filenames:
            file_path = os.path.join(foldername, filename)
            # Store paths relative to output_root so the archive has no
            # top-level output_root folder
            arcname = os.path.relpath(file_path, output_root)
            zipf.write(file_path, arcname)

print(f"✅ Archive created: '{zip_filename}'")

Processing zip files: 100%|██████████| 10/10 [13:20<00:00, 80.05s/it]


✅ Done! Extracted filings into 'sample_10k_data3'.
✅ Metadata saved to 'extracted_metadata.csv'.
📦 Creating archive 'sample_10k_data3.zip'...


KeyboardInterrupt: 

In [None]:
import os
import pandas as pd
import numpy as np
import shutil
import zipfile
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import statsmodels.formula.api as smf
import statsmodels.api as sm
from concurrent.futures import ThreadPoolExecutor
import pandas_datareader.famafrench as ff

# --- Step 0: Setup ---
text_folder = "sample_10k_data3"  # root of the extracted 10-K tree: <CIK>/<year>/<quarter>/<file>.txt
sp500_file = "sp500_companies.csv"  # S&P 500 constituents table with a 'CIK' column

# Load S&P500 CIK list, both zero-padded to 10 digits and with leading zeros stripped
firms = pd.read_csv(sp500_file)
firms['CIK'] = firms['CIK'].astype(str).str.zfill(10)
sp500_ciks_padded = set(firms['CIK'])
sp500_ciks_unpadded = set(cik.lstrip('0') for cik in sp500_ciks_padded)

filing_metadata = []

# --- Step 1: Build (Firm, Filing) Dataset ---

print("🔍 Scanning filings...")

folder_list = os.listdir(text_folder)  # list the CIK folders once
for cik_folder in tqdm(folder_list, desc="Scanning firms"):
    cik_path = os.path.join(text_folder, cik_folder)
    if not os.path.isdir(cik_path):
        continue

    # Only S&P500 firms. Leading zeros are stripped before the lookup so a
    # zero-padded folder name still matches the unpadded CIK set.
    if cik_folder.lstrip('0') not in sp500_ciks_unpadded:
        continue

    for year_folder in os.listdir(cik_path):
        year_path = os.path.join(cik_path, year_folder)
        if not os.path.isdir(year_path):
            continue

        for quarter_folder in os.listdir(year_path):
            quarter_path = os.path.join(year_path, quarter_folder)
            if not os.path.isdir(quarter_path):
                continue

            for file in os.listdir(quarter_path):
                if file.endswith(".txt"):
                    # Text before the first underscore is parsed as YYYYMMDD below
                    filing_date = file.split("_")[0]
                    filing_metadata.append((cik_folder, filing_date, os.path.join(quarter_path, file)))

# Build filings dataframe
filings_df = pd.DataFrame(filing_metadata, columns=["CIK", "FilingDate", "Filepath"])

# A single file whose name does not start with a valid YYYYMMDD date would
# otherwise abort the whole run; coerce such prefixes to NaT and drop them.
filings_df["FilingDate"] = pd.to_datetime(filings_df["FilingDate"], format="%Y%m%d", errors="coerce")
undated = filings_df["FilingDate"].isna()
if undated.any():
    print(f"⚠️ Skipping {int(undated.sum())} file(s) whose name does not start with a YYYYMMDD date.")
    filings_df = filings_df[~undated]

# Filepath is the final sort key so the order of same-day filings (and hence
# the consecutive-filing pairs built later) does not depend on os.listdir order.
filings_df = filings_df.sort_values(["CIK", "FilingDate", "Filepath"]).reset_index(drop=True)

print(f"✅ Found {filings_df['CIK'].nunique()} S&P500 companies with filings.")

# --- Step 2: Preload all filings into memory ---

print("📚 Preloading all filings into memory...")

file_contents = {}  # filepath -> full text of the filing
for filepath in tqdm(filings_df["Filepath"], desc="Reading filings"):
    try:
        # Explicit encoding: without it the decoder is the platform default,
        # so the same file can yield different text on different machines.
        # Undecodable bytes are dropped (errors='ignore').
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            file_contents[filepath] = f.read()
    except OSError as e:
        # Unreadable files are reported and left out of the corpus
        tqdm.write(f"Error reading {filepath}: {e}")

# --- Step 3: Precompute TF-IDF vectors ---

print("⚡ Precomputing TF-IDF embeddings...")

if not file_contents:
    # Fail with a clear message instead of the vectorizer's own error
    raise ValueError("No filings could be read; nothing to vectorize.")

vectorizer = TfidfVectorizer(max_features=3000)
all_texts = list(file_contents.values())

# fit_transform learns the vocabulary/IDF weights and vectorizes in one pass,
# instead of tokenizing every document once for fit and again for transform.
tfidf_matrix = vectorizer.fit_transform(all_texts)

# Row i of the matrix belongs to the i-th key of file_contents, because
# all_texts was built from the same dict in the same iteration order.
tfidf_vectors = {}  # filepath -> 1 x n_features sparse row
for row, filepath in enumerate(tqdm(file_contents, desc="Vectorizing files")):
    tfidf_vectors[filepath] = tfidf_matrix[row]

# --- Step 4: Calculate Cosine Similarities (Parallel) ---

print("⚡ Calculating cosine similarities in parallel...")

def calc_cosine_for_firm(cik, group):
    """Cosine similarity between each filing of a firm and the one before it.

    Rows of ``group`` are compared pairwise in their existing order, using the
    precomputed vectors in the module-level ``tfidf_vectors`` dict. A pair is
    skipped when either file has no precomputed vector.

    Args:
        cik: Firm identifier copied into every output record.
        group: DataFrame with "Filepath" and "FilingDate" columns.

    Returns:
        List of ``(cik, filing_date_of_later_filing, cosine_similarity)``
        tuples; empty when ``group`` has fewer than two rows.
    """
    if len(group) < 2:
        return []

    paths = group["Filepath"].tolist()
    dates = group["FilingDate"].tolist()

    pair_scores = []
    # Walk adjacent (previous, current) pairs together with the current date
    for earlier, later, later_date in zip(paths, paths[1:], dates[1:]):
        if earlier in tfidf_vectors and later in tfidf_vectors:
            score = cosine_similarity(tfidf_vectors[earlier], tfidf_vectors[later])[0][0]
            pair_scores.append((cik, later_date, score))
    return pair_scores

similarity_records = []

# One task per firm; results are gathered in submission order
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    futures = [
        executor.submit(calc_cosine_for_firm, cik, group)
        for cik, group in filings_df.groupby("CIK")
    ]

    for future in tqdm(futures, desc="Firms for cosine"):
        similarity_records.extend(future.result())

# One row per (firm, later filing) pair
similarity_df = pd.DataFrame(
    similarity_records,
    columns=["CIK", "FilingDate", "CosineSimilarity"],
)
similarity_df["FilingDate"] = pd.to_datetime(similarity_df["FilingDate"])

# --- Step 5: Build (Firm, Month) returns dataset (Dummy) ---

print("📈 Generating dummy returns...")

np.random.seed(0)

# First-of-month timestamps for every month of 2010 through 2024, in calendar order
month_starts = [
    pd.Timestamp(year=year, month=month, day=1)
    for year in range(2010, 2025)
    for month in range(1, 13)
]

# One normal(0.01, 0.05) draw per (firm, month), drawn firm by firm and then
# month by month so the seeded sequence is consumed in a fixed order
firm_months = []
for cik in tqdm(similarity_df["CIK"].unique(), desc="Firms for returns"):
    firm_months.extend(
        (cik, date, np.random.normal(0.01, 0.05)) for date in month_starts
    )

returns_df = pd.DataFrame(firm_months, columns=["CIK", "Month", "Return"])

# --- Step 6: Merge and Fill Quintiles ---

print("🔗 Merging similarity into returns and creating quintiles...")


def _year_quintiles(scores):
    """Assign quintile labels 1-5 to one year's similarity scores.

    Args:
        scores: Series of cosine similarities for a single year.

    Returns:
        Float Series on the same index holding the labels 1.0-5.0, or NaN
        for every row when the year has fewer than two distinct scores.
    """
    labels = [1, 2, 3, 4, 5]
    try:
        binned = pd.qcut(scores, 5, labels=labels)
    except ValueError:
        # qcut with a fixed label list raises when quantile edges coincide
        # (ties, or a year with a single observation).
        if scores.nunique() < 2:
            return pd.Series(np.nan, index=scores.index, dtype=float)
        # Ranking with method="first" breaks ties by position, which makes
        # the bin edges distinct so the five labels can be applied.
        binned = pd.qcut(scores.rank(method="first"), 5, labels=labels)
    return binned.astype(float)


returns_df["Month"] = pd.to_datetime(returns_df["Month"])
similarity_df["Year"] = similarity_df["FilingDate"].dt.year
similarity_df["Quintile"] = similarity_df.groupby("Year")["CosineSimilarity"].transform(_year_quintiles)

# A firm can have more than one filing in a calendar year; merging all of them
# would duplicate that firm's monthly return rows. Keep the latest filing per
# (CIK, Year).
yearly_quintiles = (
    similarity_df.sort_values("FilingDate")
    .drop_duplicates(subset=["CIK", "Year"], keep="last")[["CIK", "Year", "Quintile"]]
)

# NOTE(review): the merge key is the calendar year, so a filing's quintile is
# attached to every month of its year, including months before the filing
# date. Confirm whether assignment should start only after the filing month.
returns_df["Year"] = returns_df["Month"].dt.year
returns_df = returns_df.merge(yearly_quintiles, on=["CIK", "Year"], how="left")

# Carry a firm's last known quintile forward over at most 5 months without one
returns_df = returns_df.sort_values(["CIK", "Month"])
returns_df["Quintile"] = returns_df.groupby("CIK")["Quintile"].ffill(limit=5)

# --- Step 7: Aggregate to Portfolio Returns ---

print("📊 Building portfolio returns...")

# Equal-weighted mean return per (month, quintile); one column per quintile
returns_df = returns_df.dropna(subset=["Quintile"])
portfolio_returns = returns_df.groupby(["Month", "Quintile"])["Return"].mean().unstack()

portfolio_returns.columns = portfolio_returns.columns.astype(int)

if 5 in portfolio_returns.columns and 1 in portfolio_returns.columns:
    portfolio_returns["High-Low"] = portfolio_returns[5] - portfolio_returns[1]
    print("✅ High-Low portfolio computed successfully.")
else:
    print("⚠️ Warning: Quintile 5 or 1 missing, skipping High-Low portfolio.")

# --- Step 8: Pull Fama-French Factors and Run Regressions ---

print("📂 Downloading Fama-French factors...")

# Table [0] of each dataset is used; the momentum table's single column is renamed to 'Mom'
df_factors = ff.FamaFrenchReader('F-F_Research_Data_5_Factors_2x3', start='1900-01-01').read()[0]
mom = ff.FamaFrenchReader('F-F_Momentum_Factor', start='1900-01-01').read()[0]
mom.columns = ['Mom']
df_factors = pd.merge(df_factors, mom, left_index=True, right_index=True)

# The factor index must become month-start timestamps to line up with the
# portfolio returns (indexed by the first day of each month). A PeriodIndex
# is converted with to_timestamp(); anything else is parsed as "%Y-%m".
if isinstance(df_factors.index, pd.PeriodIndex):
    df_factors.index = df_factors.index.to_timestamp()
else:
    df_factors.index = pd.to_datetime(df_factors.index, format="%Y-%m")
reg_df = pd.merge(df_factors, portfolio_returns, left_index=True, right_index=True)

# Quintile columns are labelled with the integers 1-5. Formula terms are
# resolved by string name, so give every column a string label.
reg_df.columns = [str(col) for col in reg_df.columns]

# NOTE(review): the portfolio returns are used as-is (no risk-free rate is
# subtracted) and their units may differ from the downloaded factors'.
# Confirm both before interpreting the 'r^e' row and the alphas.

# Regression Models: right-hand side of each formula ('1' = intercept only)
factor_models = {
    'r^e': '1',
    'CAPM': 'Q("Mkt-RF")',
    'FF3': 'Q("Mkt-RF") + SMB + HML',
    'FF4': 'Q("Mkt-RF") + SMB + HML + Mom',
    'FF5': 'Q("Mkt-RF") + SMB + HML + RMW + CMA',
    'FF6': 'Q("Mkt-RF") + SMB + HML + RMW + CMA + Mom'
}

portfolios = [1,2,3,4,5,"High-Low"]

# Result table: rows are (model, alpha | t-stat), columns are portfolios
index = pd.MultiIndex.from_product([factor_models.keys(), ['alpha', 't-stat']], names=['Model', 'Metric'])
results = pd.DataFrame(index=index, columns=portfolios, dtype=float)

print("⚡ Running regressions...")

for portfolio in tqdm(portfolios, desc="Portfolios"):
    column = str(portfolio)
    if column not in reg_df.columns:
        continue
    for model_name, formula in factor_models.items():
        # Q("...") needs the column name as a quoted string: an unquoted
        # Q(1) or Q(High-Low) is evaluated as a Python expression and fails.
        reg = smf.ols(formula=f'Q("{column}") ~ {formula}', data=reg_df).fit()
        alpha = reg.params['Intercept']
        t_stat = reg.tvalues['Intercept']
        results.at[(model_name, 'alpha'), portfolio] = alpha
        results.at[(model_name, 't-stat'), portfolio] = t_stat

print("✅ Done! Here are your results:")
display(results)

# --- Save final table ---
results.to_csv("final_portfolio_regressions.csv")
print("✅ Saved regression table to 'final_portfolio_regressions.csv'")