In [None]:
import re
import json
from datasets import load_dataset

# Open the 10-K filings dataset. streaming=True is passed, presumably so rows
# are fetched while iterating instead of downloading everything up front
# (confirm against the `datasets` documentation).
ds = load_dataset("winterForestStump/10-K_sec_filings", streaming=True)
# Pick the split stored under the key "001"; the collection loop further down
# iterates over it row by row.
split = ds["001"]


In [None]:
# Industry classification from the text of a filing's "Business" section.

# Ordered (label, keywords) pairs. The first label with a matching keyword wins,
# so earlier entries take priority over later ones.
_INDUSTRY_KEYWORDS = (
    ("Apparel & Textiles", ("apparel", "clothing", "fashion", "textile", "garment")),
    ("Technology", ("software", "technology", "computer", "internet", "digital")),
    ("Financial Services", ("bank", "financial", "insurance", "investment", "credit")),
    ("Retail", ("retail", "store", "merchandise", "shopping")),
    ("Healthcare & Pharma", ("pharmaceutical", "drug", "biotech", "medical device")),
    ("Energy", ("energy", "oil", "gas", "petroleum", "electric")),
    ("Food & Beverage", ("food", "beverage", "restaurant", "dining")),
    ("Manufacturing", ("manufacture", "manufacturing", "industrial", "machinery")),
    ("Real Estate", ("real estate", "property", "construction")),
    ("Telecommunications", ("telecommunications", "telecom", "wireless", "communication")),
)


def _keyword_pattern(keywords):
    """Compile a regex that matches any of `keywords` at the start of a word."""
    alternatives = "|".join(
        # Multi-word keywords tolerate any run of whitespace between their parts.
        r"\s+".join(re.escape(part) for part in keyword.split())
        for keyword in keywords
    )
    # Only a leading \b: plurals and derived forms ("stores", "banking") still
    # match, but hits in the middle of a word ("restore", "Vegas") do not.
    return re.compile(rf"\b(?:{alternatives})")


_INDUSTRY_PATTERNS = tuple(
    (label, _keyword_pattern(keywords)) for label, keywords in _INDUSTRY_KEYWORDS
)


def classify_industry(business_text):
    """Return a coarse industry label for a company's business description.

    The text is lower-cased and checked against the keyword groups in
    `_INDUSTRY_KEYWORDS`, in order. A keyword only counts when it starts a
    word, so "technology" inside "biotechnology" or "gas" inside "Vegas" no
    longer triggers a label.

    Args:
        business_text: Text of the "Business" section (a string).

    Returns:
        The label of the first keyword group with a match, or "Other" when no
        group matches.

    Note:
        This is still a keyword heuristic: a single incidental mention of a
        word (e.g. "in a timely fashion") decides the label.
    """
    text = business_text.lower()
    for label, pattern in _INDUSTRY_PATTERNS:
        if pattern.search(text):
            return label
    return "Other"

In [None]:
# Normalise raw filing text before numbers are extracted from it.
def clean_text(text):
    """Remove dollar signs and squeeze whitespace in `text`.

    Every "$" is dropped, each run of whitespace becomes a single space, and
    leading/trailing whitespace is trimmed.
    """
    without_dollars = "".join(text.split("$"))
    single_spaced = re.sub(r"\s+", " ", without_dollars)
    return single_spaced.strip()

In [None]:
# Extract the distinct years mentioned in a text, in order of first appearance.
def get_years(text, max_years=5):
    """Return the first distinct four-digit years (1900-2099) found in `text`.

    Args:
        text: Text to scan.
        max_years: Stop once this many distinct years have been collected.
            Expected to be a positive integer; pass None for no limit.

    Returns:
        A list of ints in order of first appearance, without repeats.
    """
    unique_years = []
    # The digit lookarounds keep the pattern from matching inside a longer
    # number (e.g. the "1998" hidden in "219987").
    for token in re.findall(r"(?<!\d)(?:19|20)\d{2}(?!\d)", text):
        year = int(token)
        # Compare ints with ints: checking the raw string against the list of
        # ints would never detect a repeat.
        if year in unique_years:
            continue
        unique_years.append(year)
        if max_years is not None and len(unique_years) >= max_years:
            break
    return unique_years

The following function matches a pattern in the text to extract the values of the desired metric.

In [None]:
def get_metric_values(text, metric_name, years, n_values=5):
    """Extract the row of numbers that follows a metric label in `text`.

    Looks for `metric_name` (case-insensitive) followed by `n_values`
    whitespace-separated numbers such as "1,234" or "12.5", and returns them
    as floats.

    Args:
        text: Cleaned financial text (dollar signs removed, whitespace
            collapsed).
        metric_name: Row label to search for, matched literally.
        years: Unused; kept so existing callers keep working.
        n_values: Number of consecutive numeric columns that must follow the
            label. Defaults to 5.

    Returns:
        A list of `n_values` floats in the order they appear, or None when no
        such row exists.

    Raises:
        ValueError: If `n_values` is smaller than 1.

    Note:
        Values written with a minus sign or in parentheses are not matched.
    """
    if n_values < 1:
        raise ValueError("n_values must be at least 1")

    # A number must start with a digit, so a stray comma can no longer be
    # captured as a value (which used to abort the whole lookup).
    number = r"(\d[\d,]*(?:\.\d+)?)"
    pattern = re.escape(metric_name) + (r"\s+" + number) * n_values

    match = re.search(pattern, text, re.IGNORECASE)
    if match is None:
        return None

    # Every captured token holds at least one digit, so once the thousands
    # separators are removed float() cannot fail.
    return [float(token.replace(",", "")) for token in match.groups()]

In [None]:
def calculate_simple_kpis(metrics_dict):
    """Derive percentage KPIs from the extracted metrics.

    Each KPI is numerator / denominator * 100, rounded to two decimals. A KPI
    is only produced when both of its metrics are present and the denominator
    is greater than zero.

    Args:
        metrics_dict: Maps metric labels (e.g. "Net sales") to numbers.

    Returns:
        A dict with any of "Profit_Margin_%", "ROA_%" and "Debt_Ratio_%".
    """
    # (KPI name, numerator metric, denominator metric), in output order.
    ratio_specs = (
        ("Profit_Margin_%", "Net income", "Net sales"),
        ("ROA_%", "Net income", "Total assets"),
        (
            "Debt_Ratio_%",
            "Long-term debt and obligations under capital leases",
            "Total assets",
        ),
    )

    kpis = {}
    for kpi_name, numerator_key, denominator_key in ratio_specs:
        if numerator_key not in metrics_dict or denominator_key not in metrics_dict:
            continue
        denominator = metrics_dict[denominator_key]
        if denominator > 0:
            kpis[kpi_name] = round((metrics_dict[numerator_key] / denominator) * 100, 2)
    return kpis


In [None]:
def process_one_company(row):
    """Turn one dataset row into a company profile, or return None.

    The row is read with `.get`, so it is expected to be dict-like. None is
    returned when the "Business" or "Selected Financial Data" section is
    empty, when no year is found in the financial text, or when fewer than
    two of the wanted metrics can be extracted.

    Returns:
        A dict with "company_name", "cik", "industry", "year", "metrics"
        (first value of each metric row that was found) and "kpis".
    """
    # The industry label is derived from the narrative "Business" section.
    business_section = row.get("Business", "")
    if not business_section:
        return None
    industry_label = classify_industry(business_section)

    # Numbers come from the "Selected Financial Data" section.
    financial_section = row.get("Selected Financial Data", "")
    if not financial_section:
        return None
    normalized = clean_text(financial_section)

    found_years = get_years(normalized)
    if not found_years:
        return None

    wanted_metrics = (
        "Net sales",
        "Net income",
        "Total assets",
        "Long-term debt and obligations under capital leases",
    )

    # Keep the first value of every metric row that could be parsed; it is
    # reported together with the first year found in the text.
    latest_metrics = {}
    for metric_label in wanted_metrics:
        series = get_metric_values(normalized, metric_label, found_years)
        if series:
            latest_metrics[metric_label] = series[0]

    # Fewer than two metrics is too little to describe the company.
    if len(latest_metrics) < 2:
        return None

    return {
        "company_name": row.get("company_name", "Unknown"),
        "cik": row.get("cik", ""),
        "industry": industry_label,
        "year": found_years[0],
        "metrics": latest_metrics,
        "kpis": calculate_simple_kpis(latest_metrics),
    }

In [None]:
# Collect company profiles from the streamed split and save them as JSON.

# Upper bound on the number of profiles gathered in one run.
MAX_COMPANIES = 300

all_companies = []
for row in split:
    company = process_one_company(row)

    # Skip rows that could not be parsed, and profiles identical to one that
    # was already collected (the same filing can appear more than once).
    if company and company not in all_companies:
        all_companies.append(company)

    if len(all_companies) >= MAX_COMPANIES:
        break

# Kept for any later cell that reads the number of collected profiles.
count = len(all_companies)

with open("companies_with_industry.json", "w") as f:
    json.dump(all_companies, f, indent=2)

In [None]:

# Tally the collected companies per industry label.
industry_counts = {}
for company in all_companies:
    ind = company["industry"]
    if ind in industry_counts:
        industry_counts[ind] += 1
    else:
        industry_counts[ind] = 1

# Most common industries first; the sort is stable, so negating the count
# keeps tied industries in first-seen order.
for industry, count in sorted(industry_counts.items(), key=lambda pair: -pair[1]):
    print(f"{industry}: {count} companies")


# Preview the first three collected profiles.
for i, company in enumerate(all_companies[:3]):
    print(f"\n{i+1}. {company['company_name']}")
    print(f"   Industry: {company['industry']}")
    print(f"   Year: {company['year']}")
    print(f"   Total Assets: ${company['metrics'].get('Total assets', 0):,.0f}")
    print(f"   KPIs: {company['kpis']}")

Technology: 155 companies
Financial Services: 84 companies
Apparel & Textiles: 46 companies
Other: 8 companies
Energy: 4 companies
Retail: 2 companies
Telecommunications: 1 companies

1. TROPICAL SPORTSWEAR INTERNATIONAL CORP
   Industry: Apparel & Textiles
   Year: 1998
   Total Assets: $297,476
   KPIs: {'Profit_Margin_%': 4.09, 'ROA_%': 3.63, 'Debt_Ratio_%': 57.65}

2. IEC ELECTRONICS CORP
   Industry: Technology
   Year: 1998
   Total Assets: $98,665
   KPIs: {}

3. IEC ELECTRONICS CORP
   Industry: Technology
   Year: 1998
   Total Assets: $98,665
   KPIs: {}


In [None]:
import json


# Reload the profiles from the JSON file written by the collection cell, so the
# peer analysis below works from the saved data.
with open("companies_with_industry.json", "r") as f:
    all_companies = json.load(f)

print("done loading")
# Filter helper: tells apart profiles with and without computed KPIs.
def has_some_kpis(company):
    """Return True when the profile's "kpis" mapping holds at least one entry."""
    kpis = company["kpis"]
    return len(kpis) >= 1

# Keep only the profiles for which at least one KPI was computed.
complete_companies = list(filter(has_some_kpis, all_companies))
print(f"Found {len(complete_companies)} companies with KPI data")


done loading
Found 211 companies with KPI data


In [None]:
# Size comparison between two company profiles.
def calculate_size_similarity(company1, company2):
    """Score how close two companies are in total assets, up to 40 points.

    The score is (smaller assets / larger assets) * 40, rounded to two
    decimals. If either profile has no "Total assets" metric, or it is zero,
    the score is 0.
    """
    first_assets = company1["metrics"].get("Total assets", 0)
    second_assets = company2["metrics"].get("Total assets", 0)

    # Without both asset figures the ratio cannot be formed.
    if 0 in (first_assets, second_assets):
        return 0

    return round(min(first_assets, second_assets) / max(first_assets, second_assets) * 40, 2)


In [None]:
# Tiered closeness score for a single KPI.
def compare_one_kpi(value1, value2, threshold_small, threshold_medium, threshold_large):
    """Score how close two KPI values are.

    The absolute difference is checked against the thresholds in ascending
    order: below `threshold_small` scores 8.5, below `threshold_medium` 7.5,
    below `threshold_large` 5.0, and anything else 2.5.
    """
    gap = abs(value1 - value2)

    tiers = (
        (threshold_small, 8.5),
        (threshold_medium, 7.5),
        (threshold_large, 5.0),
    )
    for limit, points in tiers:
        if gap < limit:
            return points
    return 2.5


def calculate_kpi_similarity(company1, company2):
    """Sum the closeness scores of every KPI that both companies report.

    Each KPI present in both "kpis" mappings is scored with
    `compare_one_kpi`, using that KPI's own difference thresholds. KPIs
    missing from either side add nothing.

    Returns:
        The summed score, rounded to two decimals.
    """
    # KPI key -> (small, medium, large) difference thresholds.
    kpi_thresholds = (
        ("Profit_Margin_%", (2, 5, 10)),
        ("ROA_%", (2, 5, 10)),
        ("Debt_Ratio_%", (5, 10, 20)),
        ("Asset_Turnover", (0.2, 0.5, 1.0)),
        ("Working_Capital_Ratio_%", (5, 10, 20)),
        ("Equity_Ratio_%", (5, 10, 20)),
        ("ROE_%", (3, 7, 15)),
    )

    first_kpis = company1["kpis"]
    second_kpis = company2["kpis"]

    total = 0
    for kpi_key, thresholds in kpi_thresholds:
        if kpi_key in first_kpis and kpi_key in second_kpis:
            total += compare_one_kpi(first_kpis[kpi_key], second_kpis[kpi_key], *thresholds)

    return round(total, 2)

In [None]:
def calculate_total_similarity(company1, company2):
    """Combine the size and KPI similarity of two companies.

    Returns:
        A dict with "total" (the sum of both parts, rounded to two decimals),
        "size_score" and "kpi_score".
    """
    size_points = calculate_size_similarity(company1, company2)
    kpi_points = calculate_kpi_similarity(company1, company2)

    return {
        "total": round(size_points + kpi_points, 2),
        "size_score": size_points,
        "kpi_score": kpi_points,
    }


In [None]:
# Peer search: restrict to the same industry first, then rank by similarity.
def find_peers(new_company, all_companies, top_n=5):
    """Return the `top_n` most similar companies from the same industry.

    Candidates are the entries of `all_companies` that share the industry of
    `new_company` and have a different company name. Each one is scored with
    `calculate_total_similarity`, and the best-scoring summaries are returned.
    The number of candidates is printed along the way.

    Returns:
        A list of summary dicts sorted by "total_similarity", highest first;
        empty when the industry has no other company.
    """
    new_industry = new_company["industry"]

    same_industry = []
    for candidate in all_companies:
        if candidate["industry"] != new_industry:
            continue
        if candidate["company_name"] == new_company["company_name"]:
            continue
        same_industry.append(candidate)

    print(f"\n Total companies inn '{new_industry}' industry: {len(same_industry)}")

    if not same_industry:
        print("No peers in the industry ")
        return []

    def summarize(candidate):
        # Flatten the score breakdown and the headline figures into one record.
        similarity = calculate_total_similarity(new_company, candidate)
        candidate_kpis = candidate["kpis"]
        return {
            "company_name": candidate["company_name"],
            "industry": candidate["industry"],
            "total_similarity": similarity["total"],
            "size_similarity": similarity["size_score"],
            "kpi_similarity": similarity["kpi_score"],
            "total_assets": candidate["metrics"].get("Total assets", 0),
            "profit_margin": candidate_kpis.get("Profit_Margin_%", "N/A"),
            "roa": candidate_kpis.get("ROA_%", "N/A"),
            "debt_ratio": candidate_kpis.get("Debt_Ratio_%", "N/A"),
        }

    ranked = sorted(
        (summarize(candidate) for candidate in same_industry),
        key=lambda entry: entry["total_similarity"],
        reverse=True,
    )
    return ranked[:top_n]

In [None]:
# Demo: use the first company with KPI data as the "new" company, list its
# closest peers and save the result.

# Maximum attainable scores, used for the "/max" labels below.
# Size: calculate_size_similarity scales the asset ratio to 40 points.
# KPIs: calculate_kpi_similarity compares up to 7 KPIs worth at most 8.5 each.
SIZE_SCORE_MAX = 40
KPI_SCORE_MAX = 7 * 8.5
TOTAL_SCORE_MAX = SIZE_SCORE_MAX + KPI_SCORE_MAX

sample_company = complete_companies[0]

print(f"\n NEW COMPANY: {sample_company['company_name']}")
print(f"   Industry: {sample_company['industry']}")
# .get: a profile can have KPIs (e.g. only a profit margin) without a
# "Total assets" metric, so direct indexing could raise KeyError.
print(f"   Total Assets: ${sample_company['metrics'].get('Total assets', 0):,.0f}")
print(f"   Profit Margin: {sample_company['kpis'].get('Profit_Margin_%', 'N/A')}%")
print(f"   ROA: {sample_company['kpis'].get('ROA_%', 'N/A')}%")
print(f"   Debt Ratio: {sample_company['kpis'].get('Debt_Ratio_%', 'N/A')}%")

# Find peers
peers = find_peers(sample_company, complete_companies, top_n=5)

if peers:
    for i, peer in enumerate(peers, 1):
        print(f"\n{i}. {peer['company_name']}")
        print(f"   Total Similarity: {peer['total_similarity']}/{TOTAL_SCORE_MAX}")
        print(f"     - Size Similarity: {peer['size_similarity']}/{SIZE_SCORE_MAX}")
        print(f"     - KPI Similarity: {peer['kpi_similarity']}/{KPI_SCORE_MAX}")
        print(f"   Total Assets: ${peer['total_assets']:,.0f}")
        print(f"   Profit Margin: {peer['profit_margin']}%")
        print(f"   ROA: {peer['roa']}%")
        print(f"   Debt Ratio: {peer['debt_ratio']}%")


# Persist the sample company together with its ranked peers.
peer_analysis = {
    "new_company": {
        "name": sample_company["company_name"],
        "industry": sample_company["industry"],
        "metrics": sample_company["metrics"],
        "kpis": sample_company["kpis"]
    },
    "peers": peers
}

with open("peer_analysis_with_size.json", "w") as f:
    json.dump(peer_analysis, f, indent=2)




 NEW COMPANY: TROPICAL SPORTSWEAR INTERNATIONAL CORP
   Industry: Apparel & Textiles
   Total Assets: $297,476
   Profit Margin: 4.09%
   ROA: 3.63%
   Debt Ratio: 57.65%

 Total companies inn 'Apparel & Textiles' industry: 25

1. COLUMBIA SPORTSWEAR CO
   Total Similarity: 48.74/100
     - Size Similarity: 36.24/30
     - KPI Similarity: 12.5/70
   Total Assets: $269,478
   Profit Margin: 7.66%
   ROA: 12.15%
   Debt Ratio: N/A%

2. OLD DOMINION FREIGHT LINE INC/VA
   Total Similarity: 41.01/100
     - Size Similarity: 32.51/30
     - KPI Similarity: 8.5/70
   Total Assets: $241,799
   Profit Margin: N/A%
   ROA: 4.6%
   Debt Ratio: N/A%

3. AMCOL INTERNATIONAL CORP
   Total Similarity: 40.75/100
     - Size Similarity: 33.25/30
     - KPI Similarity: 7.5/70
   Total Assets: $357,864
   Profit Margin: N/A%
   ROA: 6.17%
   Debt Ratio: N/A%

4. FAB INDUSTRIES INC
   Total Similarity: 38.57/100
     - Size Similarity: 21.57/30
     - KPI Similarity: 17.0/70
   Total Assets: $160,403
  