In [2]:
import numpy as np
import pandas as pd

from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler

import xgboost as xgb

# Reproducibility: seed NumPy's legacy global random generator.
np.random.seed(42)

# Display: attribute-style equivalents of pd.set_option(...), so wide frames
# show up to 100 columns and wrap at 120 characters.
pd.options.display.max_columns = 100
pd.options.display.width = 120


In [3]:
# =========================
# Block 2: Load & Clean Data
# =========================

# Columns the rest of the notebook relies on
REQUIRED_COLS = [
    "scheme_name",
    "returns_5yr",
    "sharpe",
    "standard_deviation",
    "risk_bucket"
]

# Metric columns that must end up numeric
NUMERIC_COLS = ["returns_5yr", "sharpe", "standard_deviation"]

# Read the workbook and keep an independent copy of just the required columns
df = pd.read_excel(r"../Excel/top_30_mutual_funds_excel.xlsx")[REQUIRED_COLS].copy()

# Values that cannot be parsed as numbers become NaN instead of raising
df[NUMERIC_COLS] = df[NUMERIC_COLS].apply(pd.to_numeric, errors="coerce")

# Discard funds missing any metric or the risk label, then renumber rows from 0
df = df.dropna(subset=NUMERIC_COLS + ["risk_bucket"]).reset_index(drop=True)

df.head()


Unnamed: 0,scheme_name,returns_5yr,sharpe,standard_deviation,risk_bucket
0,Quant Active Fund,19.9,1.87,18.235557,High Risk
1,Kotak Multi Asset Allocator FoF – Dynamic – Di...,15.3,1.91,5.786382,Moderate
2,Tata Digital India Fund,22.1,1.37,23.002947,High Risk
3,Kotak India Growth Fund,15.6,1.75,12.715432,High Risk
4,ICICI Pru Thematic Advantage Fund,14.6,1.63,12.548926,High Risk


In [4]:
# =========================
# Block 3.1: Z-score Features
# =========================

# Standardise each metric across all funds currently in `df`
returns_z = zscore(df["returns_5yr"])
sharpe_z = zscore(df["sharpe"])
volatility_z = zscore(df["standard_deviation"])

df["z_returns_5yr"] = returns_z
df["z_sharpe"] = sharpe_z

# Sign flipped because LOWER volatility is better: a positive z_std_dev
# therefore means below-average standard deviation.
df["z_std_dev"] = -volatility_z

df[["z_returns_5yr", "z_sharpe", "z_std_dev"]].head()


Unnamed: 0,z_returns_5yr,z_sharpe,z_std_dev
0,0.953808,-0.38328,-0.631834
1,-0.270299,-0.179768,1.624767
2,1.539251,-2.927173,-1.495996
3,-0.190466,-0.993814,0.368772
4,-0.456576,-1.604349,0.398954


In [5]:
# =========================
# Block 3.2: Raw Target Score
# =========================

# Weighted blend of the standardised metrics:
# 50% returns, 30% Sharpe, 20% (inverted) volatility.
df["raw_target_score"] = (
    df["z_returns_5yr"].mul(0.5)
    .add(df["z_sharpe"].mul(0.3))
    .add(df["z_std_dev"].mul(0.2))
)

df[["raw_target_score"]].describe()


Unnamed: 0,raw_target_score
count,30.0
mean,-5.046889e-16
std,0.52967
min,-1.476445
25%,-0.3520723
50%,0.06927203
75%,0.3383265
max,0.8583837


In [6]:
# =========================
# Block 3.3: Target Score (Z-normalized)
# =========================

# Re-standardise the blended score so the training target has zero mean and
# unit variance.
scaler = StandardScaler()
scaled_scores = scaler.fit_transform(df[["raw_target_score"]])
df["target_score_z"] = scaled_scores

# StandardScaler divides by the population standard deviation, whereas
# describe() reports the sample standard deviation, which is why the displayed
# std comes out slightly above 1.
df[["target_score_z"]].describe()


Unnamed: 0,target_score_z
count,30.0
mean,-1.0177040000000001e-17
std,1.017095
min,-2.835134
25%,-0.6760645
50%,0.1330191
75%,0.6496692
max,1.648305


In [7]:
# =========================
# Block 4: Risk Mapping
# =========================

# User-facing risk appetite -> `risk_bucket` labels that appetite may be shown.
RISK_MAP = {
    "Low": ["Low Risk"],
    "Moderately Low": ["Moderately Low", "Low Risk"],
    "Moderate": ["Moderate", "Moderately Low"],
    "High": ["High Risk"]
}


def filter_by_risk(df, user_risk):
    """Return a copy of the rows whose risk bucket suits the given appetite.

    Args:
        df: DataFrame with a ``risk_bucket`` column.
        user_risk: Key into the module-level ``RISK_MAP``. The map is read at
            call time, so a later reassignment of ``RISK_MAP`` changes the
            result.

    Returns:
        A new DataFrame holding only the rows whose ``risk_bucket`` is allowed
        for ``user_risk``. An unrecognised ``user_risk`` yields an empty frame
        rather than an error.
    """
    permitted = RISK_MAP.get(user_risk, [])
    bucket_ok = df["risk_bucket"].isin(permitted)
    return df.loc[bucket_ok].copy()


In [8]:
# =========================
# Block 5A: Feature Selection
# =========================

# Model inputs: the three standardised metrics built in Block 3.1
FEATURES = ["z_returns_5yr", "z_sharpe", "z_std_dev"]

# Model target: the standardised blended score built in Block 3.3
TARGET = "target_score_z"

# =========================
# Block 5B: Train XGBoost
# =========================

from xgboost import XGBRegressor

X, y = df[FEATURES], df[TARGET]

# NOTE: the target is a fixed weighted blend of these same three features
# (Blocks 3.2 / 3.3), and the model is fitted on every row of `df` with no
# hold-out split, so it is learning to reproduce that scoring formula.
model = XGBRegressor(
    random_state=42,
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8
)

model.fit(X, y)


In [9]:
# =========================
# Block 6A: Risk Mapping
# =========================

# User-facing risk appetite -> `risk_bucket` labels that appetite may be shown.
# This reassigns the global RISK_MAP from Block 4, so it must keep every tier
# that Block 4 defines. Leaving out "Moderately Low" here made
# filter_by_risk(df, "Moderately Low") return nothing and made
# recommend_top_funds raise KeyError for that tier once this cell had run.
RISK_MAP = {
    "Low": ["Low Risk"],
    "Moderately Low": ["Moderately Low", "Low Risk"],
    "Moderate": ["Moderate", "Moderately Low"],
    "High": ["High Risk"]
}

# =========================
# Block 6B: Return Band Filter
# =========================

# Return expectation -> (low, high) bounds on `returns_5yr`, in percent.
# recommend_top_funds treats both bounds as inclusive, so a fund sitting
# exactly on 12 or 18 matches two neighbouring bands.
RETURN_BANDS = {
    "Low": (0, 12),
    "Moderate": (12, 18),
    "High": (18, 30)
}

# =========================
# Block 6C: Recommendation Engine
# =========================

def recommend_top_funds(
    df,
    user_risk="Moderate",
    return_band="Moderate",
    top_n=5
):
    """Rank the funds matching a risk appetite and return band by model score.

    Relies on the notebook-level ``RISK_MAP``, ``RETURN_BANDS``, ``FEATURES``
    and the fitted ``model``; all four are looked up at call time.

    Args:
        df: DataFrame containing ``scheme_name``, ``returns_5yr``,
            ``risk_bucket`` and every column listed in ``FEATURES``.
        user_risk: Key into ``RISK_MAP``.
        return_band: Key into ``RETURN_BANDS``; both bounds are inclusive.
        top_n: Maximum number of funds to return.

    Returns:
        DataFrame with columns ``scheme_name``, ``ml_score``, ``returns_5yr``
        and ``risk_bucket``, sorted by ``ml_score`` descending. It is empty
        (same columns) when no fund passes both filters.

    Raises:
        KeyError: If ``user_risk`` or ``return_band`` is not a known key.
    """
    # Same exception type a plain dict lookup would raise, but with a message
    # that lists the accepted values.
    if user_risk not in RISK_MAP:
        raise KeyError(
            f"Unknown user_risk {user_risk!r}; expected one of {sorted(RISK_MAP)}"
        )
    if return_band not in RETURN_BANDS:
        raise KeyError(
            f"Unknown return_band {return_band!r}; expected one of {sorted(RETURN_BANDS)}"
        )

    # 1️⃣ + 2️⃣ Risk filter and (inclusive) return band filter
    low, high = RETURN_BANDS[return_band]
    eligible = (
        df["risk_bucket"].isin(RISK_MAP[user_risk])
        & df["returns_5yr"].between(low, high)
    )
    filtered = df.loc[eligible].copy()

    # 3️⃣ Predict ML score
    if filtered.empty:
        # Nothing to score: skip the model call, which may not accept a
        # zero-row input, and still hand back the expected columns.
        filtered["ml_score"] = float("nan")
    else:
        filtered["ml_score"] = model.predict(filtered[FEATURES])

    # 4️⃣ Rank
    return (
        filtered
        .sort_values("ml_score", ascending=False)
        .head(top_n)
        [["scheme_name", "ml_score", "returns_5yr", "risk_bucket"]]
    )


In [None]:
# =========================
# Block 7: Financial Projection + Interpretation
# =========================

# ---------- Financial Math ----------
def future_value_lumpsum(amount, annual_return, years):
    """Grow a one-off investment at a fixed annual rate, compounded yearly.

    Args:
        amount: Sum invested today.
        annual_return: Annual return in percent (e.g. ``12`` for 12%).
        years: Number of years the money stays invested.

    Returns:
        ``amount * (1 + annual_return / 100) ** years``.
    """
    growth_factor = 1 + annual_return / 100
    return amount * growth_factor ** years


def future_value_sip(monthly_investment, annual_return, years):
    """Future value of a fixed monthly investment (SIP) at a constant rate.

    Uses the annuity-due form of the formula (note the trailing ``* (1 + r)``),
    with the annual percentage split evenly into a monthly rate.

    Args:
        monthly_investment: Amount contributed every month.
        annual_return: Annual return in percent (e.g. ``12`` for 12%).
        years: Investment duration in years.

    Returns:
        The projected value after ``years * 12`` contributions. With a 0%
        return this is simply the sum of the contributions.
    """
    r = annual_return / 100 / 12
    n = years * 12

    # The closed-form formula divides by r, so a 0% return has to be handled
    # separately: without growth the value is just what was paid in.
    if r == 0:
        return float(monthly_investment * n)

    return monthly_investment * ((1 + r) ** n - 1) / r * (1 + r)


# ---------- Confidence Score ----------
def confidence_score(row):
    """
    Converts z-scores into a 0–100 confidence percentage.
    z_std_dev is already inverted (lower risk = higher score).

    Args:
        row: Mapping or Series with ``z_returns_5yr``, ``z_sharpe`` and
            ``z_std_dev`` entries.

    Returns:
        ``50 + 15 * weighted_z`` clamped to [0, 100] and rounded to two
        decimals, or NaN when any of the three z-scores is missing.
    """
    score = (
        0.45 * row["z_returns_5yr"] +
        0.35 * row["z_sharpe"] +
        0.20 * row["z_std_dev"]
    )

    # Comparisons against NaN are always False, so min(100, nan) evaluates to
    # 100 and a fund with a missing metric would be reported as 100% confident.
    # Propagate the missing value instead.
    if pd.isna(score):
        return float("nan")

    confidence = 50 + (score * 15)
    return round(max(0, min(100, confidence)), 2)


# ---------- Explainability ----------
def explain_fund(row):
    """Build a short, comma-separated rationale from a fund's z-scores.

    Args:
        row: Mapping or Series with ``z_returns_5yr``, ``z_sharpe`` and
            ``z_std_dev`` entries.

    Returns:
        Three phrases joined by ``", "``: one for returns, one for
        risk-adjusted performance and one for volatility.
    """
    z_returns = row["z_returns_5yr"]
    returns_note = (
        "Strong long-term returns" if z_returns > 0.75
        else "Above-average returns" if z_returns > 0
        else "Moderate returns"
    )

    z_sharpe = row["z_sharpe"]
    sharpe_note = (
        "Good risk-adjusted performance" if z_sharpe > 0.5
        else "Acceptable risk-adjusted performance" if z_sharpe > 0
        else "Risk-adjusted performance is volatile"
    )

    # z_std_dev is read as "positive = calmer than average"
    volatility_note = (
        "Lower-than-average volatility" if row["z_std_dev"] > 0
        else "Higher volatility expected"
    )

    return ", ".join([returns_note, sharpe_note, volatility_note])


# ---------- Projection Engine ----------
def attach_projection(
    df,
    investment_type="SIP",
    amount=5000,
    years=10
):
    """Return a copy of ``df`` with a projected investment value per fund.

    Args:
        df: DataFrame with a ``returns_5yr`` column, used as the annual return
            (in percent) for the projection.
        investment_type: ``"SIP"`` projects a monthly investment via
            ``future_value_sip``; any other value is treated as a lump sum and
            uses ``future_value_lumpsum``.
        amount: Monthly contribution (SIP) or one-off amount (lump sum).
        years: Investment duration in years.

    Returns:
        A new DataFrame with ``projected_value``, ``investment_type`` and
        ``investment_years`` columns added; the input frame is left untouched.
    """
    # Pick the projection formula once instead of branching around two applies
    project = future_value_sip if investment_type == "SIP" else future_value_lumpsum

    projected = df.copy()
    projected["projected_value"] = projected["returns_5yr"].apply(
        lambda annual_return: project(amount, annual_return, years)
    )
    projected["investment_type"] = investment_type
    projected["investment_years"] = years

    return projected


# =========================
# FINAL PIPELINE EXECUTION
# =========================

# Step 1: Get ML-ranked funds
top_funds = recommend_top_funds(df, user_risk="High", return_band="High", top_n=5)

# Step 2: Merge z-score features for interpretation.
# recommend_top_funds does not return the z-score columns, so they are looked
# up again by fund name.
# NOTE(review): assumes scheme_name is unique in df; duplicate names would
# multiply rows in this join — confirm against the source data.
z_feature_lookup = df[["scheme_name", "z_returns_5yr", "z_sharpe", "z_std_dev"]]
top_funds = pd.merge(top_funds, z_feature_lookup, on="scheme_name", how="left")

# Step 3: Confidence + Explanation (both computed row by row)
top_funds["confidence_%"] = top_funds.apply(confidence_score, axis="columns")
top_funds["why_recommended"] = top_funds.apply(explain_fund, axis="columns")

# Step 4: Attach financial projection (LAST STEP)
final_output = attach_projection(top_funds, investment_type="SIP", amount=5000, years=10)
# =========================
# COLUMN RENAMING
# =========================

# Internal column name -> label shown in the final table
DISPLAY_NAMES = {
    "scheme_name": "Fund Name",
    "ml_score": "AI Ranking Score",
    "returns_5yr": "5Y Annual Return (%)",
    "risk_bucket": "Risk Level",
    "z_returns_5yr": "Return Strength (Z)",
    "z_sharpe": "Risk-Adjusted Strength (Z)",
    "z_std_dev": "Volatility Score (Z)",
    "confidence_%": "Confidence Level (%)",
    "why_recommended": "Why This Fund?",
    "projected_value": "Projected Value (₹)",
    "investment_type": "Investment Type",
    "investment_years": "Investment Duration (Years)"
}

# Left-to-right presentation order of the renamed columns
DISPLAY_ORDER = [
    "Fund Name",
    "Risk Level",
    "AI Ranking Score",
    "Confidence Level (%)",
    "5Y Annual Return (%)",
    "Projected Value (₹)",
    "Investment Type",
    "Investment Duration (Years)",
    "Why This Fund?",
    "Return Strength (Z)",
    "Risk-Adjusted Strength (Z)",
    "Volatility Score (Z)"
]

final_output = final_output.rename(columns=DISPLAY_NAMES)
final_output = final_output[DISPLAY_ORDER]

def format_lakh_crore(value):
    """Format a rupee amount in crores (>= 1e7) or lakhs, with two decimals.

    Args:
        value: Amount in rupees.

    Returns:
        A string such as ``"₹1.25 Cr"`` or ``"₹20.54 Lakhs"``.
    """
    return f"₹{value/1e7:.2f} Cr" if value >= 1e7 else f"₹{value/1e5:.2f} Lakhs"

# Show projected values in lakhs / crores instead of raw rupees
final_output["Projected Value (₹)"] = final_output["Projected Value (₹)"].apply(format_lakh_crore)

# Sort by score BEFORE numbering the rows. Assigning Rank first only gave the
# right answer because the incoming frame happened to be sorted already; this
# order guarantees Rank 1..n always matches the displayed order.
final_output = final_output.sort_values(
    "AI Ranking Score", ascending=False
).reset_index(drop=True)

final_output.insert(0, "Rank", range(1, len(final_output) + 1))


final_output

Unnamed: 0,Rank,Fund Name,Risk Level,AI Ranking Score,Confidence Level (%),5Y Annual Return (%),Projected Value (₹),Investment Type,Investment Duration (Years),Why This Fund?,Return Strength (Z),Risk-Adjusted Strength (Z),Volatility Score (Z)
0,1,Quant Infrastructure Fund,High Risk,1.636971,63.27,21.1,₹20.54 Lakhs,SIP,10,"Strong long-term returns, Good risk-adjusted p...",1.273141,1.804468,-1.597636
1,2,Quant Absolute Fund,High Risk,1.370029,60.76,19.1,₹18.04 Lakhs,SIP,10,"Above-average returns, Good risk-adjusted perf...",0.74092,0.888667,0.365828
2,3,Quant Small Cap Fund,High Risk,1.333911,59.62,23.2,₹23.60 Lakhs,SIP,10,"Strong long-term returns, Good risk-adjusted p...",1.831972,0.837789,-2.380191
3,4,AXIS Small Cap Fund,High Risk,1.273561,59.72,19.4,₹18.39 Lakhs,SIP,10,"Strong long-term returns, Good risk-adjusted p...",0.820753,0.5834,0.371879
4,5,Quant Tax Plan- Direct Growth,High Risk,1.190353,58.78,22.2,₹22.08 Lakhs,SIP,10,"Strong long-term returns, Acceptable risk-adju...",1.565862,0.430766,-1.352001
