In [1]:
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", "{:.3f}".format)


In [2]:
df = pd.read_excel("top_30_mutual_funds_excel.xlsx")
df.head()


Unnamed: 0,scheme_name,min_sip,min_lumpsum,expense_ratio,fund_size_cr,fund_age_yr,fund_manager,sortino_ratio,alpha,standard_deviation,beta,sharpe,risk_level,risk_bucket,amc_name,rating,category,sub_category,returns_1yr,returns_3yr,returns_5yr,risk-adjusted return score,cost efficiency score,consistency score,fund stability,composite_score
0,Quant Active Fund,1000,5000,0.58,3531,10,Sanjeev Sharma,3.33,13.82,18.236,0.97,1.87,6,High Risk,Quant Mutual Fund,0,Equity,Multi Cap Funds,1.0,45.5,19.9,0.07,1.724,23.25,2047.98,7.684
1,Kotak Multi Asset Allocator FoF – Dynamic – Di...,1000,5000,0.13,783,10,Devender Singhal,4.33,8.04,5.786,0.7,1.91,5,Moderate,Kotak Mahindra Mutual Fund,0,Other,FoFs Domestic,11.2,25.0,15.3,0.087,7.692,18.1,101.79,7.233
2,Tata Digital India Fund,150,5000,0.31,6765,7,Meeta Shetty,2.29,6.11,23.003,0.92,1.37,6,High Risk,Tata Mutual Fund,0,Equity,Sectoral / Thematic Mutual Funds,-16.0,39.0,22.1,0.151,3.226,11.5,2097.15,4.378
3,Kotak India Growth Fund,0,5000,0.34,90,5,Devender Singhal,3.71,7.67,12.715,0.95,1.75,6,High Risk,Kotak Mahindra Mutual Fund,0,Equity,Multi Cap Funds,4.8,35.5,15.6,0.124,2.941,20.15,30.6,6.691
4,ICICI Pru Thematic Advantage Fund,1000,5000,0.25,958,10,Dharmesh Kakkad,4.13,6.63,12.549,0.98,1.63,6,High Risk,ICICI Prudential Mutual Fund,0,Other,FoFs Domestic,5.4,35.4,14.6,0.148,4.0,20.4,239.5,6.864


In [3]:
df.shape
df.columns.tolist()


['scheme_name',
 'min_sip',
 'min_lumpsum',
 'expense_ratio',
 'fund_size_cr',
 'fund_age_yr',
 'fund_manager',
 'sortino_ratio',
 'alpha',
 'standard_deviation',
 'beta',
 'sharpe',
 'risk_level',
 'risk_bucket',
 'amc_name',
 'rating',
 'category',
 'sub_category',
 'returns_1yr',
 'returns_3yr',
 'returns_5yr',
 'risk-adjusted return score',
 'cost efficiency score',
 'consistency score',
 'fund stability',
 'composite_score']

In [4]:
df = df.rename(columns={
    "risk-adjusted return score": "risk_adjusted_return_score",
    "cost efficiency score": "cost_efficiency_score",
    "consistency score": "consistency_score",
    "fund stability": "fund_stability"
})


In [5]:
required_cols = [
    "scheme_name",
    "returns_1yr",
    "returns_3yr",
    "returns_5yr",
    "sharpe",
    "sortino_ratio",
    "standard_deviation",
    "beta",
    "alpha",
    "expense_ratio",
    "fund_size_cr",
    "fund_age_yr",
    "cost_efficiency_score",
    "consistency_score",
    "fund_stability",
    "risk_bucket",
    "category"
]

missing = set(required_cols) - set(df.columns)
assert not missing, f"Missing columns: {missing}"


In [6]:
FEATURES = [
    "returns_1yr",
    "returns_3yr",
    "returns_5yr",
    "sharpe",
    "sortino_ratio",
    "standard_deviation",
    "beta",
    "alpha",
    "expense_ratio",
    "fund_size_cr",
    "fund_age_yr",
    "cost_efficiency_score",
    "consistency_score",
    "fund_stability"
]

X = df[FEATURES].copy()


In [7]:
X.describe()


Unnamed: 0,returns_1yr,returns_3yr,returns_5yr,sharpe,sortino_ratio,standard_deviation,beta,alpha,expense_ratio,fund_size_cr,fund_age_yr,cost_efficiency_score,consistency_score,fund_stability
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,7.263,41.147,16.316,1.945,4.069,14.75,0.869,11.373,0.551,6264.967,8.733,2.713,24.205,3989.24
std,7.34,10.867,3.822,0.2,0.926,5.611,0.309,5.019,0.246,7518.172,2.132,2.552,5.929,5681.525
min,-16.0,14.5,9.491,1.37,2.29,1.452,0.53,5.29,0.08,89.0,4.0,1.0,11.5,7.12
25%,3.75,35.425,13.925,1.88,3.598,12.703,0.75,7.762,0.37,971.75,9.0,1.357,20.6,576.84
50%,7.5,42.2,15.6,1.945,4.02,13.88,0.84,9.94,0.58,3300.5,10.0,1.724,24.0,2072.565
75%,11.05,45.0,19.325,2.075,4.405,17.215,0.917,14.068,0.738,8547.75,10.0,2.749,27.975,4748.167
max,23.2,71.4,23.2,2.3,7.27,27.881,2.36,27.24,1.0,29953.0,10.0,12.5,38.4,22464.75


In [8]:
from scipy.stats import zscore

df["z_returns_5yr"] = zscore(df["returns_5yr"])
df["z_sharpe"] = zscore(df["sharpe"])
df["z_std_dev"] = -zscore(df["standard_deviation"])  # negative: lower risk is better


In [10]:
df["target_score"] = (
    0.5 * df["z_returns_5yr"] +
    0.3 * df["z_sharpe"] +
    0.2 * df["z_std_dev"]
)


In [11]:
df[["returns_5yr", "sharpe", "standard_deviation", "target_score"]].corr()


Unnamed: 0,returns_5yr,sharpe,standard_deviation,target_score
returns_5yr,1.0,-0.004,0.488,0.77
sharpe,-0.004,1.0,0.082,0.54
standard_deviation,0.488,0.082,1.0,0.132
target_score,0.77,0.54,0.132,1.0


In [12]:
RISK_MAP = {
    "Low": ["Low Risk", "Moderately Low"],
    "Moderate": ["Moderate", "Moderately Low"],
    "High": ["High Risk", "Moderate"]
}


In [13]:
def filter_by_risk(df, user_risk):
    return df[df["risk_bucket"].isin(RISK_MAP[user_risk])]


In [14]:
X_train = df[FEATURES]
y_train = df["target_score"]
group = np.ones(len(df))  # single group (small dataset)


In [16]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
    --------------------------------------- 1.0/72.0 MB 5.0 MB/s eta 0:00:15
   - -------------------------------------- 2.1/72.0 MB 4.7 MB/s eta 0:00:15
   - -------------------------------------- 2.9/72.0 MB 4.5 MB/s eta 0:00:16
   -- ------------------------------------- 3.9/72.0 MB 4.8 MB/s eta 0:00:15
   -- ------------------------------------- 5.0/72.0 MB 4.7 MB/s eta 0:00:15
   --- ------------------------------------ 6.0/72.0 MB 4.9 MB/s eta 0:00:14
   --- ------------------------------------ 6.8/72.0 MB 4.7 MB/s eta 0:00:14
   ---- ----------------------------------- 8.1/72.0 MB 4.8 MB/s eta 0:00:14
   ---- ----------------------------------- 8.9/72.0 MB 4.7 MB/s eta 0:00:14
   ----- ---------------------------------- 10.0/72.0 MB 4.7 MB/s eta 0:00:14
   ------ --


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [28]:
from xgboost import XGBRanker

model = XGBRanker(
    objective="rank:pairwise",
    learning_rate=0.05,
    n_estimators=200,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_train, y_train, group=group)


In [30]:
def recommend_top_funds(df, user_risk, top_n=5):
    filtered = filter_by_risk(df, user_risk)
    
    scores = model.predict(filtered[FEATURES])
    filtered = filtered.copy()
    filtered["final_score"] = scores
    
    return (
        filtered
        .sort_values("final_score", ascending=False)
        .head(top_n)[
            ["scheme_name", "final_score", "returns_5yr", "risk_bucket"]
        ]
    )


In [34]:
recommend_top_funds(df, user_risk="Moderate", top_n=5)


Unnamed: 0,scheme_name,final_score,returns_5yr,risk_bucket
1,Kotak Multi Asset Allocator FoF – Dynamic – Di...,0.0,15.3,Moderate
14,ICICI Pru Asset Allocator Fund,0.0,12.8,Moderately Low


In [20]:
def sip_future_value(monthly_sip, annual_cagr, years):
    """
    monthly_sip: amount invested every month
    annual_cagr: expected CAGR (%)
    years: investment duration
    """
    r = annual_cagr / 100 / 12
    n = years * 12

    fv = monthly_sip * (((1 + r) ** n - 1) / r) * (1 + r)
    return round(fv, 2)


In [21]:
def lumpsum_future_value(amount, annual_cagr, years):
    r = annual_cagr / 100
    fv = amount * ((1 + r) ** years)
    return round(fv, 2)


In [22]:
def add_projection(
    df_reco,
    investment_type,
    amount,
    years
):
    df = df_reco.copy()

    if investment_type == "SIP":
        df["projected_value"] = df["returns_5yr"].apply(
            lambda r: sip_future_value(amount, r, years)
        )
    else:
        df["projected_value"] = df["returns_5yr"].apply(
            lambda r: lumpsum_future_value(amount, r, years)
        )

    df["investment_type"] = investment_type
    df["investment_years"] = years

    return df


In [23]:
top_funds = recommend_top_funds(
    df,
    user_risk="Moderate",
    top_n=5
)

final_output = add_projection(
    top_funds,
    investment_type="SIP",
    amount=5000,
    years=10
)

final_output


Unnamed: 0,scheme_name,final_score,returns_5yr,risk_bucket,projected_value,investment_type,investment_years
1,Kotak Multi Asset Allocator FoF – Dynamic – Di...,0.0,15.3,Moderate,1419330.14,SIP,10
14,ICICI Pru Asset Allocator Fund,0.0,12.8,Moderately Low,1218647.18,SIP,10
