In [1]:
# long_term_recommender.py
"""
Simple long-term investment recommender.
- Builds training data from historical S&P500 tickers (Wikipedia table).
- Fetches price history + some fundamentals via yfinance.
- Labels: forward 3-year annualized return > THRESHOLD => good long-term invest.
- Trains a RandomForest classifier with time-based split.
- Exposes recommend_ticker(ticker) to return probability + recommendation.
"""
import warnings
warnings.filterwarnings("ignore")

import datetime as dt
import numpy as np
import pandas as pd
import yfinance as yf
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import joblib
import requests

# PARAMETERS
FORWARD_YEARS = 3
RETURN_THRESHOLD_ANNUAL = 0.08  # 8% annualized -> label=1 if forward annualized > 8%
MIN_HISTORY_YEARS = 5
TRAIN_CUTOFF_YEAR = 2018  # train on data before this year, test on >= this year
MODEL_PATH = "lt_recommender.joblib"

######## utilities ########
def get_sp500_tickers():
    # pulls the S&P500 table from Wikipedia
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    tables = pd.read_html(url)
    # first table is the constituents
    df = tables[0]
    return sorted(df['Symbol'].str.replace('.', '-', regex=False).tolist())

def download_price_history(ticker, start, end):
    # returns daily price df
    return yf.download(ticker, start=start, end=end, progress=False)

def get_info(ticker):
    t = yf.Ticker(ticker)
    return t.info if hasattr(t, "info") else {}

def compute_features_for_date(ticker, reference_date):
    """
    reference_date: pd.Timestamp or datetime. compute features looking BACK from this date.
    """
    end = reference_date
    start = end - pd.DateOffset(years=5)  # 5 years history for features
    hist = download_price_history(ticker, start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d"))
    if hist.empty or len(hist) < 30:
        return None
    # returns
    hist['ret'] = hist['Adj Close'].pct_change()
    # feature examples
    # 1-year and 3-year momentum using trailing returns (approx using business days)
    try:
        price_now = hist['Adj Close'].iloc[-1]
    except Exception:
        return None
    # get price 252 trading days (~1 year) ago and 252*3 for ~3 years, fallback to nearest
    def trailing_return(days):
        if len(hist) <= days: return np.nan
        return price_now / hist['Adj Close'].iloc[-(days+1)] - 1
    momentum_1y = trailing_return(252)
    momentum_3y = trailing_return(252*3)
    vol_1y = hist['ret'].std() * np.sqrt(252)
    avg_vol_1y = hist['Volume'].tail(252).mean()
    info = get_info(ticker)
    pe = info.get('trailingPE', np.nan)
    pb = info.get('priceToBook', np.nan) if 'priceToBook' in info else info.get('priceToSales', np.nan)
    div_yield = info.get('dividendYield', np.nan)
    marketcap = info.get('marketCap', np.nan)
    # basic features into a vector
    features = {
        'ticker': ticker,
        'date': pd.to_datetime(end),
        'momentum_1y': momentum_1y,
        'momentum_3y': momentum_3y,
        'vol_1y': vol_1y,
        'avg_vol_1y': avg_vol_1y,
        'pe': pe,
        'pb': pb,
        'div_yield': div_yield,
        'marketcap': marketcap
    }
    return features

def forward_annualized_return(ticker, reference_date, years=FORWARD_YEARS):
    start_date = pd.to_datetime(reference_date)
    end_date = start_date + pd.DateOffset(years=years)
    hist = download_price_history(ticker, start_date.strftime("%Y-%m-%d"), (end_date + pd.DateOffset(days=1)).strftime("%Y-%m-%d"))
    if hist.empty or len(hist) < 5:
        return np.nan
    price_start = hist['Adj Close'].iloc[0]
    price_end = hist['Adj Close'].iloc[-1]
    total_return = price_end / price_start - 1.0
    ann = (1+total_return) ** (1.0/years) - 1.0
    return ann

######## building dataset ########
def build_dataset(sample_tickers=None, start_year=2005, end_year=2019, step_months=12):
    """
    sample_tickers: list of tickers or None -> use S&P500 constituents.
    For each ticker and for each reference_date in range(start_year..end_year),
    compute features and label (forward 3-year ann return > threshold).
    """
    if sample_tickers is None:
        sample_tickers = get_sp500_tickers()
    rows = []
    for ticker in sample_tickers:
        # generate yearly reference dates from start_year to end_year
        for year in range(start_year, end_year+1):
            ref = pd.Timestamp(year=year, month=1, day=2)  # Jan 2 of each year as anchor
            features = compute_features_for_date(ticker, ref)
            if features is None:
                continue
            fwd = forward_annualized_return(ticker, ref, years=FORWARD_YEARS)
            if np.isnan(fwd):
                continue
            label = int(fwd > RETURN_THRESHOLD_ANNUAL)
            features['fwd_ann_return'] = fwd
            features['label'] = label
            rows.append(features)
    df = pd.DataFrame(rows)
    # drop rows with too many nans
    df = df.dropna(thresh=4)
    return df

######## training ########
def train_model(df):
    X = df[['momentum_1y','momentum_3y','vol_1y','avg_vol_1y','pe','pb','div_yield','marketcap']].fillna(0)
    y = df['label']
    # time-based split
    train_mask = df['date'].dt.year < TRAIN_CUTOFF_YEAR
    X_train, y_train = X[train_mask], y[train_mask]
    X_test, y_test = X[~train_mask], y[~train_mask]
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))
    ])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    probs = pipe.predict_proba(X_test)[:,1]
    print("TEST METRICS:")
    print(classification_report(y_test, preds))
    try:
        print("ROC AUC:", roc_auc_score(y_test, probs))
    except Exception:
        pass
    joblib.dump(pipe, MODEL_PATH)
    print("Saved model to", MODEL_PATH)
    return pipe

######## recommend function ########
def load_or_train_model(df=None):
    try:
        model = joblib.load(MODEL_PATH)
        print("Loaded model from", MODEL_PATH)
        return model
    except Exception:
        if df is None:
            raise RuntimeError("No saved model found; pass a dataset to train.")
        return train_model(df)

def recommend_ticker(ticker, model=None):
    # fetch features for today (or last market day)
    today = pd.Timestamp(dt.date.today())
    features = compute_features_for_date(ticker, today)
    if features is None:
        return {"error": "Not enough data for ticker"}
    X = pd.DataFrame([[
        features['momentum_1y'],
        features['momentum_3y'],
        features['vol_1y'],
        features['avg_vol_1y'],
        features['pe'],
        features['pb'],
        features['div_yield'],
        features['marketcap']
    ]], columns=['momentum_1y','momentum_3y','vol_1y','avg_vol_1y','pe','pb','div_yield','marketcap']).fillna(0)
    if model is None:
        model = joblib.load(MODEL_PATH)
    prob = model.predict_proba(X)[0,1]
    rec = "Invest (long-term)" if prob > 0.6 else ("Neutral" if prob > 0.45 else "Do not invest")
    return {
        "ticker": ticker,
        "probability_good_long_term": float(prob),
        "recommendation": rec,
        "features_snapshot": features
    }

######## example main ########
if __name__ == "__main__":
    print("Building dataset (this may take a while; you can limit tickers)...")
    # Warning: building the dataset for all S&P500 tickers is network heavy.
    # For quick testing: pass sample_tickers = ['AAPL','MSFT','AMZN','GOOGL','TSLA']
    df = build_dataset(sample_tickers=['AAPL','MSFT','AMZN','GOOGL','TSLA'],
                       start_year=2008, end_year=2018)
    print("Dataset rows:", len(df))
    model = train_model(df)
    # example recommend
    print("Example recommendation for AAPL:")
    print(recommend_ticker("AAPL", model=model))


Building dataset (this may take a while; you can limit tickers)...



1 Failed download:
['AAPL']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')

1 Failed download:
['AAPL']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')

1 Failed download:
['AAPL']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')

1 Failed download:
['AAPL']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')

1 Failed download:
['AAPL']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')

1 Failed download:
['AAPL']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')

1 Failed download:
['AAPL']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')

1 Failed download:
['AAPL']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')

1 Failed download:
['AAPL']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')

1 Failed download:
['AAPL']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')

1 Failed download:
['AAPL']: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')

Dataset rows: 0


KeyError: "None of [Index(['momentum_1y', 'momentum_3y', 'vol_1y', 'avg_vol_1y', 'pe', 'pb',\n       'div_yield', 'marketcap'],\n      dtype='object')] are in the [columns]"