In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


def train_model(
    data_path="https://raw.githubusercontent.com/elleobrien/wine/master/wine_quality.csv",
    y_var="quality",
    split_ratio=0.2,
    seed=42,
):
    """Fit a shallow random-forest regressor and summarise how it performs.

    The CSV at ``data_path`` is loaded, the ``y_var`` column is split off as
    the target, and the remaining columns are used as features. The data is
    divided into train/test partitions, a ``RandomForestRegressor`` with
    ``max_depth=2`` is fitted on the training partition, and the scores,
    feature importances and (jittered) test-set predictions are returned.

    Parameters
    ----------
    data_path : str or file-like
        Anything ``pandas.read_csv`` accepts (URL, path, open buffer).
    y_var : str
        Name of the target column; it is removed from the feature frame.
    split_ratio : float
        Forwarded to ``train_test_split`` as ``test_size``.
    seed : int, None, numpy RandomState or Generator
        Forwarded as ``random_state`` to the split and to the forest, and
        also used to seed the jitter added to the residual data, so that
        repeated calls with the same seed return identical results.

    Returns
    -------
    dict
        ``"residuals_data"``: ``{"true": [...], "pred": [...]}`` for the test
        partition. Both lists carry independent Gaussian jitter
        (mean 0, std 0.25) so that overlapping points separate when plotted;
        they are therefore not the exact targets/predictions.
        ``"feature_importance_data"``: ``{"feature": [...], "importance": [...]}``
        sorted by descending importance.
        ``"train_score"`` / ``"test_score"``: the regressor's ``score`` on each
        partition multiplied by 100.

    Raises
    ------
    KeyError
        If ``y_var`` is not a column of the loaded data.
    """
    df = pd.read_csv(data_path)
    y = df.pop(y_var)  # after this, df holds only the feature columns

    X_train, X_test, y_train, y_test = train_test_split(
        df, y, test_size=split_ratio, random_state=seed
    )

    regr = RandomForestRegressor(max_depth=2, random_state=seed)
    regr.fit(X_train, y_train)

    train_score = regr.score(X_train, y_train) * 100
    test_score = regr.score(X_test, y_test) * 100

    feature_df = pd.DataFrame(
        {"feature": df.columns, "importance": regr.feature_importances_}
    ).sort_values(by="importance", ascending=False)

    # Draw the jitter from a generator tied to `seed` rather than from the
    # global numpy state; otherwise the residual data differs on every call
    # even though the split and the model are seeded.
    if isinstance(seed, (np.random.RandomState, np.random.Generator)):
        rng = seed
    else:
        rng = np.random.default_rng(seed)

    n_test = len(y_test)
    # Same draw order as before: noise for the predictions first, then for
    # the true values.
    y_pred = regr.predict(X_test) + rng.normal(0, 0.25, n_test)
    y_jitter = np.asarray(y_test) + rng.normal(0, 0.25, n_test)
    res_df = pd.DataFrame({"true": y_jitter, "pred": y_pred})

    return {
        "residuals_data": res_df.to_dict(orient="list"),
        "feature_importance_data": feature_df.to_dict(orient="list"),
        "train_score": train_score,
        "test_score": test_score,
    }

In [6]:
# Train with train_model's default arguments and keep the returned dict for later cells.
model_results = train_model()


Unnamed: 0,true,pred
0,5.688060,5.159195
1,4.635269,5.254010
2,5.395350,5.832371
3,4.782045,5.137277
4,5.805896,5.403091
...,...,...
315,5.658027,5.584857
316,5.014099,5.156607
317,5.252232,4.898530
318,6.424065,6.223153


In [3]:
# Show the jittered true/pred pairs from the training run as a table.
pd.DataFrame(model_results["residuals_data"])