In [1]:
import pandas as pd

In [2]:
data=pd.read_csv("synnaxdata/train.csv")
test=pd.read_csv("synnaxdata/test.csv")

In [3]:
data.head()

Unnamed: 0,Id,industry,sector,fullTimeEmployees,auditRisk,boardRisk,compensationRisk,shareHolderRightsRisk,overallRisk,trailingPE,...,Q10_TOTAL_STOCKHOLDERS_EQUITY,Q10_NET_INCOME,Q10_GROSS_PROFIT,Q10_COST_OF_REVENUES,Q10_REVENUES,Q10_OPERATING_INCOME,Q10_OPERATING_EXPENSES,Q10_EBITDA,Q10_DEPRECIATION_AND_AMORTIZATION,Q10_fiscal_year_end
0,196,Personal Services,Consumer Cyclical,1174.0,5.0,10.0,9.0,4.0,8.0,12.549223,...,240502000.0,,23171000.0,54319000.0,40732000.0,2150000.0,5946000.0,17225000.0,15075000.0,0.0
1,1568,Building Products & Equipment,Industrials,3600.0,4.0,4.0,3.0,4.0,3.0,1222.0,...,326538000.0,-71929000.0,410574000.0,692688000.0,1103262000.0,-39287000.0,333229000.0,77345000.0,116632000.0,0.0
2,1218,,Unknown,,,,,,,,...,40663000000.0,603000000.0,1649000000.0,4587000000.0,6236000000.0,663000000.0,214000000.0,1435000000.0,772000000.0,0.0
3,23,Scientific & Technical Instruments,Technology,143.0,,,,,,,...,2758.0,-9715.0,0.0,,,-9683.0,9683.0,-9683.0,0.0,0.0
4,783,Drug Manufacturers - Specialty & Generic,Healthcare,36.0,,,,,,1.231544,...,25924000.0,-4676000.0,12050000.0,0.0,12050000.0,-1594000.0,13644000.0,-1594000.0,0.0,0.0


In [4]:
data.shape
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1624 entries, 0 to 1623
Columns: 212 entries, Id to Q10_fiscal_year_end
dtypes: float64(207), int64(1), object(4)
memory usage: 2.6+ MB


In [5]:
targets=[c for c in data.columns if c.startswith("Q0_")]
features=[c for c in data.columns if c not in targets+["Id"]]

In [6]:
def get_qnum(col):
    return int(col.split("_")[0][1:])

In [7]:
ebitda_columns=[c for c in data.columns
              if c.endswith("EBITDA") and c.startswith("Q") and not c.startswith("Q0_")]

ebitda_columns=sorted(ebitda_columns,key=get_qnum)

print(ebitda_columns)

['Q1_EBITDA', 'Q2_EBITDA', 'Q3_EBITDA', 'Q4_EBITDA', 'Q5_EBITDA', 'Q6_EBITDA', 'Q7_EBITDA', 'Q8_EBITDA', 'Q9_EBITDA', 'Q10_EBITDA']


In [8]:
if len(ebitda_columns)>0:
    data["EBITDA_LAST"]=data[ebitda_columns[0]]
    data["EBITDA_MEAN3"]=data[ebitda_columns[:3]].mean(axis=1)
    data["EBITDA_MEAN"]=data[ebitda_columns].mean(axis=1)

    if len(ebitda_columns)>=2:
        data["EBITDA_TREND"]=data[ebitda_columns[0]]-data[ebitda_columns[-1]]

In [9]:
revenue_columns=[c for c in data.columns
                 if "REVENUES" in c and "COST" not in c and c.startswith("Q") and not c.startswith("Q0_")]
revenue_columns=sorted(revenue_columns,key=get_qnum)
print(revenue_columns)

['Q1_REVENUES', 'Q2_REVENUES', 'Q3_REVENUES', 'Q4_REVENUES', 'Q5_REVENUES', 'Q6_REVENUES', 'Q7_REVENUES', 'Q8_REVENUES', 'Q9_REVENUES', 'Q10_REVENUES']


In [10]:
assets_columns=[c for c in data.columns
                if "TOTAL_ASSETS" in c and c.startswith("Q") and not c.startswith("Q0_")]
assets_columns=sorted(assets_columns,key=get_qnum)
print(assets_columns)

['Q1_TOTAL_ASSETS', 'Q2_TOTAL_ASSETS', 'Q3_TOTAL_ASSETS', 'Q4_TOTAL_ASSETS', 'Q5_TOTAL_ASSETS', 'Q6_TOTAL_ASSETS', 'Q7_TOTAL_ASSETS', 'Q8_TOTAL_ASSETS', 'Q9_TOTAL_ASSETS', 'Q10_TOTAL_ASSETS']


In [11]:
if len(ebitda_columns)>0 and len(revenue_columns)>0:
    data["EBITDA_MARGINLAST"]=data[ebitda_columns[0]]/(data[revenue_columns[0]]+1e-6)

if len(ebitda_columns)>0 and len(assets_columns)>0:
    data["EBITDA_TO_ASSETSLAST"]=data[ebitda_columns[0]]/(data[assets_columns[0]]+1e-6)

In [None]:
print(data[targets].isna().mean().sort_values(ascending=False).head(20))
print(data[features].isna().mean().sort_values(ascending=False).head(20))

In [None]:
data[targets].describe().T

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

for col in targets:
    plt.figure(figsize=(3,2))
    sns.histplot(data[col],bins=50,kde=True,color='blueviolet')
    plt.title(col)
    plt.show()

In [None]:
import numpy as np
for col in targets:
    plt.figure(figsize=(3,2))
    values=data[col].dropna()
    values=values[values>0]
    sns.histplot(np.log(values),bins=50,kde=True,color='blueviolet')
    plt.title(col+"(log)")
    plt.show()

In [12]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import OneHotEncoder

idcol=data["Id"]
numeric=data.select_dtypes(include=["number"]).columns.difference(targets).difference(idcol)
categorical=data.select_dtypes(include=["object","category"]).columns

numeric_imputer=SimpleImputer(strategy="median",add_indicator=True)
categorical_encoder=OneHotEncoder(handle_unknown="ignore")

In [None]:
includes_inf=np.isinf(data[numeric]).any()
print("Any information values?\n",includes_inf[includes_inf])

max_values=data[numeric].max().sort_values(ascending=False)
print(max_values.head(20))

In [13]:
import numpy as np 

data[numeric]=data[numeric].replace([np.inf,-np.inf],np.nan)

In [14]:
from sklearn.pipeline import Pipeline

preprocess=ColumnTransformer(
    transformers=[
        ("num",numeric_imputer,numeric),
        ("cat",categorical_encoder,categorical)])

In [15]:
from lightgbm import LGBMRegressor

base_model=LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    random_state=42)

In [17]:
from sklearn.multioutput import MultiOutputRegressor

multi_model=Pipeline(
    steps=[
        ("preprocess",preprocess),
        ("regressor",MultiOutputRegressor(base_model))])

In [18]:
X_train=data[numeric.tolist()+categorical.tolist()]
y_train=data[targets]

In [19]:
multi_model.fit(X_train,y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005573 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 47614
[LightGBM] [Info] Number of data points in the train set: 1624, number of used features: 367
[LightGBM] [Info] Start training from score 4797690659.686576
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009366 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 47614
[LightGBM] [Info] Number of data points in the train set: 1624, number of used features: 367
[LightGBM] [Info] Start training from score 3200071036.821043
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005948 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 47614
[LightGBM] [Info] Number of data points in the train set: 1624, number of used features: 367
[Light

In [20]:
from sklearn.model_selection import KFold,cross_val_score

kf=KFold(n_splits=5,shuffle=True,random_state=42)

scores=cross_val_score(
    multi_model,
    X_train,
    y_train,
    cv=kf,
    scoring="r2",
    n_jobs=-1)

print("R2 per fold:",scores)
print("Mean R2:",scores.mean())
print("Std R2:", scores.std())



R2 per fold: [0.7108924  0.59678129 0.7030768  0.7600643  0.51257999]
Mean R2: 0.6566789560980835
Std R2: 0.08959088015225836




In [None]:
from sklearn.base import clone
from sklearn.metrics import r2_score

X=X_train
y=y_train

all_r2=[]

for fold,(data,test) in enumerate(kf.split(X),start=1):
    X_tr,X_test=X.iloc[data],X.iloc[test]
    y_tr,y_test=y.iloc[data],y.iloc[test]

    model=clone(multi_model)
    model.fit(X_tr,y_tr)

    y_pred=model.predict(X_test)

    r2=r2_score(y_test,y_pred)
    all_r2.append(r2)

    print(f"Fold {fold}: R2={r2:.4f}")

print("Mean R2:",np.mean(all_r2))
print("Std R2:",np.std(all_r2))

In [None]:
r2_pertarget=r2_score(y_test,y_pred,multioutput="raw_values")
print(dict(zip(targets,r2_pertarget)))

In [21]:
teb="Q0_EBITDA"
y_ebitda=data[teb]

ebitda_model=Pipeline(
    steps=[
        ("preprocess",preprocess),
        ("model",LGBMRegressor(
            n_estimators=800,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.9,
            colsample_bytree=0.9,
            random_state=42))])

In [22]:
ebitdascores=cross_val_score(
    ebitda_model,
    X_train,
    y_ebitda,
    cv=kf,
    scoring="r2",
    n_jobs=-1)

print("R2 per fold:",ebitdascores)
print("Mean R2:",ebitdascores.mean())
print("Std R2:",ebitdascores.std())



R2 per fold: [ 0.36720583  0.20935131 -0.15935422  0.59418954 -0.1382866 ]
Mean R2: 0.1746211707791568
Std R2: 0.2911300350744532




In [27]:
from sklearn.metrics import make_scorer,r2_score

y_ebitda_arcsinh=np.arcsinh(y_ebitda)

def r2_reverter(y_true_t,y_pred_t):
    y_true=np.sinh(y_true_t)
    y_pred=np.sinh(y_pred_t)
    return r2_score(y_true,y_pred)

r2_arcsinh_scorer=make_scorer(r2_reverter,greater_is_better=True)

asebitdascores=cross_val_score(
    ebitda_model,
    X_train,
    y_ebitda_arcsinh,
    cv=kf,
    scoring=r2_arcsinh_scorer,
    n_jobs=-1)

print("R2 per fold:",asebitdascores)
print("Mean R2:",asebitdascores.mean())
print("Std R2:",asebitdascores.std())



R2 per fold: [-1.42806271e+02 -1.08183872e+01 -7.69666395e-01 -2.51375624e+00
 -5.08605846e+04]
Mean R2: -10203.498539790682
Std R2: 20328.613690928618


