In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Project root is the parent of the current working directory; strict=True
# makes resolve() raise if that directory does not exist.
# Assumes the notebook is launched from a direct subfolder of the project — TODO confirm.
ROOT_DIR = Path().resolve(strict=True).parent
# Kept as a Path and joined with "/" so no doubled separators ("data//interim")
# end up in the file names.
DATA_DIR = ROOT_DIR / "data"

# The first CSV column is used as the row index.
df = pd.read_csv(DATA_DIR / "interim" / "new_features.csv", index_col=0)
# short_df = pd.read_csv(DATA_DIR / "interim" / "new_features_drops.csv", index_col=0)

In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['Ticker', 'Fiscal Year', 'Revenue', 'Cost of Revenue', 'Net Income',
       'fcf', 'owners_earnings', 'roa', 'roe', 'roc', 'current_ratio',
       'quick_ratio', 'gross_margin', 'net_income_margin', 'fcf_margin',
       'owners_earnings_to_net_income', 'rd_to_net_income',
       'capex_to_net_income', 'net_income_growth_rate'],
      dtype='object')

In [4]:
# Build the regression target: each row receives the Revenue of the row below
# it, but only when that row belongs to the same ticker AND is the immediately
# following fiscal year. Checking the year as well as the ticker keeps a gap in
# a company's history from being labelled as "next year". Rows without a valid
# successor get NaN.
# Assumes rows are ordered by Ticker and then by Fiscal Year — TODO confirm.
next_revenue = df['Revenue'].shift(periods=-1)
same_ticker = df['Ticker'].eq(df['Ticker'].shift(periods=-1))
consecutive_year = df['Fiscal Year'].shift(periods=-1).eq(df['Fiscal Year'] + 1)
# .where() assigns the whole column in one step, so re-running the cell always
# produces the same result.
df['next_year_revenue'] = next_revenue.where(same_ticker & consecutive_year)

In [5]:
# Training window: fiscal years 2010 through 2017, both ends inclusive.
TRAIN_FIRST_YEAR, TRAIN_LAST_YEAR = 2010, 2017

in_train_window = df["Fiscal Year"].between(TRAIN_FIRST_YEAR, TRAIN_LAST_YEAR)
# Rows with no target value are dropped; one dropna is enough (the original
# applied the same call twice).
train_df = df[in_train_window].dropna(subset=['next_year_revenue'])

In [6]:
# Hold-out window: fiscal years 2018 and 2019 (inclusive), keeping only rows
# that have a target value.
in_test_window = df["Fiscal Year"].between(2018, 2019)
test_df = df.loc[in_test_window].dropna(subset=['next_year_revenue'])

In [7]:
# Remove the identifier columns from both splits so only numeric columns and
# the target remain.
# NOTE: this reassigns train_df/test_df in place of themselves, so running the
# cell a second time raises KeyError because the columns are already gone.
id_columns = ["Fiscal Year", "Ticker"]
train_df = train_df.drop(id_columns, axis=1)
test_df = test_df.drop(id_columns, axis=1)

In [8]:
# Separate the target column from the feature columns for each split.
target_column = 'next_year_revenue'

X_train = train_df.drop(columns=[target_column])
y_train = train_df[target_column]

X_test = test_df.drop(columns=[target_column])
y_test = test_df[target_column]

In [9]:
import xgboost

In [10]:
# Fit a gradient-boosted regressor with the library's default hyperparameters.
# NOTE(review): despite its name, `xg_classifier` holds an XGBRegressor; the
# name is kept because later cells refer to it.
xg_classifier = xgboost.XGBRegressor()
xg_classifier.fit(X_train, y_train)

In [None]:
import pickle

: 

In [None]:
# Persist the fitted model under <project root>/models/.
MODELS_DIR = f"{ROOT_DIR}/models/"
xg_classifier_file = f"{MODELS_DIR}xg_classifier.pkl"

# Create the target directory if it is missing so the dump does not fail with
# FileNotFoundError on a fresh checkout.
Path(MODELS_DIR).mkdir(parents=True, exist_ok=True)
# The context manager closes (and flushes) the file even if pickling raises;
# the original left the handle open.
with open(xg_classifier_file, "wb") as model_file:
    pickle.dump(xg_classifier, model_file)

: 

In [11]:
# Predict the target for every row of the hold-out features.
y_pred = xg_classifier.predict(X_test)

In [12]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [18]:
# Mean squared error of the hold-out predictions against the true targets.
mean_squared_error(y_test, y_pred)

471.97210848782345

In [13]:
# Mean squared error of the hold-out predictions against the true targets.
# NOTE(review): this cell repeats an identical call made earlier in the
# notebook, yet the saved outputs differ — presumably they come from different
# kernel runs. Re-run the notebook top to bottom and keep a single copy.
mean_squared_error(y_test, y_pred)

894.076124917269

In [14]:
# 10-fold cross-validation on the training split; folds are shuffled with a
# fixed seed so the partition is repeatable.
# No `scoring` is passed, so each score comes from the estimator's own default
# scorer (presumably R^2 for a regressor — confirm), not MSE.
kfold = KFold(shuffle=True, n_splits=10, random_state=7)
cross_val_score(estimator=xg_classifier, X=X_train, y=y_train, cv=kfold)

array([0.98714386, 0.95494821, 0.97265613, 0.97401824, 0.9851751 ,
       0.9499872 , 0.98469922, 0.99067378, 0.98761336, 0.98922217])

In [19]:
# 10-fold cross-validation on the training split with a fixed shuffle seed.
# NOTE(review): this cell duplicates an earlier one, but the saved scores are
# very different — presumably stale output from another run. Re-execute and
# keep only one copy.
kfold = KFold(shuffle=True, n_splits=10, random_state=7)
cross_val_score(estimator=xg_classifier, X=X_train, y=y_train, cv=kfold)

array([0.0329346 , 0.72470693, 0.75455996, 0.70680705, 0.70812848,
       0.71050177, 0.73185166, 0.77806023, 0.78244984, 0.74488603])

In [15]:
# Slice out every row for the MSFT ticker and display it.
is_msft = df["Ticker"] == "MSFT"
msft_df = df[is_msft]
msft_df

Unnamed: 0,Ticker,Fiscal Year,Revenue,Cost of Revenue,Net Income,fcf,owners_earnings,roa,roe,roc,current_ratio,quick_ratio,gross_margin,net_income_margin,fcf_margin,owners_earnings_to_net_income,rd_to_net_income,capex_to_net_income,net_income_growth_rate,next_year_revenue
12171,MSFT,2001,252.96,-34.55,73.46,190.08,-49.12,0.12,0.16,0.2,4.24,3.81,0.86,0.29,0.75,-0.67,-0.6,0.0,12.03,283.65
12172,MSFT,2002,283.65,-51.91,78.29,190.81,-59.03,0.12,0.15,0.18,3.81,3.44,0.82,0.28,0.67,-0.75,-0.55,0.01,0.07,321.87
12173,MSFT,2003,321.87,-60.59,75.31,210.2,-74.84,0.09,0.12,0.12,4.22,3.88,0.81,0.23,0.65,-0.99,-0.88,0.01,-0.04,368.35
12174,MSFT,2004,368.35,-65.96,81.68,169.9,-47.6,0.09,0.11,0.1,4.71,4.44,0.82,0.22,0.46,-0.58,-0.95,-0.01,0.08,397.88
12175,MSFT,2005,397.88,-60.31,122.54,576.83,-415.69,0.17,0.25,0.21,2.89,2.66,0.85,0.31,1.45,-3.39,-0.5,-0.0,0.5,442.82
12176,MSFT,2006,442.82,-76.5,125.99,349.66,-250.89,0.18,0.31,0.24,2.18,1.94,0.83,0.28,0.79,-1.99,-0.52,-0.06,0.03,511.22
12177,MSFT,2007,511.22,-106.93,140.65,423.4,-308.22,0.22,0.45,0.29,1.69,1.46,0.79,0.28,0.83,-2.19,-0.51,-0.09,0.12,604.2
12178,MSFT,2008,604.2,-115.98,176.81,345.46,-196.32,0.24,0.49,0.31,1.45,1.25,0.81,0.29,0.57,-1.11,-0.46,-0.11,0.26,584.37
12179,MSFT,2009,584.37,-121.55,145.69,265.0,-184.98,0.19,0.37,0.26,1.82,1.58,0.79,0.25,0.45,-1.27,-0.62,-0.09,-0.18,624.84
12180,MSFT,2010,624.84,-123.95,187.6,373.64,-226.83,0.22,0.41,0.28,2.13,1.9,0.8,0.3,0.6,-1.21,-0.46,-0.01,0.29,699.43


In [16]:
# Take MSFT's fiscal-year-2019 row, keep the known target aside for comparison,
# and leave only the feature columns for the model.
msft_2019 = msft_df.loc[msft_df["Fiscal Year"] == 2019]
next_year_revenue = msft_2019["next_year_revenue"]
to_pred = msft_2019.drop(columns=["Fiscal Year", "Ticker", "next_year_revenue"])
to_pred

Unnamed: 0,Revenue,Cost of Revenue,Net Income,fcf,owners_earnings,roa,roe,roc,current_ratio,quick_ratio,gross_margin,net_income_margin,fcf_margin,owners_earnings_to_net_income,rd_to_net_income,capex_to_net_income,net_income_growth_rate
12189,1258.43,-429.1,392.4,890.72,17.57,0.14,0.38,0.15,2.53,2.35,0.66,0.31,0.71,0.04,-0.43,-0.18,1.37


In [17]:
# Actual target value for the selected row, shown for comparison with the prediction.
next_year_revenue

12189    1430.15
Name: next_year_revenue, dtype: float64

In [22]:
# Actual target value for the selected row.
# NOTE(review): the saved output of this cell is a Series named
# `next_year_net_income`, which no code in this notebook creates — it looks
# like stale output from a different run. Re-execute or remove the cell.
next_year_revenue

12189    442.81
Name: next_year_net_income, dtype: float64

In [18]:
# Model prediction for the single feature row prepared in `to_pred`.
pr = xg_classifier.predict(to_pred)

In [19]:
# Display the predicted value.
pr

array([1259.0944], dtype=float32)

In [24]:
# Display the predicted value.
# NOTE(review): duplicate of an earlier display cell with a different saved
# output — presumably from another run; re-execute and keep one copy.
pr

array([353.56732], dtype=float32)