In [1]:
%load_ext autoreload
%autoreload 2

In [100]:
import pandas as pd
import numpy as np

In [167]:
from Company_Valuation.trainer import Trainer
from Company_Valuation.data import get_data, clean_data, holdout
from Company_Valuation.utils import vectorize

In [None]:
# Scalers
from sklearn.preprocessing import RobustScaler
# Models
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor
# Optimisation
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV

# Basic Models

In [156]:
# Scalers
r_scaler = RobustScaler()
# Models
lin_reg = LinearRegression()
ridge = Ridge()
lasso = Lasso()
knn = KNeighborsRegressor()
rf = RandomForestRegressor()
gb = GradientBoostingRegressor()
ada = AdaBoostRegressor()
svr = SVR()
xgb = XGBRegressor()

In [150]:
trainer = Trainer()

In [159]:
models = [lin_reg, ridge, lasso, knn, rf, gb, ada, svr, xgb]
model_names = ['lin_reg', 'ridge', 'lasso', 'knn', 'rf', 'gb', 'ada', 'svr', 'xgb']
for i in range(len(models)):
    print(f'Score for {model_names[i]}:')
    pipe = trainer.train(models[i])

Score for lin_reg:
Cross Validated RMSE = 1111729.5496765822
Score for ridge:
Cross Validated RMSE = 15735.881244311855
Score for lasso:
Cross Validated RMSE = 15790.265524798926
Score for knn:
Cross Validated RMSE = 19526.807692390863
Score for rf:
Cross Validated RMSE = 11389.349961603324
Score for gb:
Cross Validated RMSE = 11151.68627058389
Score for ada:
Cross Validated RMSE = 22761.25973370383
Score for svr:
Cross Validated RMSE = 24423.765822392415
Score for xgb:
Cross Validated RMSE = 11410.67027239213


In [161]:
models = [lin_reg, ridge, lasso, knn, rf, gb, ada, svr, xgb]
model_names = ['lin_reg', 'ridge', 'lasso', 'knn', 'rf', 'gb', 'ada', 'svr', 'xgb']
for i in range(len(models)):
    print(f'Score for {model_names[i]}:')
    pipe = trainer.train(models[i], remove_features=['description'])

Score for lin_reg:
Training without ['description']
Cross Validated RMSE = 15727.942948490663
Score for ridge:
Training without ['description']
Cross Validated RMSE = 15727.906407395392
Score for lasso:
Training without ['description']
Cross Validated RMSE = 15727.981347507857
Score for knn:
Training without ['description']
Cross Validated RMSE = 19526.35193766989
Score for rf:
Training without ['description']
Cross Validated RMSE = 11162.445910476881
Score for gb:
Training without ['description']
Cross Validated RMSE = 11203.229262009605
Score for ada:
Training without ['description']
Cross Validated RMSE = 19304.239963543343
Score for svr:
Training without ['description']
Cross Validated RMSE = 24423.640963518574
Score for xgb:
Training without ['description']
Cross Validated RMSE = 12142.33597680596


# Feature Permutation

In [176]:
df = get_data()
df = clean_data(df)
df = df.drop(columns=['description'])
X_train, X_test, y_train, y_test = holdout(df)

In [178]:
rf_pipe = trainer.train(rf, remove_features=['description'])

permutation_score = permutation_importance(rf_pipe, X_train, y_train, n_repeats=100) # Perform Permutation

Training without ['description']
Cross Validated RMSE = 11188.604848329916


In [177]:
importance_df = pd.DataFrame(np.vstack((X_train.columns,
                                        permutation_score.importances_mean)).T) # Unstack results
importance_df.columns=['feature','score decrease']

importance_df.sort_values(by="score decrease", ascending = False) 

Training without ['description']
Cross Validated RMSE = 11022.421831103338


ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 10 and the array at index 1 has size 9

# Hyperparameter Optimisation

In [None]:
hyperparams = {}

def grid_search(model, params):
    search = GridSearchCV(model, params, scoring='neg_mean_squared_error', cv=5, verbose=1)
    search.fit(X_train, y_train)
    hyperparams[search.best_params_] = search.best_score_