In [1]:
import pandas as pd

In [2]:
dataset = pd.read_csv('50_Startups.csv')

In [3]:
dataset = pd.get_dummies(dataset, dtype = int, drop_first = True)

In [4]:
dataset.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'Profit',
       'State_Florida', 'State_New York'],
      dtype='object')

In [5]:
independent = dataset[['R&D Spend', 'Administration', 'Marketing Spend', 'State_Florida', 'State_New York']]

In [6]:
dependent = dataset[['Profit']]

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
x_train, x_test, y_train, y_test = train_test_split(independent, dependent, test_size = 0.3, random_state = 0)

In [9]:
test_scenarios = [
    {
        'test_id' : 'Test001',
        'n_estimators' : 100, 
        'criterion' : 'squared_error',
        'max_features' : 'sqrt'
    },
    {
        'test_id' : 'Test002',
        'n_estimators' : 100, 
        'criterion' : 'squared_error',
        'max_features' : 'log2'
    },
    {
        'test_id' : 'Test003',
        'n_estimators' : 100, 
        'criterion' : 'squared_error',
        'max_features' : None
    },
    {
        'test_id' : 'Test004',
        'n_estimators' : 100, 
        'criterion' : 'absolute_error',
        'max_features' : 'sqrt'
    },{
        'test_id' : 'Test005',
        'n_estimators' : 100, 
        'criterion' : 'absolute_error',
        'max_features' : 'log2'
    },
    {
        'test_id' : 'Test006',
        'n_estimators' : 100, 
        'criterion' : 'absolute_error',
        'max_features' : None
    },
    {
        'test_id' : 'Test007',
        'n_estimators' : 100, 
        'criterion' : 'friedman_mse',
        'max_features' : 'sqrt'
    },{
        'test_id' : 'Test008',
        'n_estimators' : 100, 
        'criterion' : 'friedman_mse',
        'max_features' : 'log2'
    },
    {
        'test_id' : 'Test009',
        'n_estimators' : 100, 
        'criterion' : 'friedman_mse',
        'max_features' : None
    },
    {
        'test_id' : 'Test010',
        'n_estimators' : 100, 
        'criterion' : 'poisson',
        'max_features' : 'sqrt'
    },{
        'test_id' : 'Test011',
        'n_estimators' : 100, 
        'criterion' : 'poisson',
        'max_features' : 'log2'
    },
    {
        'test_id' : 'Test012',
        'n_estimators' : 100, 
        'criterion' : 'poisson',
        'max_features' : None
    },


    {
        'test_id' : 'Test001',
        'n_estimators' : 50, 
        'criterion' : 'squared_error',
        'max_features' : 'sqrt'
    },
    {
        'test_id' : 'Test002',
        'n_estimators' : 50, 
        'criterion' : 'squared_error',
        'max_features' : 'log2'
    },
    {
        'test_id' : 'Test003',
        'n_estimators' : 50, 
        'criterion' : 'squared_error',
        'max_features' : None
    },
    {
        'test_id' : 'Test004',
        'n_estimators' : 50, 
        'criterion' : 'absolute_error',
        'max_features' : 'sqrt'
    },{
        'test_id' : 'Test005',
        'n_estimators' : 50, 
        'criterion' : 'absolute_error',
        'max_features' : 'log2'
    },
    {
        'test_id' : 'Test006',
        'n_estimators' : 50, 
        'criterion' : 'absolute_error',
        'max_features' : None
    },
    {
        'test_id' : 'Test007',
        'n_estimators' : 50, 
        'criterion' : 'friedman_mse',
        'max_features' : 'sqrt'
    },{
        'test_id' : 'Test008',
        'n_estimators' : 50, 
        'criterion' : 'friedman_mse',
        'max_features' : 'log2'
    },
    {
        'test_id' : 'Test009',
        'n_estimators' : 50, 
        'criterion' : 'friedman_mse',
        'max_features' : None
    },
    {
        'test_id' : 'Test010',
        'n_estimators' : 50, 
        'criterion' : 'poisson',
        'max_features' : 'sqrt'
    },{
        'test_id' : 'Test011',
        'n_estimators' : 50, 
        'criterion' : 'poisson',
        'max_features' : 'log2'
    },
    {
        'test_id' : 'Test012',
        'n_estimators' : 50, 
        'criterion' : 'poisson',
        'max_features' : None
    }
]

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

def run_test_scenarios(x_train, x_test, y_train, y_test, scenario):

    #we don't need to pass the parameter as testid s remove
    testid = scenario.pop('test_id')

    #display the which test screnario model we try to display
    print(f"Parameters: {testid}")
    print(scenario)

    #taken from chatgpt
    if hasattr(y_train, 'values'):  # check if it's a pandas object
        #print("debug")
        y_train = y_train.values.ravel()
    else:
        y_train = y_train.ravel()

    regressor = RandomForestRegressor(**scenario)
    
    regressor.fit(x_train, y_train)

    #prediction
    y_predict = regressor.predict(x_test)

    score = r2_score(y_test, y_predict.ravel())

    results = {
        'test_id' : testid,
        'r2 Score' : score
    }
    return results

In [11]:
for scenario in test_scenarios:
    result = run_test_scenarios(x_train, x_test, y_train, y_test, scenario)
    print (result)

Parameters: Test001
{'n_estimators': 100, 'criterion': 'squared_error', 'max_features': 'sqrt'}
{'test_id': 'Test001', 'r2 Score': 0.7870911599411645}
Parameters: Test002
{'n_estimators': 100, 'criterion': 'squared_error', 'max_features': 'log2'}
{'test_id': 'Test002', 'r2 Score': 0.7821750261514708}
Parameters: Test003
{'n_estimators': 100, 'criterion': 'squared_error', 'max_features': None}
{'test_id': 'Test003', 'r2 Score': 0.9410747550133562}
Parameters: Test004
{'n_estimators': 100, 'criterion': 'absolute_error', 'max_features': 'sqrt'}
{'test_id': 'Test004', 'r2 Score': 0.7987755535531339}
Parameters: Test005
{'n_estimators': 100, 'criterion': 'absolute_error', 'max_features': 'log2'}
{'test_id': 'Test005', 'r2 Score': 0.813415670491114}
Parameters: Test006
{'n_estimators': 100, 'criterion': 'absolute_error', 'max_features': None}
{'test_id': 'Test006', 'r2 Score': 0.9393236387507441}
Parameters: Test007
{'n_estimators': 100, 'criterion': 'friedman_mse', 'max_features': 'sqrt'}
{

In [12]:
#TO SAVE THE MODEL CHOOSED THE BEST RESULT OF ABOVE MODEL

In [13]:
import pickle

In [14]:
best_model = RandomForestRegressor(n_estimators = 50, criterion = 'friedman_mse', max_features = None)
best_model.fit(x_train, y_train)
y_predict=best_model.predict(x_test)


  return fit_method(estimator, *args, **kwargs)


In [15]:
bestModelSave = r2_score(y_test, y_predict)
bestModelSave

0.9340587483752462

In [16]:
#Save the model
filename = "RandomForestBestModel.sav"
pickle.dump(best_model, open(filename, 'wb'))

In [17]:
loaded_model = pickle.load(open(filename, 'rb'))

In [18]:
loaded_model.predict([[76253.86,113867.3,298664.47,1,0]])



array([116650.913])