In [1]:
import pandas as pd
import numpy as np

In [2]:
import datetime as dt

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score

In [6]:
train = pd.read_csv('train.csv')

In [7]:
train.head()

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0,07/17/1999,İstanbul,Big Cities,IL,4,5.0,4.0,4.0,2,...,3.0,5,3,4,5,5,4,3,4,5653753.0
1,1,02/14/2008,Ankara,Big Cities,FC,4,5.0,4.0,4.0,1,...,3.0,0,0,0,0,0,0,0,0,6923131.0
2,2,03/09/2013,Diyarbakır,Other,IL,2,4.0,2.0,5.0,2,...,3.0,0,0,0,0,0,0,0,0,2055379.0
3,3,02/02/2012,Tokat,Other,IL,6,4.5,6.0,6.0,4,...,7.5,25,12,10,6,18,12,12,6,2675511.0
4,4,05/09/2009,Gaziantep,Other,IL,3,4.0,3.0,4.0,2,...,3.0,5,1,3,2,3,4,3,3,4316715.0


In [8]:
train.columns

Index(['Id', 'Open Date', 'City', 'City Group', 'Type', 'P1', 'P2', 'P3', 'P4',
       'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15',
       'P16', 'P17', 'P18', 'P19', 'P20', 'P21', 'P22', 'P23', 'P24', 'P25',
       'P26', 'P27', 'P28', 'P29', 'P30', 'P31', 'P32', 'P33', 'P34', 'P35',
       'P36', 'P37', 'revenue'],
      dtype='object')

In [9]:
train_features_cols = ['P1', 'P2', 'P3', 'P4',
       'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15',
       'P16', 'P17', 'P18', 'P19', 'P20', 'P21', 'P22', 'P23', 'P24', 'P25',
       'P26', 'P27', 'P28', 'P29', 'P30', 'P31', 'P32', 'P33', 'P34', 'P35',
       'P36', 'P37']

In [10]:
train_result_col = ['revenue']

In [11]:
rng = np.random.default_rng(1488)
valid_test_idx = list(rng.integers(low=0, high=137, size=int(len(train)*0.2)))

In [15]:
train_cleaned = train[train_features_cols+train_result_col]

In [16]:
train_df, valid_df = train_test_split(train_cleaned[~train_cleaned.index.isin(valid_test_idx)], test_size = 0.25, random_state = 1488)

In [17]:
train_df.columns

Index(['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11',
       'P12', 'P13', 'P14', 'P15', 'P16', 'P17', 'P18', 'P19', 'P20', 'P21',
       'P22', 'P23', 'P24', 'P25', 'P26', 'P27', 'P28', 'P29', 'P30', 'P31',
       'P32', 'P33', 'P34', 'P35', 'P36', 'P37', 'revenue'],
      dtype='object')

In [18]:
features_train = train_df.drop(['revenue'], axis='columns')
target_train = train_df['revenue']

In [19]:
features_valid = valid_df.drop(['revenue'], axis='columns')
target_valid = valid_df['revenue']

In [20]:
test_valid_df = train_df[train_df.index.isin(valid_test_idx)]

In [21]:
features_test_valid_df = test_valid_df.drop(['revenue'], axis='columns')
target_test_valid_df = test_valid_df['revenue']

In [22]:
tree_params = {'criterion':['mse'],
               'max_depth' : range(1,5),
              'max_features' : ['auto','log2']}

In [23]:
locally_best_tree = GridSearchCV(RandomForestRegressor(random_state=554321), tree_params)
locally_best_tree.fit(features_train, target_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False,
                                             random_state=554321, verbose=0,
                                             warm

In [24]:
locally_best_tree

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False,
                                             random_state=554321, verbose=0,
                                             warm

In [25]:
locally_best_tree.best_params_

{'criterion': 'mse', 'max_depth': 3, 'max_features': 'log2'}

In [26]:
locally_best_tree.best_score_

-0.4179352091637565

In [27]:
locally_best_tree_valid_predictions = locally_best_tree.predict(features_valid)


In [28]:
result = mean_squared_error(target_valid, locally_best_tree_valid_predictions)**0.5

In [29]:
result

2037643.1476588855

In [215]:
locally_best_tree

GridSearchCV(estimator=RandomForestRegressor(random_state=554321),
             param_grid={'criterion': ['poisson'], 'max_depth': range(1, 10),
                         'max_features': ['auto', 'sqrt', 'log2']})

In [216]:
locally_best_tree.best_estimator_

RandomForestRegressor(criterion='poisson', max_depth=1, max_features='log2',
                      random_state=554321)

In [30]:
import pickle # при помощи pickle можно задампить некий объект питона

In [31]:
pickle.dump(locally_best_tree,open('../models/rforestregressor.pkt', 'wb')) #запись модели в бинарный файл 



In [32]:
loaded_model = pickle.load(open('../models/rforestregressor.pkt','rb')) #читаем модель из бинарного файла
