In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Load the dataset
data = pd.read_csv(r"C:\Users\manda\OneDrive\Desktop\Data Science\Machine learning assignment\Random Forest\Company_Data.csv")

In [3]:
# To convert numerical columns to categprical we need to create bins and labels
bins = [0, 5, 10, float('inf')]
labels = ['Low', 'Medium', 'High']

In [4]:
# Create a new column 'Sales_Cat' with categorized data
data['Sales_Cat'] = pd.cut(data['Sales'], bins=bins, labels=labels, include_lowest=True)

In [5]:
# Lets do label Encoding 
le = LabelEncoder()

In [6]:
data['ShelveLoc'] = le.fit_transform(data['ShelveLoc'])
data['Urban'] = le.fit_transform(data['Urban'])
data['US'] = le.fit_transform(data['US'])

In [7]:
data

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,Sales_Cat
0,9.50,138,73,11,276,120,0,42,17,1,1,Medium
1,11.22,111,48,16,260,83,1,65,10,1,1,High
2,10.06,113,35,10,269,80,2,59,12,1,1,High
3,7.40,117,100,4,466,97,2,55,14,1,1,Medium
4,4.15,141,64,3,340,128,0,38,13,1,0,Low
...,...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,1,33,14,1,1,High
396,6.14,139,23,3,37,120,2,55,11,0,1,Medium
397,7.41,162,26,12,368,159,2,40,18,1,1,Medium
398,5.94,100,79,7,284,95,0,50,12,1,1,Medium


In [8]:
# Split data into features (X) and target (y)
X = data.drop(['Sales', 'Sales_Cat'], axis=1)
y = data['Sales_Cat']

In [9]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 6, 8, 10, None],
    'criterion': ['gini', 'entropy']
}

In [11]:
# Initialize the RandomForestClassifier
comp = RandomForestClassifier(random_state=42)

In [12]:
# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=comp, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

In [13]:
# Fit the model
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


150 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\manda\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\manda\anaconda3\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\manda\anaconda3\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\manda\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParame

In [14]:
# Print the best parameters and the best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'criterion': 'entropy', 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 50}
Best Score: 0.6285714285714287


In [15]:
# Get the best estimator
best_rf = grid_search.best_estimator_

In [16]:
best_rf

In [17]:
# Make predictions
y_pred = best_rf.predict(X_test)

In [5]:
# Evaluate the model
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


Fitting 5 folds for each of 90 candidates, totalling 450 fits
Best Parameters: {'criterion': 'entropy', 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 50}
Best Score: 0.6285714285714287
              precision    recall  f1-score   support

        High       0.85      0.46      0.59        24
         Low       0.80      0.21      0.33        19
      Medium       0.73      0.96      0.83        77

    accuracy                           0.74       120
   macro avg       0.79      0.54      0.58       120
weighted avg       0.76      0.74      0.70       120

Accuracy: 0.7416666666666667


150 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\manda\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\manda\anaconda3\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\manda\anaconda3\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\manda\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParame