In [2]:
import numpy as np
import pandas as pd

In [5]:
# Both CSV locations are relative to the notebook's working directory.
raw_csv_path = "../dataset/layoffs_data.csv"
cleaned_csv_path = "../dataset/cleaned.csv"

# df holds the layoffs_data.csv contents; clean_df holds cleaned.csv, which the modelling cells use.
df = pd.read_csv(raw_csv_path)
clean_df = pd.read_csv(cleaned_csv_path)

FileNotFoundError: [Errno 2] No such file or directory: '../dataset/layoffs_data.csv'

# Random Forest - Predicting When Layoffs Would Occur

In [None]:
# Summarise the cleaned frame (column names, non-null counts, dtypes) before modelling
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1145 entries, 2 to 3483
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Company       1145 non-null   object 
 1   Location_HQ   1145 non-null   object 
 2   Industry      1145 non-null   object 
 3   Percentage    1145 non-null   float64
 4   Date          1145 non-null   object 
 5   Funds_Raised  1145 non-null   float64
 6   Stage         1145 non-null   object 
 7   Country       1145 non-null   object 
 8   Layoffs       1145 non-null   float64
dtypes: float64(3), object(6)
memory usage: 89.5+ KB


In [None]:
# Parse the 'Date' strings and store the month number as a new 'Month' column
# (note: the column is added to clean_df itself, not to a copy)
layoff_dates = pd.to_datetime(clean_df['Date'])
clean_df['Month'] = layoff_dates.dt.month

# Build the modelling frame without the raw 'Date' (now summarised by 'Month')
# and without 'Percentage'
columns_to_drop = ['Date', 'Percentage']
random_forest_clean = clean_df.drop(columns = columns_to_drop)

random_forest_clean.head()

Unnamed: 0,Company,Location_HQ,Industry,Funds_Raised,Stage,Country,Layoffs,Month
2,Vacasa,Portland,Travel,834.0,Post-IPO,United States,320.0,2
3,Treasury Prime,SF Bay Area,Finance,71.0,Series C,United States,40.0,2
6,Bumble,Austin,Consumer,313.0,Post-IPO,United States,350.0,2
16,Finder,Sydney,Retail,30.0,Unknown,Australia,60.0,2
20,Tails.com,London,Retail,5.0,Series A,United Kingdom,55.0,2


### Simple Version

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# One-hot encode categorical features.
# NOTE: get_dummies leaves the non-encoded columns (Funds_Raised, Layoffs, Month) at the
# front and appends the generated dummy columns after them, so 'Month' is NOT the last column.
random_forest_clean_encoded = pd.get_dummies(
    random_forest_clean,
    columns = ['Company', 'Location_HQ', 'Industry', 'Stage', 'Country'],
    dtype = int
)

# Define features and target.
# The target is selected by name rather than by position: slicing with iloc[:, :-1]
# kept 'Month' inside X (target leakage) and dropped the last dummy column instead.
target_column = 'Month'
X = random_forest_clean_encoded.drop(columns = [target_column])
y = random_forest_clean_encoded[target_column]  # Target variable

# Guard against the target leaking back into the feature matrix
assert target_column not in X.columns, "target column must not be part of the features"

# Split into train and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Train the Random Forest model
model = RandomForestRegressor(n_estimators = 100, random_state = 42)
model.fit(X_train, y_train)

# Predictions on the held-out test set
y_pred = model.predict(X_test)

# Evaluate the regression model
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Print the evaluation scores with 3 decimals
print(f'R-squared: {r2:.3f}')
print(f'Mean Absolute Error: {mae:.3f}')
print(f'Mean Squared Error: {mse:.3f}')
print(f'Root Mean Squared Error: {rmse:.3f}')

ModuleNotFoundError: No module named 'sklearn'

In [None]:
from sklearn.model_selection import GridSearchCV, KFold, cross_validate
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, r2_score, mean_absolute_error, mean_squared_error

# One-hot encode categorical features.
# NOTE: get_dummies leaves the non-encoded columns (Funds_Raised, Layoffs, Month) at the
# front and appends the generated dummy columns after them, so 'Month' is NOT the last column.
random_forest_clean_encoded = (
    pd.get_dummies(
        random_forest_clean, 
        columns = ['Company', 'Location_HQ', 'Industry', 'Stage', 'Country'], 
        dtype = int
    )
)

# Define features and target.
# The target is selected by name rather than by position: slicing with iloc[:, :-1]
# kept 'Month' inside X (target leakage) and dropped the last dummy column instead.
target_column = 'Month'
X = random_forest_clean_encoded.drop(columns = [target_column])
y = random_forest_clean_encoded[target_column]  # Target variable

# Guard against the target leaking back into the feature matrix
assert target_column not in X.columns, "target column must not be part of the features"

# Create a RandomForestRegressor
model = RandomForestRegressor(random_state = 42, n_jobs = -1)

# Define the evaluation metrics.
# The error metrics use scikit-learn's "neg_*" scorers, which return the NEGATED error
# so that a larger score is always better; they are flipped back before printing below.
scoring = {
    'r2': 'r2',
    'mae': 'neg_mean_absolute_error',
    'mse': 'neg_mean_squared_error',
    'rmse': 'neg_root_mean_squared_error'
}

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Shuffled 5-fold splitter, shared by the grid search and the final evaluation
cv = KFold(n_splits = 5, shuffle = True, random_state = 42)

# Create GridSearchCV; with several metrics, refit = 'r2' picks the metric used to
# choose (and refit) the best parameter combination
grid_search = GridSearchCV(model, param_grid, scoring = scoring, cv = cv, refit = 'r2', return_train_score = False, n_jobs = -1)
grid_search.fit(X, y)

# Print the best hyperparameters
print('Best Hyperparameters:', grid_search.best_params_)

# Get the best model from GridSearchCV
best_model = grid_search.best_estimator_

# Evaluate the best model using cross-validation
cv_results = cross_validate(best_model, X, y, cv = cv, scoring = scoring)

# Print the cross-validated evaluation scores with 3 decimals.
# The error scores are negated to undo the "neg_*" sign convention, so the reported
# MAE / MSE / RMSE are the usual non-negative values.
print(f'R-squared: {np.mean(cv_results["test_r2"]):.3f}')
print(f'Mean Absolute Error: {-np.mean(cv_results["test_mae"]):.3f}')
print(f'Mean Squared Error: {-np.mean(cv_results["test_mse"]):.3f}')
print(f'Root Mean Squared Error: {-np.mean(cv_results["test_rmse"]):.3f}')