In [None]:
%pip install scikit-learn pandas numpy matplotlib scipy seaborn

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.pipeline import Pipeline

In [None]:
# Load the 50 Startups dataset; the path is relative to the notebook's
# working directory.
dataset_path = '../Datasets/50_Startups.csv'
startup_data = pd.read_csv(dataset_path)

In [None]:
# Preprocessing: encode the categorical column, separate features from the
# target, hold out a test set, and standardize the features.

# Replace the 'State' column in place with integer codes.
# NOTE(review): scikit-learn documents LabelEncoder as a target-label encoder;
# the integer codes impose an arbitrary order on the states - confirm this is
# acceptable for the models used below.
label_encoder = LabelEncoder()
state_codes = label_encoder.fit_transform(startup_data['State'])
startup_data['State'] = state_codes

# 'Profit' is the regression target; every other column is a feature.
y = startup_data['Profit']
X = startup_data.drop(columns=['Profit'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply the same
# transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Baseline model: a 100-tree random forest fitted on the standardized
# training features (fit() returns the fitted estimator itself).
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train_scaled, y_train)

# Score the baseline on the held-out test set.
y_pred_rf = rf_regressor.predict(X_test_scaled)
mse_rf = mean_squared_error(y_test, y_pred_rf)
print("Mean Squared Error (Random Forest):", mse_rf)

In [None]:
# R^2: proportion of the variance in the actual test profits that the
# baseline predictions account for.
r2_rf = r2_score(y_true=y_test, y_pred=y_pred_rf)
print("R^2 Score (Random Forest):", r2_rf)

# Pearson r: strength of the linear relationship between actual and
# predicted profits. pearsonr also returns a p-value, which is discarded.
pearson_result = pearsonr(y_test, y_pred_rf)
corr_coefficient, _ = pearson_result
print("Pearson Correlation Coefficient:", corr_coefficient)

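A Pearson correlation coefficient close to 1 combined with a slightly lower (but still high) R^2 score is worth a closer look. Pearson correlation only measures how strongly the predicted and actual values move together linearly, so it is unaffected by a constant offset or a rescaling of the predictions, whereas R^2 compares the predictions with the actual values directly and penalizes any such systematic error. The gap therefore suggests that, although there is a strong linear relationship between the predicted and actual values, the model is not capturing all of the variance in the target variable as well as it captures that linear relationship.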

In [None]:
# Try to improve on the baseline random forest by combining:
#   * feature engineering  - polynomial feature expansion,
#   * hyperparameter tuning - an exhaustive grid search with 5-fold CV,
#   * a single Pipeline, so the same preprocessing is applied when fitting
#     and when predicting.

# Preprocessing + model, chained in one estimator.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures()),  # expands the scaled features with polynomial terms
    ('rf', RandomForestRegressor(random_state=42)),
]
pipeline = Pipeline(steps=pipeline_steps)

# Search space; keys use the '<step name>__<parameter>' convention.
param_grid = dict(
    poly__degree=[2, 3],               # degree of the polynomial expansion
    rf__n_estimators=[100, 200, 300],  # number of trees in the forest
    rf__max_depth=[None, 5, 10],       # maximum tree depth (None = unlimited)
)

# Tune on the unscaled training features: the pipeline does its own scaling.
# Candidates are ranked by negative MSE.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

# Keep the best pipeline found by the search and use it to predict the
# held-out test set.
best_model = grid_search.best_estimator_
y_pred_rf_improved = best_model.predict(X_test)

In [None]:
# Same two metrics as for the baseline, now computed on the tuned pipeline's
# predictions. Note that this overwrites r2_rf and corr_coefficient.

# R^2: proportion of the variance in the actual test profits that the
# predictions account for.
r2_rf = r2_score(y_true=y_test, y_pred=y_pred_rf_improved)
print("R^2 Score (Random Forest):", r2_rf)

# Pearson r: strength of the linear relationship between actual and
# predicted profits. pearsonr also returns a p-value, which is discarded.
pearson_result = pearsonr(y_test, y_pred_rf_improved)
corr_coefficient, _ = pearson_result
print("Pearson Correlation Coefficient:", corr_coefficient)

In [None]:
# Plot actual vs. predicted profits for the held-out test set.
# Both the baseline forest (y_pred_rf) and the tuned pipeline
# (y_pred_rf_improved) are drawn, so the figure shows whether the tuning
# moved the predictions closer to the actual values.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_rf, color='blue', alpha=0.5,
           label='Baseline Random Forest')
ax.scatter(y_test, y_pred_rf_improved, color='orange', alpha=0.5, marker='^',
           label='Tuned Random Forest')

# Reference line y = x: a perfect prediction would lie exactly on it.
profit_range = [min(y_test), max(y_test)]
ax.plot(profit_range, profit_range, color='red', label='Perfect prediction')

ax.set_xlabel('Actual Profit')
ax.set_ylabel('Predicted Profit')
ax.set_title('Actual vs. Predicted Profits (Random Forest)')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Plot overlaid histograms of the actual test profits and the baseline
# random forest predictions (y_pred_rf).
fig, ax = plt.subplots(figsize=(10, 6))

# Compute one set of bin edges over both series. Passing bins=20 to each
# hist() call separately would give each series its own edges, so the
# overlaid bars would not cover the same intervals.
shared_bin_edges = np.histogram_bin_edges(
    np.concatenate([y_test, y_pred_rf]),
    bins=20,
)

ax.hist(y_test, bins=shared_bin_edges, color='blue', alpha=0.5, label='Actual Profit')
ax.hist(y_pred_rf, bins=shared_bin_edges, color='green', alpha=0.5, label='Predicted Profit')
ax.set_xlabel('Profit')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Actual and Predicted Profits')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Pairwise correlations between the columns of startup_data.
# Note: 'State' was replaced by integer label codes during preprocessing, so
# its row/column here reflects those codes rather than a numeric quantity.
corr_matrix = startup_data.corr()

# Draw the matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()