In [1]:
import pandas as pd

# Read the startup dataset from the CSV file in the working directory
df = pd.read_csv("global_startup_success_dataset.csv")

# Print the frame's dimensions first, then the dtype of every column
for summary in (df.shape, df.dtypes):
    print(summary)

# Leave the first five rows as the cell's displayed result
df.head()


(5000, 15)
Startup Name                 object
Founded Year                  int64
Country                      object
Industry                     object
Funding Stage                object
Total Funding ($M)            int64
Number of Employees           int64
Annual Revenue ($M)           int64
Valuation ($B)              float64
Success Score                 int64
Acquired?                    object
IPO?                         object
Customer Base (Millions)      int64
Tech Stack                   object
Social Media Followers        int64
dtype: object


Unnamed: 0,Startup Name,Founded Year,Country,Industry,Funding Stage,Total Funding ($M),Number of Employees,Annual Revenue ($M),Valuation ($B),Success Score,Acquired?,IPO?,Customer Base (Millions),Tech Stack,Social Media Followers
0,Startup_1,2009,Canada,Healthcare,Series A,269,3047,104,46.11,5,No,No,43,"Java, Spring",4158814
1,Startup_2,2004,UK,Healthcare,IPO,40,630,431,33.04,1,No,Yes,64,"Node.js, React",4063014
2,Startup_3,2018,USA,Healthcare,Seed,399,2475,375,15.79,8,No,No,74,"PHP, Laravel",3449855
3,Startup_4,2014,France,Tech,Seed,404,1011,907,17.12,7,Yes,Yes,26,"Python, AI",630421
4,Startup_5,2006,Japan,Energy,Series C,419,3917,280,4.39,6,Yes,Yes,30,"Node.js, React",365956


In [2]:
# Year that "Founded Year" is subtracted from to obtain a startup's age.
# Named so the value is visible and can be changed in one place.
REFERENCE_YEAR = 2025

# Integer codes for the two Yes/No flag columns.
YES_NO_CODES = {"Yes": 1, "No": 0}

# Text columns that are expanded into one-hot indicator columns.
CATEGORICAL_COLUMNS = ["Country", "Industry", "Funding Stage", "Tech Stack"]


def build_model_frame(raw, reference_year=REFERENCE_YEAR):
    """Return a modelling-ready copy of the raw startup frame.

    The input frame is never modified, so the cell that calls this helper
    can be re-run any number of times without raising ``KeyError`` on
    columns that a previous run already removed.

    Steps, in order:
      1. drop the "Startup Name" column;
      2. add "Startup Age" as ``reference_year - Founded Year`` and drop
         "Founded Year";
      3. map "Acquired?" and "IPO?" from "Yes"/"No" to 1/0 (any other
         label becomes NaN);
      4. drop every row that contains a missing value;
      5. one-hot encode the categorical columns with ``drop_first=True``.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame as loaded from the CSV; must contain the columns named above.
    reference_year : int, default REFERENCE_YEAR
        Year used to compute "Startup Age".

    Returns
    -------
    pandas.DataFrame
        New frame with engineered and encoded columns.
    """
    prepared = (
        raw
        .drop(columns=["Startup Name"])
        .assign(**{"Startup Age": reference_year - raw["Founded Year"]})
        .drop(columns=["Founded Year"])
    )

    # `prepared` is a new frame produced by drop/assign, so assigning to its
    # columns does not touch `raw`.
    for flag_column in ("Acquired?", "IPO?"):
        prepared[flag_column] = prepared[flag_column].map(YES_NO_CODES)

    # Missing values are removed BEFORE encoding: get_dummies does not emit
    # an indicator for NaN by default, so a row with a missing category
    # would otherwise turn into an all-zero row (indistinguishable from the
    # dropped first category) and survive a later dropna().
    prepared = prepared.dropna()

    return pd.get_dummies(prepared, columns=CATEGORICAL_COLUMNS, drop_first=True)


# `df` is left untouched; the prepared data lives under its own name.
model_df = build_model_frame(df)

# Define target and features
target = "Success Score"
X = model_df.drop(columns=[target])
y = model_df[target]

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


In [4]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Candidate regressors keyed by the label used in printed reports.
# Both receive the same seed.
models = dict(
    [
        ("Random Forest", RandomForestRegressor(random_state=42)),
        ("XGBoost", XGBRegressor(random_state=42, verbosity=0)),
    ]
)

# Fit every candidate on the training split, in insertion order
for label in models:
    models[label].fit(X_train, y_train)
    print(f"{label} trained.")


Random Forest trained.
XGBoost trained.


In [5]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Report error metrics for each fitted model on the held-out split
for name, model in models.items():
    test_pred = model.predict(X_test)

    # Compute each metric once up front; RMSE is the square root of MSE
    mae = mean_absolute_error(y_test, test_pred)
    mse = mean_squared_error(y_test, test_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, test_pred)

    print(f"\n{name} Results:")
    print("MAE:", mae)
    print("MSE:", mse)
    print("RMSE:", rmse)
    print("R²:", r2)


Random Forest Results:
MAE: 2.32935
MSE: 7.0256069
RMSE: 2.650586142723907
R²: -0.059605287765443826

XGBoost Results:
MAE: 2.421452045440674
MSE: 8.142317771911621
RMSE: 2.853474683944405
R²: -0.2280280590057373


In [6]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Score every fitted model on the held-out split and keep the one with the
# highest R².
#
# model_scores    : {model name -> {"MAE", "MSE", "RMSE", "R2"}}
# best_model      : fitted estimator with the highest R² (None if `models` is empty)
# best_model_name : its key in `models` ("" if `models` is empty)
# best_r2         : its R² on the held-out split
#
# NOTE(review): the winner is picked on the same split that is used to report
# its score, so the reported R² is an optimistic estimate. An R² at or below
# zero means the model is no better than always predicting the mean of
# y_test - check the printed value before relying on the selected model.
model_scores = {}
# R² has no lower bound, so start from -inf rather than an arbitrary finite
# sentinel; this guarantees the first finite score is always accepted.
best_r2 = float("-inf")
best_model = None
best_model_name = ""

for name, model in models.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)  # computed once, reused for RMSE
    r2 = r2_score(y_test, y_pred)

    model_scores[name] = {
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mse,
        "RMSE": np.sqrt(mse),
        "R2": r2
    }

    # Strict comparison: on a tie the earlier model in `models` is kept.
    if r2 > best_r2:
        best_r2 = r2
        best_model = model
        best_model_name = name

print(f"\n✅ Best Model: {best_model_name} (R² = {best_r2:.4f})")


✅ Best Model: Random Forest (R² = -0.0596)


In [7]:
import pickle

# Artifacts to persist, written in this order:
#   - the selected estimator
#   - the list of feature column names, so a consumer can rebuild the same
#     column layout before calling predict
artifacts = {
    "success_score_model.pkl": best_model,
    "model_features.pkl": X.columns.tolist(),
}

for filename, payload in artifacts.items():
    with open(filename, "wb") as handle:
        pickle.dump(payload, handle)
