# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, mean_absolute_error, accuracy_score, f1_score
import warnings

# Suppress all warnings to keep the output clean.
# NOTE(review): this is a process-wide filter with no category or module
# restriction, so it also hides deprecation notices and any data-quality or
# metric warnings raised by the libraries used below — consider narrowing it.
warnings.filterwarnings("ignore")

# --- Helper functions ---

def clean_numeric(col):
    """
    Convert a column to a numeric dtype, tolerating common formatting symbols.

    Columns that already have a (non-boolean) numeric dtype are passed
    straight to ``pd.to_numeric``. Round-tripping them through ``str`` would
    corrupt values, because the digits-only pattern used for text drops a
    leading minus sign and truncates exponent notation (``str(1e16)`` is
    ``"1e+16"``, which would be read back as ``1``).

    For every other column each value is rendered as text, thousands
    separators (``,``) and dollar signs are removed, and the first run of
    digits/periods is parsed as the number (so ``"2015-01-01"`` yields 2015
    and ``"11-50"`` yields 11). Values without such a run, or whose run is
    not a valid number (e.g. ``"1.2.3"``), become ``NaN``.

    Parameters
    ----------
    col : pandas.Series
        Column to clean.

    Returns
    -------
    pandas.Series
        Numeric series aligned with the index of ``col``.
    """
    # Booleans count as "numeric" for pandas, but the text path maps them to
    # NaN; keep that behaviour by excluding them from the shortcut.
    already_numeric = (
        pd.api.types.is_numeric_dtype(col)
        and not pd.api.types.is_bool_dtype(col)
    )
    if already_numeric:
        return pd.to_numeric(col, errors="coerce")

    text = col.astype(str)
    # Both replacements are literal: "$" in particular is a regex anchor.
    text = text.str.replace(",", "", regex=False)
    text = text.str.replace("$", "", regex=False)
    first_number = text.str.extract(r"([0-9.]+)", expand=False)
    return pd.to_numeric(first_number, errors="coerce")

def encode_categoricals(df, cat_cols):
    """
    Replace the given categorical columns with ordinal codes.

    A fresh OrdinalEncoder is fitted on the string form of ``df[cat_cols]``
    and the encoded values are written back into ``df`` itself, so the
    caller's frame is modified in place (pass a copy to keep the original).
    The encoder is configured to map categories unseen during fitting to -1
    on later ``transform`` calls.

    Returns the (mutated) frame together with the fitted encoder.
    """
    encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    )
    # Cast to str first so every cell is text before fitting.
    as_text = df[cat_cols].astype(str)
    df[cat_cols] = encoder.fit_transform(as_text)
    return df, encoder

# --- Load and Preprocess Dataset ---

# Read the company table from the working directory; adjust the path if the
# CSV lives elsewhere. low_memory=False disables pandas' chunked parsing.
df = pd.read_csv(
    "Sorted_companies_data.csv",
    low_memory=False,
)

print("✅ Dataset loaded successfully.")
print(30 * "-")

# --- TASK 1: Total Funding Amount Prediction (Regression) ---

print("🚀 Running Task 1: Total Funding Amount Prediction")
task1_target = "Total Funding Amount"

# Predictors, in the column order the regressor is trained on.
task1_features = [
    "Founded Date",
    "Number of Founders",
    "Number of Employees",
    "Industries",
    "Headquarters Location",
    "Number of Investors",
    "Actively Hiring",
    "Number of Funding Rounds",
    "Last Funding Amount",
    "Funding Status",
    "Last Funding Type",
    "Estimated Revenue Range",
    "Number of Events",
    "BuiltWith - Active Tech Count",
]

# Subset of the predictors that is parsed as numbers; the rest is treated
# as categorical.
num_cols_task1 = [
    "Founded Date",
    "Number of Founders",
    "Number of Employees",
    "Number of Investors",
    "Number of Funding Rounds",
    "Last Funding Amount",
    "Number of Events",
    "BuiltWith - Active Tech Count",
]

# Parse the target and the numeric predictors in the shared frame `df`.
df[task1_target] = clean_numeric(df[task1_target])
for col in num_cols_task1:
    df[col] = df[col].pipe(clean_numeric)

# Everything in the feature list that is not numeric gets ordinal-encoded.
# Encoding runs on a copy so `df` keeps the raw category labels.
cat_cols_task1 = [name for name in task1_features if name not in num_cols_task1]
df_task1, _ = encode_categoricals(df.copy(), cat_cols_task1)

# Keep only rows where every feature and the target are non-null.
df_task1 = df_task1.dropna(subset=task1_features + [task1_target])

# 15% of the rows are used for training and 10% for evaluation; the
# remaining rows are not used.
X1 = df_task1.loc[:, task1_features]
y1 = df_task1.loc[:, task1_target]
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    train_size=0.15,
    test_size=0.10,
    random_state=42,
)

# Fit the regressor and score it on the held-out rows.
model1 = RandomForestRegressor(n_estimators=200, random_state=42)
model1.fit(X1_train, y1_train)
y1_pred = model1.predict(X1_test)

print("📈 Task 1 Results:")
print("R² Score:", r2_score(y1_test, y1_pred))
print("MAE:", mean_absolute_error(y1_test, y1_pred))
print(30 * "-")

# --- TASK 2: Estimated Revenue Range Prediction (Classification) ---

print("🚀 Running Task 2: Estimated Revenue Range Prediction")
task2_target = "Estimated Revenue Range"

# Predictors, in the column order the classifier is trained on.
task2_features = [
    "Founded Date",
    "Number of Founders",
    "Number of Employees",
    "Industries",
    "Headquarters Location",
    "Headquarters Regions",
    "Number of Investors",
    "Actively Hiring",
    "Number of Funding Rounds",
    "Last Funding Amount",
    "Funding Status",
    "Last Funding Type",
    "Number of Events",
    "BuiltWith - Active Tech Count",
    "Total Funding Amount",
]

# Subset of the predictors that is parsed as numbers; the rest is treated
# as categorical.
num_cols_task2 = [
    "Founded Date",
    "Number of Founders",
    "Number of Employees",
    "Number of Investors",
    "Number of Funding Rounds",
    "Last Funding Amount",
    "Number of Events",
    "BuiltWith - Active Tech Count",
    "Total Funding Amount",
]

# Parse the numeric predictors in the shared frame `df`.
for col in num_cols_task2:
    df[col] = df[col].pipe(clean_numeric)

# Ordinal-encode the non-numeric predictors on a copy of `df`.
cat_cols_task2 = [name for name in task2_features if name not in num_cols_task2]
df_task2, _ = encode_categoricals(df.copy(), cat_cols_task2)

# The class label is encoded with its own encoder, fitted on the raw
# (not string-cast) target column.
target_encoder_task2 = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)
df_task2[[task2_target]] = target_encoder_task2.fit_transform(
    df_task2[[task2_target]]
)

# Keep only rows where every feature and the target are non-null.
df_task2 = df_task2.dropna(subset=task2_features + [task2_target])

# 15% of the rows are used for training and 10% for evaluation; the
# remaining rows are not used.
X2 = df_task2.loc[:, task2_features]
y2 = df_task2.loc[:, task2_target]
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    train_size=0.15,
    test_size=0.10,
    random_state=42,
)

# Fit the classifier and score it on the held-out rows.
model2 = RandomForestClassifier(n_estimators=200, random_state=42)
model2.fit(X2_train, y2_train)
y2_pred = model2.predict(X2_test)

print("📈 Task 2 Results:")
print("Accuracy:", accuracy_score(y2_test, y2_pred))
print("F1 Score (weighted):", f1_score(y2_test, y2_pred, average="weighted"))
print(30 * "-")

# --- TASK 3: Headquarters Location Prediction (Classification) ---

print("🚀 Running Task 3: Headquarters Location Prediction")
task3_target = "Headquarters Location"

# Predictors, in the column order the classifier is trained on.
task3_features = [
    "Founded Date",
    "Number of Founders",
    "Number of Employees",
    "Industries",
    "Headquarters Regions",
    "Number of Investors",
    "Actively Hiring",
    "Number of Funding Rounds",
    "Last Funding Amount",
    "Funding Status",
    "Last Funding Type",
    "Estimated Revenue Range",
    "Number of Events",
    "BuiltWith - Active Tech Count",
    "Total Funding Amount",
]

# Subset of the predictors that is parsed as numbers; the rest is treated
# as categorical.
num_cols_task3 = [
    "Founded Date",
    "Number of Founders",
    "Number of Employees",
    "Number of Investors",
    "Number of Funding Rounds",
    "Last Funding Amount",
    "Number of Events",
    "BuiltWith - Active Tech Count",
    "Total Funding Amount",
]

# Parse the numeric predictors in the shared frame `df`.
for col in num_cols_task3:
    df[col] = df[col].pipe(clean_numeric)

# Ordinal-encode the non-numeric predictors on a copy of `df`.
cat_cols_task3 = [name for name in task3_features if name not in num_cols_task3]
df_task3, _ = encode_categoricals(df.copy(), cat_cols_task3)

# The class label is encoded with its own encoder, fitted on the raw
# (not string-cast) target column.
target_encoder_task3 = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)
df_task3[[task3_target]] = target_encoder_task3.fit_transform(
    df_task3[[task3_target]]
)

# Keep only rows where every feature and the target are non-null.
df_task3.dropna(subset=task3_features + [task3_target], inplace=True)

# 15% of the rows are used for training and 10% for evaluation; the
# remaining rows are not used.
X3 = df_task3.loc[:, task3_features]
y3 = df_task3.loc[:, task3_target]
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3,
    y3,
    train_size=0.15,
    test_size=0.10,
    random_state=42,
)

# Fit the classifier.
model3 = RandomForestClassifier(n_estimators=200, random_state=42)
model3.fit(X3_train, y3_train)

# Score it on the held-out rows.
y3_pred = model3.predict(X3_test)

print("📈 Task 3 Results:")
print("Accuracy:", accuracy_score(y3_test, y3_pred))
print("F1 Score (weighted):", f1_score(y3_test, y3_pred, average="weighted"))
print(30 * "-")

print("\n✅ All tasks completed.")
