<a href="https://colab.research.google.com/github/nikgup6/DataThon_Team_3/blob/main/DataThon_31Columns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Step 1.1: Read the raw export into a DataFrame
file_path = '/content/Armenia.csv'
df = pd.read_csv(file_path)

# Step 1.2: Report the (rows, columns) shape
print(f"✅ Dataset Shape: {df.shape}")

# Step 1.3: Share of missing cells per column, as a percentage, largest first.
# Columns with no missing cells are left out of the report.
missing_percent = (
    df.isnull()
    .mean()
    .sort_values(ascending=False)
    .mul(100)
    .loc[lambda pct: pct > 0]
)

# Raise the row display limit so the report is not truncated
# (the option stays set for the rest of the session).
pd.set_option('display.max_rows', 150)
print("🔍 Missing Value Percentage (Non-Zero Only):")
print(missing_percent)


✅ Dataset Shape: (49, 131)
🔍 Missing Value Percentage (Non-Zero Only):
Number of Exits                                  100.000000
Diversity Spotlight (US Headquarters Only)       100.000000
Headquarters Regions                             100.000000
Number of Private Contacts                       100.000000
Number of Private Notes                          100.000000
IPqwery - Most Popular Patent Class              100.000000
Actively Hiring                                  100.000000
Number of Exits (IPO)                            100.000000
Accelerator Duration (in weeks)                  100.000000
Accelerator Application Deadline                 100.000000
Accelerator Program Type                         100.000000
School Type                                      100.000000
School Program                                   100.000000
Number of Enrollments                            100.000000
School Method                                    100.000000
Number of Founders (Alumni)  

In [None]:
# ✅ Step 2.1: Keep only the 31 core columns used in the rest of the notebook.
# The order of this list is the column order of the working file.
important_columns = [
    "Founded Date",
    "Number of Founders",
    "Company Type",
    "Number of Employees",
    "Industries",
    "Headquarters Location",
    "Headquarters Regions",
    "Number of Investors",
    "Actively Hiring",
    "Number of Funding Rounds",
    "Last Funding Amount",
    "Funding Status",
    "Last Funding Type",
    "Estimated Revenue Range",
    "IPqwery - Patents Granted",
    "SEMrush - Monthly Visits",
    "SEMrush - Visit Duration",
    "SEMrush - Page Views / Visit",
    "SEMrush - Bounce Rate",
    "Number of Events",
    "BuiltWith - Active Tech Count",
    "G2 Stack - Total Products Active",
    "Number of Articles",
    "CB Rank (Company)",
    "Total Funding Amount",
    "Valuation at IPO",
    "Price",
    "Number of Exits",
    "Industry Groups",
    "Apptopia - Downloads Last 30 Days",
    "Apptopia - Number of Apps",
]

# Independent copy of the selected columns, so later edits never touch `df`.
# Raises KeyError if any listed column is absent from the raw data.
df_cleaned = df.loc[:, important_columns].copy()

# Persist the working file.
# NOTE(review): the input is Armenia.csv but the working file is named
# Canada_cleaned.csv — presumably a leftover name. Every later cell reads and
# writes this same path, so rename them all together if at all.
df_cleaned.to_csv('/content/Canada_cleaned.csv', index=False)
print("✅ Created working file: /content/Canada_cleaned.csv with 31 key columns")


✅ Created working file: /content/Canada_cleaned.csv with 31 key columns


In [None]:
# Step 3.1: Fix CB Rank (Company) format
# Strip thousands separators, then convert to numbers; unparseable text becomes NaN.
df_cleaned["CB Rank (Company)"] = pd.to_numeric(
    df_cleaned["CB Rank (Company)"].astype(str).str.replace(",", "", regex=False),
    errors='coerce'
)

# Now fill missing values
print("Before:", df_cleaned['CB Rank (Company)'].isnull().sum())

# Grouped median by industry (if applicable)
if 'Industries' in df_cleaned.columns:
    # Compute each row's industry median separately and use it ONLY to plug gaps.
    # groupby drops rows whose 'Industries' is NaN, so a transform result comes
    # back as NaN for them; assigning it straight to the column would erase
    # their real rank and the global fallback below would then replace it.
    industry_median = df_cleaned.groupby('Industries')["CB Rank (Company)"].transform('median')
    df_cleaned["CB Rank (Company)"] = df_cleaned["CB Rank (Company)"].fillna(industry_median)

# Fallback to global median for ranks that no industry median could fill
df_cleaned["CB Rank (Company)"] = df_cleaned["CB Rank (Company)"].fillna(df_cleaned["CB Rank (Company)"].median())

print("After:", df_cleaned['CB Rank (Company)'].isnull().sum())

# Save updated file
df_cleaned.to_csv('/content/Canada_cleaned.csv', index=False)
print("✅ CB Rank (Company) cleaned and imputed.")


Before: 0
After: 0
✅ CB Rank (Company) cleaned and imputed.


In [None]:
# Convert 'Founded Date' to datetime once, whichever branch runs below
# (unparseable values become NaT).
df_cleaned['Founded Date'] = pd.to_datetime(df_cleaned['Founded Date'], errors='coerce')

# Check if 'Founded Year' exists in the original dataset
if 'Founded Year' in df.columns:
    print("✅ 'Founded Year' exists. Using it to fill missing 'Founded Date'")

    # Parse the year explicitly with format='%Y'. Passing a numeric year such as
    # 2017 straight to pd.to_datetime treats it as nanoseconds since the epoch
    # and yields a 1970 timestamp instead of 2017-01-01.
    founded_year = pd.to_numeric(df['Founded Year'], errors='coerce')
    year_start = pd.to_datetime(
        founded_year.round().astype('Int64').astype(str),  # missing years become "<NA>" -> NaT
        format='%Y',
        errors='coerce'
    )

    # Use 'Founded Year' to fill missing 'Founded Date' (rows align on the shared index)
    df_cleaned['Founded Date'] = df_cleaned['Founded Date'].fillna(year_start)

else:
    print("⚠️ 'Founded Year' not found. Will impute using group-wise median")

    # Grouped median fill by Industry, used only to plug gaps: rows whose
    # 'Industries' is NaN are dropped by groupby and come back as NaT, so
    # assigning the transform result directly would erase their real dates.
    industry_filled = df_cleaned.groupby('Industries')['Founded Date'].transform(
        lambda x: x.fillna(x.median())
    )
    df_cleaned['Founded Date'] = df_cleaned['Founded Date'].fillna(industry_filled)

# Final fallback to global median
df_cleaned['Founded Date'] = df_cleaned['Founded Date'].fillna(df_cleaned['Founded Date'].median())

# Check result
print("✅ Founded Date Missing After Imputation:", df_cleaned['Founded Date'].isnull().sum())

# Save update
df_cleaned.to_csv('/content/Canada_cleaned.csv', index=False)
print("✅ Founded Date imputed and saved.")


⚠️ 'Founded Year' not found. Will impute using group-wise median
✅ Founded Date Missing After Imputation: 0
✅ Founded Date imputed and saved.


In [None]:
# Step 3.3: Impute Company Type

print("Before:", df_cleaned["Company Type"].isnull().sum())

# Mode per industry group. A group with no known Company Type has an empty
# mode and is returned unchanged for the global fallback below.
industry_filled = df_cleaned.groupby("Industries")["Company Type"].transform(
    lambda x: x.fillna(x.mode().iloc[0]) if not x.mode().empty else x
)

# Use the grouped result only to plug gaps. Rows whose 'Industries' is NaN are
# dropped by groupby and come back as NaN; assigning the transform result
# directly would wipe their existing Company Type and the global mode would
# then overwrite it.
df_cleaned["Company Type"] = df_cleaned["Company Type"].fillna(industry_filled)

# Fallback global mode
global_mode = df_cleaned["Company Type"].mode().iloc[0]
df_cleaned["Company Type"] = df_cleaned["Company Type"].fillna(global_mode)

print("After:", df_cleaned["Company Type"].isnull().sum())

# Save changes
df_cleaned.to_csv('/content/Canada_cleaned.csv', index=False)
print("✅ Company Type imputed and saved.")


Before: 2
After: 0
✅ Company Type imputed and saved.


In [None]:
# Step 3.4: Impute Industries
print("Before:", df_cleaned["Industries"].isnull().sum())


def _fill_from_group_mode(values):
    """Fill a group's gaps with its most frequent value; leave it as-is if it has none."""
    modes = values.mode()
    if modes.empty:
        return values
    return values.fillna(modes.iloc[0])


# First pass: most frequent Industries value among rows sharing a Company Type
industries_by_type = df_cleaned.groupby("Company Type")["Industries"]
df_cleaned["Industries"] = industries_by_type.transform(_fill_from_group_mode)

# Second pass: whatever is still missing takes the most frequent value overall
global_mode = df_cleaned["Industries"].mode().iloc[0]
df_cleaned["Industries"] = df_cleaned["Industries"].fillna(global_mode)

print("After:", df_cleaned["Industries"].isnull().sum())

# Persist the updated working file
df_cleaned.to_csv('/content/Canada_cleaned.csv', index=False)
print("✅ Industries imputed and saved.")


Before: 3
After: 0
✅ Industries imputed and saved.


In [None]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# Step A: Clean "Number of Employees"
def parse_employees(val):
    """Convert an employee-count value to a single integer estimate.

    Accepted inputs:
      * a closed range such as "11-50" or "1,001-5,000" -> floor of the midpoint
      * an open-ended bucket such as "10001+" -> its lower bound
      * a plain count, as text ("250", "1,000") or already numeric (30, 30.0)

    Returns:
        int for a parseable value, otherwise np.nan (missing values, negative
        numbers, and text that is not a count or range, e.g. "Nov-50").
    """
    # Numeric input passes through. Without this, re-running the cell on an
    # already-parsed column would stringify 30.0 to "30.0", fail the digit
    # check and wipe every value.
    if isinstance(val, (int, float, np.integer, np.floating)) and not isinstance(val, bool):
        try:
            return int(val) if val >= 0 else np.nan  # NaN fails the comparison -> np.nan
        except (ValueError, OverflowError):  # infinities cannot be converted
            return np.nan

    text = str(val).strip().replace(",", "")
    try:
        if "-" in text:
            low, high = text.split("-")
            return (int(low) + int(high)) // 2
        if text.endswith("+"):
            return int(text[:-1])
        if text.isdigit():
            return int(text)
    except ValueError:
        # Wrong number of range parts, or a bound that is not a number.
        pass
    return np.nan

df_cleaned["Number of Employees"] = df_cleaned["Number of Employees"].apply(parse_employees)

# Step B: ML Imputation
features = [
    "Estimated Revenue Range", "SEMrush - Monthly Visits",
    "CB Rank (Company)", "Industries", "Funding Status"
]
# Only genuinely categorical columns go through the ordinal encoder. Fitting it
# on every feature treats each distinct numeric value as its own category, so
# any rank/visit count not seen in training is encoded as -1 for the rows
# being predicted and the feature carries no information there.
cat_features = ["Estimated Revenue Range", "Industries", "Funding Status"]
num_features = ["SEMrush - Monthly Visits", "CB Rank (Company)"]

train_df = df_cleaned[df_cleaned["Number of Employees"].notnull()].copy()
test_df = df_cleaned[df_cleaned["Number of Employees"].isnull()].copy()

# Skip modelling when there is nothing to predict (e.g. the cell is re-run
# after imputation) or nothing to learn from; fitting/predicting on zero rows raises.
if not train_df.empty and not test_df.empty:
    train_encoded = train_df[features].copy()
    test_encoded = test_df[features].copy()

    for part in (train_encoded, test_encoded):
        for col in num_features:
            # 'SEMrush - Monthly Visits' has not been comma-cleaned yet at this
            # point, so normalise it here. Missing values get the same -1
            # sentinel the encoder uses for unknown categories.
            part[col] = pd.to_numeric(
                part[col].astype(str).str.replace(",", "", regex=False),
                errors='coerce'
            ).fillna(-1)
        # Stringify so missing categories become the literal label "nan"
        part[cat_features] = part[cat_features].astype(str)

    # Encode categorical features (categories unseen in training map to -1)
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    encoder.fit(train_encoded[cat_features])
    train_encoded[cat_features] = encoder.transform(train_encoded[cat_features])
    test_encoded[cat_features] = encoder.transform(test_encoded[cat_features])

    X_train = train_encoded
    X_test = test_encoded
    y_train = train_df["Number of Employees"]

    # Train model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Predict missing values
    y_pred = model.predict(X_test)

    # Fill in predictions (truncated to whole employees)
    df_cleaned.loc[df_cleaned["Number of Employees"].isnull(), "Number of Employees"] = y_pred.astype(int)

# Save
print("✅ Missing After:", df_cleaned["Number of Employees"].isnull().sum())
df_cleaned.to_csv('/content/Canada_cleaned.csv', index=False)
print("✅ Number of Employees imputed with RandomForest and saved.")


✅ Missing After: 0
✅ Number of Employees imputed with RandomForest and saved.


In [None]:
# Normalise numeric columns that may carry thousands separators: drop the
# commas, then parse as numbers (anything unparseable becomes NaN).
numeric_fix_cols = [
    "Last Funding Amount",
    "SEMrush - Monthly Visits",
    "SEMrush - Visit Duration",
]

for col in numeric_fix_cols:
    df_cleaned[col] = pd.to_numeric(
        df_cleaned[col].astype(str).str.replace(",", ""),
        errors='coerce',
    )


In [None]:
# List the distinct revenue-range labels actually present (missing values excluded)
revenue_range_values = df_cleaned["Estimated Revenue Range"].dropna().unique()
print("🧪 Unique values in Estimated Revenue Range (non-null):")
print(revenue_range_values)


🧪 Unique values in Estimated Revenue Range (non-null):
['$1M to $10M' 'Less than $1M' '$1B to $10B' '$50M to $100M']


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# === Step 0: Clean any comma-based numeric columns ===
numeric_fix_cols = ["Last Funding Amount", "SEMrush - Monthly Visits", "SEMrush - Visit Duration"]

for col in numeric_fix_cols:
    df_cleaned[col] = df_cleaned[col].astype(str).str.replace(",", "")
    df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')

# === Step 1: Define usable rows and clean unexpected labels ===
target_col = "Estimated Revenue Range"

# Define expected ordered labels (smallest to largest bucket)
ordered_labels = [
    "Less than $1M", "$1M to $10M", "$10M to $50M",
    "$50M to $100M", "$100M to $500M", "$500M to $1B", "$1B+"
]

# Map all large values into the final "catch-all" label. Whitespace is trimmed
# BEFORE the mapping so a padded label such as " $1B to $10B" still matches.
df_cleaned[target_col] = df_cleaned[target_col].str.strip().replace({
    "$1B to $10B": "$1B+",
    "$10B+": "$1B+"
})

# Rows with a recognised label train the model; rows with no label get predicted.
# Rows with an unrecognised non-null label are used for neither.
train_df = df_cleaned[df_cleaned[target_col].isin(ordered_labels)].copy()
test_df = df_cleaned[df_cleaned[target_col].isnull()].copy()

# === Step 2: Label encode the target (label -> position in ordered_labels) ===
label_mapping = {label: idx for idx, label in enumerate(ordered_labels)}
inv_label_map = {v: k for k, v in label_mapping.items()}

train_df[target_col] = train_df[target_col].map(label_mapping)

# === Step 3: Define features and encode them ===
features = [
    "Number of Employees", "CB Rank (Company)", "Funding Status",
    "Last Funding Amount", "SEMrush - Monthly Visits", "SEMrush - Visit Duration",
    "Industries"
]

cat_features = ["Funding Status", "Industries"]

# Skip modelling when there is nothing to predict (e.g. the cell is re-run
# after imputation) or nothing to learn from; the encoder and the classifier
# both raise on zero rows.
if not train_df.empty and not test_df.empty:
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

    # Encode categorical features; stringify first so missing values become "nan"
    X_train = train_df[features].copy()
    X_test = test_df[features].copy()

    for col in cat_features:
        X_train[col] = X_train[col].astype(str)
        X_test[col] = X_test[col].astype(str)

    encoder.fit(X_train[cat_features])
    X_train[cat_features] = encoder.transform(X_train[cat_features])
    X_test[cat_features] = encoder.transform(X_test[cat_features])

    # === Step 4: Train the classifier ===
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, train_df[target_col])

    # === Step 5: Predict and map back to original labels ===
    y_pred = clf.predict(X_test)
    y_pred_labels = [inv_label_map[int(p)] for p in y_pred]

    # Fill the predicted labels into df_cleaned (same row order as test_df)
    df_cleaned.loc[df_cleaned[target_col].isnull(), target_col] = y_pred_labels

# === Step 6: Save updated file ===
print("✅ Estimated Revenue Range missing after:", df_cleaned[target_col].isnull().sum())
df_cleaned.to_csv('/content/Canada_cleaned.csv', index=False)
print("✅ Estimated Revenue Range imputed and saved.")


✅ Estimated Revenue Range missing after: 0
✅ Estimated Revenue Range imputed and saved.


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
import pandas as pd

# === Step A: Clean commas and convert to float ===
df_cleaned["Last Funding Amount"] = df_cleaned["Last Funding Amount"].astype(str).str.replace(",", "")
df_cleaned["Last Funding Amount"] = pd.to_numeric(df_cleaned["Last Funding Amount"], errors='coerce')

# === Step B: Prepare training and test sets ===
target_col = "Last Funding Amount"

# Rows with a known amount train the model; rows without one get predicted
train_df = df_cleaned[df_cleaned[target_col].notnull()].copy()
test_df = df_cleaned[df_cleaned[target_col].isnull()].copy()

# Features to use
features = [
    "Estimated Revenue Range", "Number of Employees", "CB Rank (Company)",
    "Industries", "Funding Status"
]

cat_features = ["Estimated Revenue Range", "Industries", "Funding Status"]

# Skip modelling when there is nothing to predict (e.g. the cell is re-run
# after imputation) or nothing to learn from; the encoder and the model both
# raise on zero rows.
if not train_df.empty and not test_df.empty:
    X_train = train_df[features].copy()
    X_test = test_df[features].copy()

    # Convert all categorical to string so missing values become the label "nan"
    for col in cat_features:
        X_train[col] = X_train[col].astype(str)
        X_test[col] = X_test[col].astype(str)

    # Ordinal encoding for categorical features (unseen categories map to -1)
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    encoder.fit(X_train[cat_features])
    X_train[cat_features] = encoder.transform(X_train[cat_features])
    X_test[cat_features] = encoder.transform(X_test[cat_features])

    # Target variable
    y_train = train_df[target_col]

    # === Step C: Train the model ===
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # === Step D: Predict missing values ===
    y_pred = model.predict(X_test)

    # Fill in predictions (same row order as test_df)
    df_cleaned.loc[df_cleaned[target_col].isnull(), target_col] = y_pred

# Save updated file
print("✅ Last Funding Amount missing after:", df_cleaned[target_col].isnull().sum())
df_cleaned.to_csv('/content/Canada_cleaned.csv', index=False)
print("✅ Last Funding Amount imputed and saved.")


✅ Last Funding Amount missing after: 0
✅ Last Funding Amount imputed and saved.


In [None]:
# List the distinct funding-status labels present (missing values excluded)
funding_status_values = df_cleaned["Funding Status"].dropna().unique()
print("🧪 Unique values in 'Funding Status' (non-null):")
print(funding_status_values)


🧪 Unique values in 'Funding Status' (non-null):
['M&A' 'Seed' 'Early Stage Venture']


In [None]:




from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder

# === Step A: Target column ===
target_col = "Funding Status"

# Rows with a known status train the model; rows without one get predicted
train_df = df_cleaned[df_cleaned[target_col].notnull()].copy()
test_df = df_cleaned[df_cleaned[target_col].isnull()].copy()

# Features used (already cleaned/imputed)
features = [
    "Estimated Revenue Range", "Last Funding Amount", "CB Rank (Company)",
    "Number of Employees", "Industries"
]

cat_features = ["Estimated Revenue Range", "Industries"]

# Skip modelling when there is nothing to predict (e.g. the cell is re-run
# after imputation) or nothing to learn from; the encoder and the classifier
# both raise on zero rows.
if not train_df.empty and not test_df.empty:
    # Convert categoricals to string so missing values become the label "nan"
    X_train = train_df[features].copy()
    X_test = test_df[features].copy()
    for col in cat_features:
        X_train[col] = X_train[col].astype(str)
        X_test[col] = X_test[col].astype(str)

    # Ordinal encode categoricals (unseen categories map to -1)
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    encoder.fit(X_train[cat_features])
    X_train[cat_features] = encoder.transform(X_train[cat_features])
    X_test[cat_features] = encoder.transform(X_test[cat_features])

    # Prepare target (the classifier is trained on the string labels directly)
    y_train = train_df[target_col].astype(str)

    # === Step B: Train the model ===
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # === Step C: Predict and fill (same row order as test_df) ===
    y_pred = clf.predict(X_test)
    df_cleaned.loc[df_cleaned[target_col].isnull(), target_col] = y_pred

# Save file
print("✅ Funding Status missing after:", df_cleaned[target_col].isnull().sum())
df_cleaned.to_csv('/content/Canada_cleaned.csv', index=False)
print("✅ Funding Status imputed and saved.")


✅ Funding Status missing after: 0
✅ Funding Status imputed and saved.


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder

# === Step A: Clean column (remove commas if any) ===
df_cleaned["Total Funding Amount"] = df_cleaned["Total Funding Amount"].astype(str).str.replace(",", "")
df_cleaned["Total Funding Amount"] = pd.to_numeric(df_cleaned["Total Funding Amount"], errors='coerce')

# === Step B: Train/test split ===
# Rows with a known total train the model; rows without one get predicted
target_col = "Total Funding Amount"
train_df = df_cleaned[df_cleaned[target_col].notnull()].copy()
test_df = df_cleaned[df_cleaned[target_col].isnull()].copy()

# === Step C: Features for prediction ===
features = [
    "Last Funding Amount", "Estimated Revenue Range", "Number of Employees",
    "CB Rank (Company)", "Industries", "Funding Status"
]
cat_features = ["Estimated Revenue Range", "Industries", "Funding Status"]

# Skip modelling when there is nothing to predict (e.g. the cell is re-run
# after imputation) or nothing to learn from; the encoder and the model both
# raise on zero rows.
if not train_df.empty and not test_df.empty:
    # === Step D: Encode categorical features ===
    X_train = train_df[features].copy()
    X_test = test_df[features].copy()

    # Stringify so missing values become the label "nan"
    for col in cat_features:
        X_train[col] = X_train[col].astype(str)
        X_test[col] = X_test[col].astype(str)

    # Categories unseen in training map to -1
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    encoder.fit(X_train[cat_features])

    X_train[cat_features] = encoder.transform(X_train[cat_features])
    X_test[cat_features] = encoder.transform(X_test[cat_features])

    y_train = train_df[target_col]

    # === Step E: Train model ===
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # === Step F: Predict and fill (same row order as test_df) ===
    y_pred = model.predict(X_test)
    df_cleaned.loc[df_cleaned[target_col].isnull(), target_col] = y_pred

# === Step G: Save file ===
print("✅ Total Funding Amount missing after:", df_cleaned[target_col].isnull().sum())
df_cleaned.to_csv('/content/Canada_cleaned.csv', index=False)
print("✅ Total Funding Amount imputed and saved.")


✅ Total Funding Amount missing after: 0
✅ Total Funding Amount imputed and saved.


In [None]:



from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder

# === Step A: Clean Valuation column ===
# Strip commas and parse as numbers (unparseable values become NaN)
target_col = "Valuation at IPO"
df_cleaned[target_col] = df_cleaned[target_col].astype(str).str.replace(",", "")
df_cleaned[target_col] = pd.to_numeric(df_cleaned[target_col], errors='coerce')

# === Step B: Focus only on IPO companies ===
ipo_df = df_cleaned[df_cleaned["Funding Status"] == "IPO"].copy()

# Train/test split (within IPO rows only)
train_df = ipo_df[ipo_df[target_col].notnull()].copy()
test_df = ipo_df[ipo_df[target_col].isnull()].copy()

# If very few rows exist with actual values, just skip.
# Note: on the skip paths the file is not re-saved; only the successful
# imputation branch writes the CSV.
if len(train_df) < 10:
    print("⚠️ Not enough IPO rows with real valuations. Skipping imputation.")
elif test_df.empty:
    # Nothing to predict; encoding/predicting zero rows would raise.
    print("✅ No IPO rows are missing a valuation. Nothing to impute.")
else:
    # === Step C: Define features ===
    features = [
        "Total Funding Amount", "Last Funding Amount", "Estimated Revenue Range",
        "CB Rank (Company)", "Number of Employees", "Industries"
    ]
    cat_features = ["Estimated Revenue Range", "Industries"]

    X_train = train_df[features].copy()
    X_test = test_df[features].copy()

    # Stringify so missing values become the label "nan"
    for col in cat_features:
        X_train[col] = X_train[col].astype(str)
        X_test[col] = X_test[col].astype(str)

    # Categories unseen in training map to -1
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    encoder.fit(X_train[cat_features])
    X_train[cat_features] = encoder.transform(X_train[cat_features])
    X_test[cat_features] = encoder.transform(X_test[cat_features])

    y_train = train_df[target_col]

    # === Step D: Train model ===
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # === Step E: Predict and insert ===
    y_pred = model.predict(X_test)

    # Write back by index so only the IPO rows that were missing are touched
    df_cleaned.loc[test_df.index, target_col] = y_pred

    # Save file
    print("✅ Valuation at IPO missing after:", df_cleaned[target_col].isnull().sum())
    df_cleaned.to_csv('/content/Canada_cleaned.csv', index=False)
    print("✅ Valuation at IPO imputed and saved (only for IPO-tagged startups).")


⚠️ Not enough IPO rows with real valuations. Skipping imputation.


In [None]:
import pandas as pd

# Re-read the working file from disk. This replaces the in-memory frame, so
# column dtypes are re-inferred from the CSV text.
df_cleaned = pd.read_csv('/content/Canada_cleaned.csv')

# Summary: shape, then percentage of missing cells per column (largest first)
print("✅ Shape:", df_cleaned.shape)
print("\n📉 Missing Values (%):")
missing_share_sorted = df_cleaned.isnull().mean().sort_values(ascending=False)
print(missing_share_sorted * 100)

# Show sample rows (kept as the last expression so the notebook renders the table)
df_cleaned.head()


✅ Shape: (49, 31)

📉 Missing Values (%):
Headquarters Regions                 100.000000
Actively Hiring                      100.000000
Valuation at IPO                     100.000000
Number of Exits                      100.000000
Price                                100.000000
IPqwery - Patents Granted             95.918367
Number of Investors                   79.591837
Number of Events                      77.551020
Apptopia - Downloads Last 30 Days     77.551020
G2 Stack - Total Products Active      75.510204
Apptopia - Number of Apps             75.510204
Last Funding Type                     69.387755
Number of Funding Rounds              69.387755
Number of Articles                    61.224490
SEMrush - Monthly Visits              46.938776
SEMrush - Page Views / Visit          46.938776
SEMrush - Visit Duration              46.938776
SEMrush - Bounce Rate                 46.938776
Number of Founders                    38.775510
BuiltWith - Active Tech Count         22.448980

Unnamed: 0,Founded Date,Number of Founders,Company Type,Number of Employees,Industries,Headquarters Location,Headquarters Regions,Number of Investors,Actively Hiring,Number of Funding Rounds,...,G2 Stack - Total Products Active,Number of Articles,CB Rank (Company),Total Funding Amount,Valuation at IPO,Price,Number of Exits,Industry Groups,Apptopia - Downloads Last 30 Days,Apptopia - Number of Apps
0,2017-01-01 00:00:00,,For Profit,30.0,"Information Technology, Mobile Apps, Software,...","Yerevan, Yerevan, Armenia",,,,,...,,,2049738.0,900013.0,,,,"Apps, Information Technology, Mobile, Software",,
1,2016-01-01 00:00:00,1.0,For Profit,30.0,"E-Commerce, FinTech, Information Technology, S...","Yerevan, Yerevan, Armenia",,,,,...,,,784580.0,6636210.0,,,,"Commerce and Shopping, Financial Services, Inf...",,
2,2022-04-25 00:00:00,1.0,For Profit,5.0,"Advertising, Marketing, Professional Services","Yerevan, Yerevan, Armenia",,,,,...,,,666127.0,631109.0,,,,"Advertising, Other, Sales and Marketing",,
3,2014-01-18 00:00:00,1.0,For Profit,75.0,"Information Technology, Software","Yerevan, Yerevan, Armenia",,,,,...,18.0,57.0,203866.0,7031460.0,,,,"Information Technology, Software",89.0,133.0
4,2023-05-02 00:00:00,,For Profit,5.0,"Blockchain, Cryptocurrency, Gamification, Gami...","Yerevan, Yerevan, Armenia",,,,,...,,,373491.0,632239.0,,,,"Financial Services, Gaming, Information Techno...",,


In [None]:
col = "Industry Groups"

# A. How much of the column is missing
print(f"🔍 Column: {col}")
print(f"➡️ Missing: {df_cleaned[col].isnull().sum()} / {len(df_cleaned)}")
print(f"➡️ Missing %: {df_cleaned[col].isnull().mean()*100:.2f}%")

# B. Distinct values present, capped at the first 30 to keep the output short
print("\n🧪 Unique non-null values:")
distinct_values = df_cleaned[col].dropna().unique()
print(distinct_values[:30])

# C. The ten most frequent values and their counts
print("\n📊 Most common values:")
top_counts = df_cleaned[col].value_counts().head(10)
print(top_counts)


🔍 Column: Industry Groups
➡️ Missing: 3 / 49
➡️ Missing %: 6.12%

🧪 Unique non-null values:
['Apps, Information Technology, Mobile, Software'
 'Commerce and Shopping, Financial Services, Information Technology, Software'
 'Advertising, Other, Sales and Marketing'
 'Information Technology, Software'
 'Financial Services, Gaming, Information Technology, Internet Services, Other, Payments, Software'
 'Administrative Services, Education, Professional Services'
 'Financial Services, Lending and Investments'
 'Education, Information Technology, Software' 'Other, Software'
 'Content and Publishing, Information Technology, Media and Entertainment, Sales and Marketing, Software'
 'Commerce and Shopping, Information Technology, Software'
 'Content and Publishing, Media and Entertainment, Software, Video'
 'Advertising, Design, Internet Services, Sales and Marketing, Software'
 'Apps, Hardware, Information Technology, Internet Services, Mobile, Platforms, Software'
 'Apps, Artificial Intelligence

In [None]:




def fill_by_grouped_mode(df, target_col, group_col):
    """Fill missing values of `target_col` with the most frequent value in the row's group.

    Only missing cells are changed; values that are already present are never
    overwritten.

    Args:
        df: DataFrame to update. It is modified in place.
        target_col: Name of the column whose missing values are filled.
        group_col: Column (or list of columns) that defines the groups.

    Returns:
        The same `df` object, for call chaining.

    A cell stays missing when its group has no known value at all, or when the
    row's own group key is missing.
    """
    def _fill_group(values):
        # An all-missing group has an empty mode; leave it untouched.
        modes = values.mode()
        return values.fillna(modes.iloc[0]) if not modes.empty else values

    filled = df.groupby(group_col)[target_col].transform(_fill_group)

    # groupby drops rows whose key is NaN, so `filled` is NaN for them. Using it
    # only to plug gaps (rather than assigning it directly) keeps whatever
    # value those rows already had.
    df[target_col] = df[target_col].fillna(filled)
    return df

# Apply
df_cleaned = fill_by_grouped_mode(
    df=df_cleaned,
    target_col="Industry Groups",
    group_col="Industries",
)

# Report how many 'Industry Groups' cells are still empty
remaining_missing = df_cleaned["Industry Groups"].isnull().sum()
print("✅ Missing after imputation:", remaining_missing)

# Persist the updated working file
df_cleaned.to_csv("/content/Canada_cleaned.csv", index=False)


✅ Missing after imputation: 0


In [None]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder

# Step A: Setup
target_col = "BuiltWith - Active Tech Count"
features = [
    "Number of Employees", "Total Funding Amount",
    "Estimated Revenue Range", "Industry Groups"
]
cat_features = ["Estimated Revenue Range", "Industry Groups"]

# Step B: Prepare a shared encoder for both train and test.
# It is fitted on rows where every feature is present; categories it has not
# seen are encoded as -1.
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

non_null_rows = df_cleaned.dropna(subset=features)
encoder.fit(non_null_rows[cat_features])

# Encode a copy of the whole frame so df_cleaned keeps its readable labels
df_encoded = df_cleaned.copy()
df_encoded[cat_features] = encoder.transform(df_encoded[cat_features])

# Step C: Rows with all features and a known target train the model;
# rows with a missing target get predicted
train_df = df_encoded.dropna(subset=features + [target_col])
pred_df = df_encoded[df_cleaned[target_col].isnull()]

# Skip modelling when there is nothing to predict (e.g. the cell is re-run
# after imputation) or nothing to learn from; fit/predict raise on zero rows.
if not train_df.empty and not pred_df.empty:
    X_train = train_df[features]
    y_train = train_df[target_col]

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Step D: Predict and fill
    X_pred = pred_df[features]
    y_pred = model.predict(X_pred)

    # Fill values into original dataframe (same row order as pred_df)
    df_cleaned.loc[df_cleaned[target_col].isnull(), target_col] = y_pred

# Step E: Save
print("✅ Missing after imputation:", df_cleaned[target_col].isnull().sum())
df_cleaned.to_csv("/content/Canada_cleaned.csv", index=False)


✅ Missing after imputation: 0


In [None]:
col = "Number of Founders"

# A. How much of the column is missing
print(f"🔍 Column: {col}")
print(f"➡️ Missing: {df_cleaned[col].isnull().sum()} / {len(df_cleaned)}")
print(f"➡️ Missing %: {df_cleaned[col].isnull().mean()*100:.2f}%")

# B. Distinct founder counts present, in ascending order
print("\n🧪 Unique non-null values:")
founder_counts = df_cleaned[col].dropna().unique()
print(sorted(founder_counts))

# C. Pairwise correlation with two size-related columns, as a quick check of
#    whether they could help predict the founder count
print("\n📊 Correlation with:")
related_cols = [col, "Number of Employees", "Total Funding Amount"]
print(df_cleaned[related_cols].corr())


🔍 Column: Number of Founders
➡️ Missing: 19 / 49
➡️ Missing %: 38.78%

🧪 Unique non-null values:
[np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0)]

📊 Correlation with:
                      Number of Founders  Number of Employees  \
Number of Founders              1.000000             0.306291   
Number of Employees             0.306291             1.000000   
Total Funding Amount           -0.142456             0.088259   

                      Total Funding Amount  
Number of Founders               -0.142456  
Number of Employees               0.088259  
Total Funding Amount              1.000000  


In [None]:
col = "Number of Founders"
group_cols = ["Company Type", "Industries"]


def _fill_with_own_median(values):
    """Fill a group's missing entries with that group's median."""
    return values.fillna(values.median())


# Step 1: Median of companies sharing both Company Type and Industries
founders_by_group = df_cleaned.groupby(group_cols)[col]
df_cleaned[col] = founders_by_group.transform(_fill_with_own_median)

# Step 2: Anything still missing takes the median of the whole column
overall_median = df_cleaned[col].median()
df_cleaned[col] = df_cleaned[col].fillna(overall_median)

# Step 3: Report and persist
print("✅ Missing after imputation:", df_cleaned[col].isnull().sum())
df_cleaned.to_csv("/content/Canada_cleaned.csv", index=False)


✅ Missing after imputation: 0


In [None]:


import numpy as np

# Step A: Helper that pulls the region out of a location string
def extract_region(location):
    """Return the middle component of a three-part, comma-separated location.

    For example "Yerevan, Yerevan, Armenia" gives "Yerevan". Surrounding
    whitespace is removed from the returned component.

    Returns np.nan when `location` is missing or does not split into exactly
    three comma-separated parts.
    """
    if pd.isnull(location):
        return np.nan
    pieces = [piece.strip() for piece in location.split(",")]
    return pieces[1] if len(pieces) == 3 else np.nan

# Step B: Derive a region for every row from its 'Headquarters Location'
# (held in a local Series, so no helper column is added to the frame)
extracted_region = df_cleaned["Headquarters Location"].apply(extract_region)

# Steps C + D: Prefer the existing value, then the extracted region,
# and finally the placeholder 'Unknown'
df_cleaned["Headquarters Regions"] = (
    df_cleaned["Headquarters Regions"]
    .fillna(extracted_region)
    .fillna("Unknown")
)

# Step F: Report and persist
print("✅ Missing after imputation:", df_cleaned["Headquarters Regions"].isnull().sum())
df_cleaned.to_csv("/content/Canada_cleaned.csv", index=False)


✅ Missing after imputation: 0


In [None]:


from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder

target_col = "Number of Funding Rounds"
features = [
    "Total Funding Amount",
    "Last Funding Amount",
    "Estimated Revenue Range",
    "Funding Status",
    "Industries"
]

# Step A: Prepare Encoder
# Encode the categorical features on a copy so df_cleaned keeps its labels
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
df_encoded = df_cleaned.copy()
df_encoded[["Estimated Revenue Range", "Funding Status", "Industries"]] = encoder.fit_transform(
    df_encoded[["Estimated Revenue Range", "Funding Status", "Industries"]]
)

# Step B: Rows with all features and a known target train the model;
# rows with a missing target get predicted
train_df = df_encoded.dropna(subset=features + [target_col])
predict_df = df_encoded[df_cleaned[target_col].isnull()]

# Skip modelling when there is nothing to predict (e.g. the cell is re-run
# after imputation) or nothing to learn from; fit/predict raise on zero rows.
if not train_df.empty and not predict_df.empty:
    X_train = train_df[features]
    y_train = train_df[target_col]

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Step C: Predict
    X_pred = predict_df[features]
    y_pred = model.predict(X_pred)

    # Step D: Impute (same row order as predict_df)
    # NOTE(review): predictions are continuous, so imputed round counts can be
    # fractional — confirm whether they should be rounded to whole rounds.
    df_cleaned.loc[df_cleaned[target_col].isnull(), target_col] = y_pred

# Step E: Save
print("✅ Missing after imputation:", df_cleaned[target_col].isnull().sum())
df_cleaned.to_csv("/content/Canada_cleaned.csv", index=False)


✅ Missing after imputation: 0


In [None]:
# List the distinct 'Last Funding Type' labels present (missing values excluded)
last_funding_types = df_cleaned["Last Funding Type"].dropna().unique()
print(last_funding_types)


['Grant' 'Seed' 'Initial Coin Offering' 'Product Crowdfunding' 'Pre-Seed'
 'Venture - Series Unknown' 'Series A' 'Undisclosed' 'Equity Crowdfunding']


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder

target_col = "Last Funding Type"
features = [
    "Funding Status",
    "Estimated Revenue Range",
    "Total Funding Amount",
    "Number of Funding Rounds",
    "Industries",
    "Last Funding Amount"
]

# Step A: Ordinal Encode categorical features
# (on a copy, so df_cleaned keeps its readable labels)
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
df_encoded = df_cleaned.copy()
df_encoded[["Funding Status", "Estimated Revenue Range", "Industries"]] = encoder.fit_transform(
    df_encoded[["Funding Status", "Estimated Revenue Range", "Industries"]]
)

# Step B: Label encode the target (label -> integer in order of first appearance)
label_mapping = {label: idx for idx, label in enumerate(df_encoded[target_col].dropna().unique())}
inv_label_mapping = {v: k for k, v in label_mapping.items()}
df_encoded[target_col] = df_encoded[target_col].map(label_mapping)

# Step C: Prepare training and prediction sets
train_df = df_encoded.dropna(subset=features + [target_col])
predict_df = df_encoded[df_cleaned[target_col].isnull()]

# Skip modelling when there is nothing to predict (e.g. the cell is re-run
# after imputation) or nothing to learn from; fit/predict raise on zero rows.
if not train_df.empty and not predict_df.empty:
    X_train = train_df[features]
    y_train = train_df[target_col]

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # Predict for missing
    X_pred = predict_df[features]
    y_pred = clf.predict(X_pred)

    # Step D: Impute predicted values, translated back to the original labels
    predicted_labels = [inv_label_mapping[int(label)] for label in y_pred]
    df_cleaned.loc[df_cleaned[target_col].isnull(), target_col] = predicted_labels

# Step E: Save
print("✅ Missing after imputation:", df_cleaned[target_col].isnull().sum())
df_cleaned.to_csv("/content/Canada_cleaned.csv", index=False)


✅ Missing after imputation: 0


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder

# Impute missing "SEMrush - Monthly Visits" with a random-forest regressor.
target_col = "SEMrush - Monthly Visits"
features = [
    "Estimated Revenue Range",
    "Number of Employees",
    "Industries",
    "Last Funding Amount"
]
categorical_features = ["Estimated Revenue Range", "Industries"]

# Encode categorical features on a copy so df_cleaned keeps the raw categories.
df_encoded = df_cleaned.copy()
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
df_encoded[categorical_features] = encoder.fit_transform(df_encoded[categorical_features])

# Train only on rows where every feature and the target are present.
train_df = df_encoded.dropna(subset=features + [target_col])
X_train = train_df[features]
y_train = train_df[target_col]

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and impute the rows whose target is missing. The guard keeps the
# cell re-runnable: scikit-learn raises on a zero-row prediction frame, which
# is exactly what a second run (nothing left to impute) would produce.
missing_mask = df_cleaned[target_col].isnull()
predict_df = df_encoded[missing_mask]
if not predict_df.empty:
    X_pred = predict_df[features]
    y_pred = model.predict(X_pred)
    df_cleaned.loc[missing_mask, target_col] = y_pred

# Save and confirm
print("✅ Missing after imputation:", df_cleaned[target_col].isnull().sum())
df_cleaned.to_csv("/content/Canada_cleaned.csv", index=False)


✅ Missing after imputation: 0


In [None]:
# Impute missing "SEMrush - Page Views / Visit" with a random-forest regressor.
target_col = "SEMrush - Page Views / Visit"
features = [
    "Estimated Revenue Range",
    "Industries",
    "Number of Employees",
    "SEMrush - Monthly Visits"
]
categorical_features = ["Estimated Revenue Range", "Industries"]

# Encode categorical columns on a copy so df_cleaned keeps the raw categories.
df_encoded = df_cleaned.copy()
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
df_encoded[categorical_features] = encoder.fit_transform(df_encoded[categorical_features])

# Train only on rows where every feature and the target are present.
train_df = df_encoded.dropna(subset=features + [target_col])
X_train = train_df[features]
y_train = train_df[target_col]

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and fill the missing rows. Skipped when nothing is missing, because
# scikit-learn raises on a zero-row input (the situation on any re-run).
missing_mask = df_cleaned[target_col].isnull()
predict_df = df_encoded[missing_mask]
if not predict_df.empty:
    X_pred = predict_df[features]
    y_pred = model.predict(X_pred)
    df_cleaned.loc[missing_mask, target_col] = y_pred

# Save and confirm
print("✅ Missing after imputation:", df_cleaned[target_col].isnull().sum())
df_cleaned.to_csv("/content/Canada_cleaned.csv", index=False)


✅ Missing after imputation: 0


In [None]:
# Impute missing "SEMrush - Visit Duration" with a random-forest regressor.
target_col = "SEMrush - Visit Duration"
features = [
    "SEMrush - Page Views / Visit",
    "SEMrush - Monthly Visits",
    "Estimated Revenue Range",
    "Industries"
]
categorical_features = ["Estimated Revenue Range", "Industries"]

# Ordinal-encode on a copy so df_cleaned keeps the raw categories.
df_encoded = df_cleaned.copy()
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
df_encoded[categorical_features] = encoder.fit_transform(df_encoded[categorical_features])

# Training data: rows where every feature and the target are present.
train_df = df_encoded.dropna(subset=features + [target_col])
X_train = train_df[features]
y_train = train_df[target_col]

# Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and impute the missing rows. Skipped when nothing is missing, because
# scikit-learn raises on a zero-row input (the situation on any re-run).
missing_mask = df_cleaned[target_col].isnull()
predict_df = df_encoded[missing_mask]
if not predict_df.empty:
    X_pred = predict_df[features]
    y_pred = model.predict(X_pred)
    df_cleaned.loc[missing_mask, target_col] = y_pred

# Save
print("✅ Missing after imputation:", df_cleaned[target_col].isnull().sum())
df_cleaned.to_csv("/content/Canada_cleaned.csv", index=False)


✅ Missing after imputation: 0


In [None]:
# Impute missing "SEMrush - Bounce Rate" with a random-forest regressor.
target_col = "SEMrush - Bounce Rate"

# Step A: Strip the '%' sign and convert to float. Values that are already
# floats survive the str() round-trip, so re-running this step is harmless.
df_cleaned[target_col] = df_cleaned[target_col].apply(
    lambda x: float(str(x).replace('%', '')) if pd.notnull(x) else np.nan
)

# Features for prediction
features = [
    "SEMrush - Visit Duration",
    "SEMrush - Page Views / Visit",
    "Estimated Revenue Range",
    "Industries"
]
categorical_features = ["Estimated Revenue Range", "Industries"]

# Encode categoricals on a copy so df_cleaned keeps the raw categories.
df_encoded = df_cleaned.copy()
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
df_encoded[categorical_features] = encoder.fit_transform(df_encoded[categorical_features])

# Train data: rows where every feature and the target are present.
train_df = df_encoded.dropna(subset=features + [target_col])
X_train = train_df[features]
y_train = train_df[target_col]

# Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and fill the missing rows. Skipped when nothing is missing, because
# scikit-learn raises on a zero-row input (the situation on any re-run).
missing_mask = df_cleaned[target_col].isnull()
predict_df = df_encoded[missing_mask]
if not predict_df.empty:
    X_pred = predict_df[features]
    y_pred = model.predict(X_pred)
    df_cleaned.loc[missing_mask, target_col] = y_pred

# Optional: Reconvert to percentage string if needed
# df_cleaned[target_col] = df_cleaned[target_col].apply(lambda x: f"{x:.2f}%" if pd.notnull(x) else x)

# Save
print("✅ Missing after imputation:", df_cleaned[target_col].isnull().sum())
df_cleaned.to_csv("/content/Canada_cleaned.csv", index=False)


✅ Missing after imputation: 0


In [None]:
# List every column whose name mentions SEMrush.
semrush_columns = [name for name in df_cleaned.columns if "SEMrush" in name]
print(semrush_columns)


['SEMrush - Monthly Visits', 'SEMrush - Visit Duration', 'SEMrush - Page Views / Visit', 'SEMrush - Bounce Rate']


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
import pandas as pd

# Impute missing "Apptopia - Downloads Last 30 Days" with a random-forest regressor.
target_col = "Apptopia - Downloads Last 30 Days"

# Features
features = [
    "Apptopia - Number of Apps",
    "Estimated Revenue Range",
    "Industries",
    "Number of Employees",
    "Funding Status"
]
categorical_features = ["Estimated Revenue Range", "Industries", "Funding Status"]

# Step 1: Encode categoricals on a copy so df_cleaned keeps the raw categories.
df_encoded = df_cleaned.copy()
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
df_encoded[categorical_features] = encoder.fit_transform(df_encoded[categorical_features])

# Step 2: Clean target (remove thousands separators, convert to float).
# Anything that still fails to parse becomes NaN and is imputed below.
df_encoded[target_col] = df_encoded[target_col].astype(str).str.replace(",", "")
df_encoded[target_col] = pd.to_numeric(df_encoded[target_col], errors="coerce")

# Write the cleaned numeric column back. Otherwise df_cleaned would keep the
# raw comma-formatted strings next to the float predictions, leaving a
# mixed-type column in the saved CSV.
df_cleaned[target_col] = df_encoded[target_col]

# Step 3: Filter training data
train_df = df_encoded.dropna(subset=features + [target_col])
X_train = train_df[features]
y_train = train_df[target_col]

# Step 4: Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 5: Predict and fill missing values. Skipped when nothing is missing,
# because scikit-learn raises on a zero-row input (the situation on any re-run).
missing_mask = df_encoded[target_col].isnull()
predict_df = df_encoded[missing_mask]
if not predict_df.empty:
    X_pred = predict_df[features]
    y_pred = model.predict(X_pred)
    df_cleaned.loc[missing_mask, target_col] = y_pred

# Step 6: Save
print("✅ Missing after imputation:", df_cleaned[target_col].isnull().sum())
df_cleaned.to_csv("/content/Canada_cleaned.csv", index=False)


✅ Missing after imputation: 0


In [None]:
# Show the distinct non-null app counts before imputing the column.
app_counts = df_cleaned["Apptopia - Number of Apps"]
print(app_counts.dropna().unique())


[133.   2.   1.  15.   3.]


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# 📂 Work on a copy so encoding never touches df_cleaned directly
df_encoded = df_cleaned.copy()

# 🎯 Target column to impute
target_col = "Apptopia - Number of Apps"

# 🔍 Features to use
features = [
    "Estimated Revenue Range",
    "Industries",
    "Funding Status",
    "Apptopia - Downloads Last 30 Days",
    "Number of Employees"
]

# 🧼 Convert target column to numeric (remove commas, convert to float)
df_encoded[target_col] = pd.to_numeric(df_encoded[target_col].astype(str).str.replace(",", ""), errors='coerce')

# 🧼 Also clean the numeric-looking downloads feature
df_encoded["Apptopia - Downloads Last 30 Days"] = pd.to_numeric(
    df_encoded["Apptopia - Downloads Last 30 Days"].astype(str).str.replace(",", ""), errors='coerce'
)

# 🛑 Split into training rows (complete) and rows to impute (target missing).
# Explicit copies: the frames are modified below, and writing into a slice of
# df_encoded triggers SettingWithCopyWarning with ambiguous results.
missing_mask = df_encoded[target_col].isnull()
train_df = df_encoded.dropna(subset=features + [target_col]).copy()
predict_df = df_encoded[missing_mask].copy()

# 🧠 Categorical features
cat_features = ["Estimated Revenue Range", "Industries", "Funding Status"]

# 🔧 Fill missing categories, then cast to string. The fill must come first:
# astype(str) turns NaN into the literal "nan", after which fillna is a no-op.
for col in cat_features:
    train_df[col] = train_df[col].fillna("Unknown").astype(str)
    predict_df[col] = predict_df[col].fillna("Unknown").astype(str)

# 🧠 Encode categorical features (categories unseen in training map to -1)
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
train_df[cat_features] = encoder.fit_transform(train_df[cat_features])

# 🎯 Prepare training set
X_train = train_df[features]
y_train = train_df[target_col].astype(float)

# 🤖 Train Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 🔮 Predict missing values. Skipped when nothing is missing, because
# scikit-learn raises on a zero-row input (the situation on any re-run).
if not predict_df.empty:
    predict_df[cat_features] = encoder.transform(predict_df[cat_features])
    X_pred = predict_df[features]
    y_pred = model.predict(X_pred)
    df_encoded.loc[missing_mask, target_col] = y_pred

# ✅ Update cleaned dataframe
df_cleaned[target_col] = df_encoded[target_col]

# 💾 Save to CSV
df_cleaned.to_csv("/content/Canada_cleaned.csv", index=False)

# ✅ Final confirmation
print(f"✅ Missing after imputation: {df_cleaned[target_col].isnull().sum()}")


✅ Missing after imputation: 0


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd

# 🛠️ Configuration
target_col = "SEMrush - Monthly Visits"   # change this to your target column
cat_features = ["Industry Groups", "Estimated Revenue Range", "Funding Status"]  # adjust as needed

# 🔍 Split on target availability: known rows train the model, missing rows get predicted
target_known = df_cleaned[target_col].notnull()
train_df = df_cleaned[target_known].copy()
predict_df = df_cleaned[~target_known].copy()

# 🧼 Clean target column (remove %, commas, convert to float)
def clean_numeric_col(col):
    """Convert a column of number-like values to float.

    Thousands separators (",") and percent signs ("%") are stripped before the
    conversion. Missing entries, which stringify to "nan", come back as NaN.

    Parameters
    ----------
    col : pandas.Series
        Column holding numbers or number-like strings.

    Returns
    -------
    pandas.Series
        Float series with the same index as ``col``.

    Raises
    ------
    ValueError
        If a value still cannot be parsed as a float after stripping.
    """
    stripped = (
        col.astype(str)
           .str.replace(",", "", regex=False)
           .str.replace("%", "", regex=False)
    )
    # Blank out stringified missing values with NaN rather than pd.NA:
    # an object column containing pd.NA cannot be cast with astype(float).
    return stripped.where(stripped != "nan").astype(float)

# Normalise the target to float in both the training frame and df_cleaned.
train_df[target_col] = clean_numeric_col(train_df[target_col])
df_cleaned[target_col] = clean_numeric_col(df_cleaned[target_col])

# 🔧 Fill missing categories, then cast to string. The fill must come first:
# astype(str) turns NaN into the literal "nan", after which fillna is a no-op.
for col in cat_features:
    train_df[col] = train_df[col].fillna("Unknown").astype(str)
    predict_df[col] = predict_df[col].fillna("Unknown").astype(str)

# 🎯 Define features (this model uses the categorical columns only)
features = cat_features

# 🔢 Encode categorical features safely (categories unseen in training map to -1)
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
train_df[cat_features] = encoder.fit_transform(train_df[cat_features])

if not predict_df.empty:
    predict_df[cat_features] = encoder.transform(predict_df[cat_features])

# ✅ Model training
X_train = train_df[features]
y_train = train_df[target_col]
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 🔮 Prediction and imputation
if not predict_df.empty:
    X_pred = predict_df[features]
    y_pred = model.predict(X_pred)
    # Write back by the index of the rows that were actually predicted, so the
    # number of predictions always matches the number of target cells.
    # Assumes df_cleaned has a unique index — TODO confirm.
    df_cleaned.loc[predict_df.index, target_col] = y_pred

# ✅ Final check
print(f"✅ Missing after imputation: {df_cleaned[target_col].isnull().sum()}")


✅ Missing after imputation: 0


In [None]:
# 🧼 Per-column count and percentage of missing values
missing_info = df_cleaned.isnull().sum().to_frame("Missing Count")
missing_info["Missing_Percentage"] = (missing_info["Missing Count"] / len(df_cleaned)) * 100
missing_info = missing_info.sort_values(by="Missing Count", ascending=False)

# 🖨️ Keep and show only the columns that still have gaps
has_missing = missing_info["Missing Count"] > 0
missing_info = missing_info[has_missing]
print(missing_info)


                                  Missing Count  Missing_Percentage
Actively Hiring                              49          100.000000
Number of Exits                              49          100.000000
Valuation at IPO                             49          100.000000
Price                                        49          100.000000
IPqwery - Patents Granted                    47           95.918367
Number of Investors                          39           79.591837
Number of Events                             38           77.551020
G2 Stack - Total Products Active             37           75.510204
Number of Articles                           30           61.224490


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder

# 🧹 Work on a copy so encoding never touches df_cleaned directly
df_encoded = df_cleaned.copy()

# 🎯 Target column
target_col = "Number of Articles"

# 🧠 Features to predict the target
features = [
    "Number of Employees",
    "Total Funding Amount",
    "Estimated Revenue Range",
    "Industries",
    "Funding Status",
    "Number of Founders"
]

# 🧹 Clean target column: remove commas and convert to float
df_encoded[target_col] = df_encoded[target_col].astype(str).str.replace(",", "").replace("nan", pd.NA)
df_encoded[target_col] = pd.to_numeric(df_encoded[target_col], errors='coerce')

# 🎯 Split into training rows (complete) and rows to impute (target missing).
# Explicit copies: the frames are modified below, and writing into a slice of
# df_encoded triggers SettingWithCopyWarning with ambiguous results.
missing_mask = df_encoded[target_col].isnull()
train_df = df_encoded.dropna(subset=features + [target_col]).copy()
predict_df = df_encoded[missing_mask].copy()

# 🧠 Categorical features: every feature stored with object dtype.
# NOTE(review): a numeric feature stored as text (e.g. comma-formatted amounts)
# would be treated as a category here — confirm the dtypes upstream.
cat_features = [col for col in features if df_encoded[col].dtype == 'object']

# Step 1: Fill NaNs with 'Unknown', then cast to string. The fill must come
# first: astype(str) turns NaN into the literal "nan", making fillna a no-op.
for col in cat_features:
    train_df[col] = train_df[col].fillna('Unknown').astype(str)
    predict_df[col] = predict_df[col].fillna('Unknown').astype(str)

# Step 2: Ordinal encoding (categories unseen in training map to -1)
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
train_df[cat_features] = encoder.fit_transform(train_df[cat_features])

# Step 3: Model training
X_train = train_df[features]
y_train = train_df[target_col]

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 4: Predict and fill. Skipped when nothing is missing, because
# scikit-learn raises on a zero-row input (the situation on any re-run).
if not predict_df.empty:
    predict_df[cat_features] = encoder.transform(predict_df[cat_features])
    X_pred = predict_df[features]
    y_pred = model.predict(X_pred)
    df_encoded.loc[missing_mask, target_col] = y_pred

# ✅ Update the main cleaned DataFrame
df_cleaned[target_col] = df_encoded[target_col]

# 💾 Save to CSV
df_cleaned.to_csv("/content/Canada_cleaned.csv", index=False)

# ✅ Final check
print(f"✅ Missing after imputation: {df_cleaned[target_col].isnull().sum()}")


✅ Missing after imputation: 0


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

# ✅ Define target and features (REMOVE the missing column)
target_col = "Number of Investors"
features = [
    "Estimated Revenue Range",
    "Industries",
    "Funding Status",
    "Total Funding Amount",
    "Number of Funding Rounds",
    "Last Funding Amount"
]

# ✅ Work on a fresh copy
df_encoded = df_cleaned.copy()

# ✅ Handle numeric strings: remove commas, treat the "Unknown" placeholder as
# missing, and convert to float
for col in ["Total Funding Amount", "Last Funding Amount"]:
    df_encoded[col] = (
        df_encoded[col]
        .astype(str)
        .str.replace(",", "", regex=False)
        .replace("Unknown", np.nan)
        .astype(float)
    )

# ✅ Split into training rows (complete) and rows to impute (target missing).
# Explicit copies: the frames are modified below, and writing into a slice of
# df_encoded triggers SettingWithCopyWarning with ambiguous results.
missing_mask = df_encoded[target_col].isnull()
train_df = df_encoded.dropna(subset=features + [target_col]).copy()
predict_df = df_encoded[missing_mask].copy()

# ✅ Fill missing categories, then cast to string. The fill must come first:
# astype(str) turns NaN into the literal "nan", after which fillna is a no-op.
cat_features = ["Estimated Revenue Range", "Industries", "Funding Status"]
for col in cat_features:
    train_df[col] = train_df[col].fillna("Unknown").astype(str)
    predict_df[col] = predict_df[col].fillna("Unknown").astype(str)

# ✅ Encode categorical columns (categories unseen in training map to -1)
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
train_df[cat_features] = encoder.fit_transform(train_df[cat_features])

# ✅ Prepare data for model
X_train = train_df[features]
y_train = train_df[target_col].astype(float)

# ✅ Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# ✅ Predict and fill back. Skipped when nothing is missing, because
# scikit-learn raises on a zero-row input (the situation on any re-run).
if not predict_df.empty:
    predict_df[cat_features] = encoder.transform(predict_df[cat_features])
    X_pred = predict_df[features]
    y_pred = model.predict(X_pred)
    df_encoded.loc[missing_mask, target_col] = y_pred
df_cleaned[target_col] = df_encoded[target_col]

# ✅ Save file
df_cleaned.to_csv("/content/Canada_cleaned.csv", index=False)

# ✅ Summary
missing_after = df_cleaned[target_col].isnull().sum()
print(f"✅ Missing after imputation: {missing_after}")


✅ Missing after imputation: 0


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder

# 🧹 Step 1: Setup (work on a copy so encoding never touches df_cleaned)
df_encoded = df_cleaned.copy()
target_col = "G2 Stack - Total Products Active"

# 🧠 Step 2: Select features (add/remove based on your domain knowledge)
features = [
    "Estimated Revenue Range",
    "Industries",
    "Funding Status",
    "Number of Employees",
    "Total Funding Amount",
    "Apptopia - Number of Apps",
    "Apptopia - Downloads Last 30 Days"
]

# 🧪 Step 3: Split into training rows (complete) and rows to impute.
# Explicit copies: the frames are modified below, and writing into a slice of
# df_encoded triggers SettingWithCopyWarning with ambiguous results.
missing_mask = df_encoded[target_col].isnull()
train_df = df_encoded.dropna(subset=features + [target_col]).copy()
predict_df = df_encoded[missing_mask].copy()

# 🧼 Step 4: Identify categorical features to encode (object-dtype features)
cat_features = train_df[features].select_dtypes(include='object').columns.tolist()

# 💡 Step 5: Fill missing categories, then cast to string. The fill must come
# first: astype(str) turns NaN into the literal "nan", making fillna a no-op.
for col in cat_features:
    train_df[col] = train_df[col].fillna('Unknown').astype(str)
    predict_df[col] = predict_df[col].fillna('Unknown').astype(str)

# 🚀 Step 6: Encode categorical columns (categories unseen in training map to -1)
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
train_df[cat_features] = encoder.fit_transform(train_df[cat_features])

# 🧠 Step 7: Prepare the training set
X_train = train_df[features]
y_train = train_df[target_col].astype(str).str.replace(",", "").astype(float)

# 🏗️ Step 8: Train model and predict. Prediction is skipped when nothing is
# missing, because scikit-learn raises on a zero-row input (any re-run).
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
if not predict_df.empty:
    predict_df[cat_features] = encoder.transform(predict_df[cat_features])
    X_pred = predict_df[features]
    y_pred = model.predict(X_pred)
    # 💾 Step 9: Fill predicted values back into the working copy
    df_encoded.loc[missing_mask, target_col] = y_pred
df_cleaned[target_col] = df_encoded[target_col]

# 💾 Persist the result like the other imputation cells do. Later cells reload
# df_cleaned from this CSV, so without the save this imputation is discarded.
df_cleaned.to_csv("/content/Canada_cleaned.csv", index=False)

# ✅ Step 10: Confirm
print(f"✅ Missing after imputation: {df_cleaned[target_col].isnull().sum()}")


✅ Missing after imputation: 0


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder

# Load the cleaned data
df_cleaned = pd.read_csv('/content/Canada_cleaned.csv')

target_col = "Actively Hiring"
features = [
    "Number of Employees",
    "Estimated Revenue Range",
    "Funding Status",
    "Company Type"
]

# Fill NaNs in the feature columns on a working copy only. Filling df_cleaned
# itself would write the -1 / 'Unknown' placeholders into the CSV saved at the
# end of this cell, even when imputation is skipped.
# The fill comes before the string cast: astype(str) turns NaN into the
# literal "nan", after which fillna is a no-op.
df_encoded = df_cleaned.copy()
for col in features:
    if df_encoded[col].dtype == 'object':
        df_encoded[col] = df_encoded[col].fillna('Unknown').astype(str)
    else:
        df_encoded[col] = df_encoded[col].fillna(-1)

# Split train/predict sets
train_df = df_encoded[df_encoded[target_col].notnull()].copy()
predict_df = df_encoded[df_encoded[target_col].isnull()].copy()

# If no training data or no prediction data, skip
if train_df.empty or predict_df.empty:
    print(f"ℹ️ Skipping {target_col} — train or predict set is empty.")
else:
    # Identify categorical features
    cat_features = [col for col in features if train_df[col].dtype == 'object']

    # Encode categorical variables (categories unseen in training map to -1)
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    train_df[cat_features] = encoder.fit_transform(train_df[cat_features])
    predict_df[cat_features] = encoder.transform(predict_df[cat_features])

    # Train and predict
    X_train = train_df[features]
    y_train = train_df[target_col]
    X_pred = predict_df[features]

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_pred)

    # Update only the target column of the main DataFrame
    df_cleaned.loc[predict_df.index, target_col] = y_pred

    print(f"✅ Missing after imputation for {target_col}: {df_cleaned[target_col].isnull().sum()}")

# Save CSV
df_cleaned.to_csv('/content/Canada_cleaned.csv', index=False)


ℹ️ Skipping Actively Hiring — train or predict set is empty.


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder

# Load the cleaned data
df_cleaned = pd.read_csv('/content/Canada_cleaned.csv')

target_col = "Actively Hiring"
features = [
    "Number of Employees",
    "Estimated Revenue Range",
    "Funding Status",
    "Company Type"
]

# Work on a copy so the placeholder fills below never reach df_cleaned.
df_encoded = df_cleaned.copy()

# Identify categorical features (object-dtype feature columns)
cat_features = [col for col in features if df_encoded[col].dtype == 'object']

# Split train/predict sets
train_df = df_encoded.dropna(subset=[target_col]).copy()
predict_df = df_encoded[df_encoded[target_col].isnull()].copy()

# Only run if both sets have rows
if train_df.empty or predict_df.empty:
    print(f"ℹ️ Skipping {target_col} — No missing values or no training data.")
else:
    # Fill missing feature values before encoding. For categoricals the fill
    # must come before the string cast: astype(str) turns NaN into the literal
    # "nan", after which fillna is a no-op.
    for col in features:
        if col in cat_features:
            train_df[col] = train_df[col].fillna('Unknown').astype(str)
            predict_df[col] = predict_df[col].fillna('Unknown').astype(str)
        else:
            train_df[col] = train_df[col].fillna(-1)
            predict_df[col] = predict_df[col].fillna(-1)

    # Encode categorical variables (categories unseen in training map to -1)
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    train_df[cat_features] = encoder.fit_transform(train_df[cat_features])
    predict_df[cat_features] = encoder.transform(predict_df[cat_features])

    # Train and predict
    X_train = train_df[features]
    y_train = train_df[target_col]
    X_pred = predict_df[features]

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_pred)

    # Update missing values
    df_cleaned.loc[predict_df.index, target_col] = y_pred

    print(f"✅ Missing after imputation for {target_col}: {df_cleaned[target_col].isnull().sum()}")

# Save final CSV
df_cleaned.to_csv('/content/Canada_cleaned.csv', index=False)


ℹ️ Skipping Actively Hiring — No missing values or no training data.


In [None]:
# Inspect one column: number of distinct values (NaN counted) and the values themselves.
col_name = "Actively Hiring"  # change as needed
inspected_column = df_cleaned[col_name]
print(f"Unique count: {inspected_column.nunique(dropna=False)}")
print("Sample values:", inspected_column.unique())


Unique count: 1
Sample values: [nan]


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

# Load the cleaned data
df_cleaned = pd.read_csv('/content/Canada_cleaned.csv')

# --- Setup for Imputation ---
target_col = "Price"
features = [
    "Total Funding Amount",
    "Valuation at IPO",
    "Number of Employees",
    "Funding Status",
    "Industries"
]

# Create a copy for encoding
df_encoded = df_cleaned.copy()

# Filter for rows where a Price might exist (e.g., acquisitions, IPOs)
price_relevant_df = df_encoded[df_encoded['Funding Status'].isin(['M&A', 'IPO', 'Private Equity'])].copy()

# --- Preprocessing and Encoding ---
# Ensure numeric features are floats (unparseable values become NaN)
for col in ["Total Funding Amount", "Valuation at IPO", "Number of Employees"]:
    price_relevant_df[col] = pd.to_numeric(price_relevant_df[col], errors='coerce')

# Identify categorical features (object-dtype feature columns)
cat_features = [col for col in features if price_relevant_df[col].dtype == 'object']

# Split into training and prediction sets within the filtered data.
# Explicit copies: the frames are modified below, and writing into a slice of
# price_relevant_df triggers SettingWithCopyWarning with ambiguous results.
train_df = price_relevant_df.dropna(subset=features + [target_col]).copy()
predict_df = price_relevant_df[price_relevant_df[target_col].isnull()].copy()

if len(train_df) < 5:
    print(f"⚠️ Not enough training data for {target_col}. Imputation skipped.")
elif not predict_df.empty:
    # (An empty prediction set is skipped: scikit-learn raises on zero rows.)
    # Fill missing categories, then cast to string. The fill must come first:
    # astype(str) turns NaN into the literal "nan", making fillna a no-op.
    for col in cat_features:
        train_df[col] = train_df[col].fillna('Unknown').astype(str)
        predict_df[col] = predict_df[col].fillna('Unknown').astype(str)

    # Use OrdinalEncoder to transform categorical features
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    train_df[cat_features] = encoder.fit_transform(train_df[cat_features])
    predict_df[cat_features] = encoder.transform(predict_df[cat_features])

    # --- Train and Predict ---
    X_train = train_df[features]
    y_train = train_df[target_col]
    X_pred = predict_df[features]

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_pred)

    # --- Impute ---
    df_cleaned.loc[predict_df.index, target_col] = y_pred

# Save the final cleaned dataframe
df_cleaned.to_csv('/content/Canada_cleaned.csv', index=False)

print(f"✅ Missing after imputation for {target_col}: {df_cleaned[target_col].isnull().sum()}")

⚠️ Not enough training data for Price. Imputation skipped.
✅ Missing after imputation for Price: 49


In [None]:

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

# Load the cleaned data
df_cleaned = pd.read_csv('/content/Canada_cleaned.csv')

# --- Setup for Imputation ---
target_col = "Number of Events"
features = [
    "Number of Employees",
    "Number of Articles",
    "Number of Funding Rounds",
    "Estimated Revenue Range",
    "Industries"
]

# Create a copy for encoding
df_encoded = df_cleaned.copy()

# --- Preprocessing and Encoding ---
# Ensure numeric features are floats (unparseable values become NaN)
for col in ["Number of Employees", "Number of Articles", "Number of Funding Rounds"]:
    df_encoded[col] = pd.to_numeric(df_encoded[col], errors='coerce')

# Identify categorical features (object-dtype feature columns)
cat_features = [col for col in features if df_encoded[col].dtype == 'object']

# Split into training rows (complete) and rows to impute (target missing).
# Explicit copies: the frames are modified below, and writing into a slice of
# df_encoded triggers SettingWithCopyWarning with ambiguous results.
missing_mask = df_encoded[target_col].isnull()
train_df = df_encoded.dropna(subset=features + [target_col]).copy()
predict_df = df_encoded[missing_mask].copy()

# Fill missing categories, then cast to string. The fill must come first:
# astype(str) turns NaN into the literal "nan", making fillna a no-op.
for col in cat_features:
    train_df[col] = train_df[col].fillna('Unknown').astype(str)
    predict_df[col] = predict_df[col].fillna('Unknown').astype(str)

# Use OrdinalEncoder to transform categorical features
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
train_df[cat_features] = encoder.fit_transform(train_df[cat_features])

# --- Train and Predict ---
X_train = train_df[features]
y_train = train_df[target_col]

# Train a RandomForestRegressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict the missing values. Skipped when nothing is missing, because
# scikit-learn raises on a zero-row input (the situation on any re-run).
if not predict_df.empty:
    predict_df[cat_features] = encoder.transform(predict_df[cat_features])
    X_pred = predict_df[features]
    y_pred = model.predict(X_pred)
    # Event counts are whole numbers, so round the regressor output.
    df_cleaned.loc[missing_mask, target_col] = np.round(y_pred)

# --- Save ---
df_cleaned.to_csv('/content/Canada_cleaned.csv', index=False)

print(f"✅ Missing after imputation for {target_col}: {df_cleaned[target_col].isnull().sum()}")

✅ Missing after imputation for Number of Events: 0


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

# Load the cleaned data
df_cleaned = pd.read_csv('/content/Canada_cleaned.csv')

# --- Step A: Clean Valuation column (strip commas, coerce to float) ---
target_col = "Valuation at IPO"
df_cleaned[target_col] = pd.to_numeric(
    df_cleaned[target_col].astype(str).str.replace(",", ""), errors='coerce'
)

# --- Step B: Focus only on IPO companies ---
ipo_df = df_cleaned[df_cleaned["Funding Status"] == "IPO"].copy()

# --- Step C: Prepare Data and Features ---
features = [
    "Total Funding Amount",
    "Last Funding Amount",
    "Estimated Revenue Range",
    "CB Rank (Company)",
    "Number of Employees",
    "Industries"
]
cat_features = ["Estimated Revenue Range", "Industries"]

# Explicit copies: the frames are modified below, and writing into a slice of
# ipo_df triggers SettingWithCopyWarning with ambiguous results.
train_df = ipo_df.dropna(subset=features + [target_col]).copy()
test_df = ipo_df[ipo_df[target_col].isnull()].copy()

if len(train_df) < 5:
    print("⚠️ Not enough IPO rows with real valuations. Skipping imputation.")
elif test_df.empty:
    print(f"ℹ️ No missing values found for {target_col}. Nothing to predict.")
else:
    # Fill missing categories, then cast to string. The fill must come first:
    # astype(str) turns NaN into the literal "nan", making fillna a no-op.
    for col in cat_features:
        train_df[col] = train_df[col].fillna('Unknown').astype(str)
        test_df[col] = test_df[col].fillna('Unknown').astype(str)

    # Categories unseen in training map to -1.
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    train_df[cat_features] = encoder.fit_transform(train_df[cat_features])
    test_df[cat_features] = encoder.transform(test_df[cat_features])

    X_train = train_df[features]
    y_train = train_df[target_col]
    X_test = test_df[features]

    # --- Step D: Train model ---
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # --- Step E: Predict and insert ---
    y_pred = model.predict(X_test)
    df_cleaned.loc[test_df.index, target_col] = y_pred
    print(f"✅ Imputed {len(test_df)} rows for {target_col}")

# Save file
df_cleaned.to_csv('/content/Canada_cleaned.csv', index=False)
print(f"✅ Missing after imputation for {target_col}: {df_cleaned[target_col].isnull().sum()}")


⚠️ Not enough IPO rows with real valuations. Skipping imputation.
✅ Missing after imputation for Valuation at IPO: 49


In [None]:
# Columns that still had gaps after the previous imputation passes.
cols_to_check = [
    "Price",
    "Valuation at IPO",
    "IPqwery - Patents Granted",
    "G2 Stack - Total Products Active"
]

# For each column report how many distinct values it holds (NaN counted as one)
# and up to 20 of its distinct non-null values.
for col in cols_to_check:
    print(f"\n--- {col} ---")
    column_values = df_cleaned[col]
    print("Unique count:", column_values.nunique(dropna=False))
    print("Sample values:")
    distinct_non_null = column_values.dropna().unique()
    print(distinct_non_null[:20])  # first 20 unique non-null values




--- Price ---
Unique count: 1
Sample values:
[]

--- Valuation at IPO ---
Unique count: 1
Sample values:
[]

--- IPqwery - Patents Granted ---
Unique count: 2
Sample values:
[0.]

--- G2 Stack - Total Products Active ---
Unique count: 11
Sample values:
[18.  6. 24. 25. 14. 11.  4. 29. 16. 17.]


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

# Load data
df_cleaned = pd.read_csv('/content/Canada_cleaned.csv')

# Numeric target columns whose missing values we try to predict
target_cols = [
    "Price",
    "Valuation at IPO",
    "IPqwery - Patents Granted",
    "G2 Stack - Total Products Active"
]

# Predictor columns shared by every target
base_features = [
    "Total Funding Amount",
    "Last Funding Amount",
    "Estimated Revenue Range",
    "CB Rank (Company)",
    "Number of Employees",
    "Industries"
]

# Subset of base_features that is ordinal-encoded before fitting
cat_features = ["Estimated Revenue Range", "Industries"]

for target_col in target_cols:
    print(f"\n🚀 Processing {target_col} ...")

    # Step 1: Force numeric (strip thousands separators; anything unparsable becomes NaN)
    df_cleaned[target_col] = pd.to_numeric(
        df_cleaned[target_col].astype(str).str.replace(",", "").str.strip(),
        errors='coerce'
    )

    # Step 2: Train/predict split
    # Training rows need the target AND every feature; rows to predict are those
    # where the target is missing.
    train_df = df_cleaned.dropna(subset=base_features + [target_col]).copy()
    test_df = df_cleaned[df_cleaned[target_col].isnull()].copy()

    if len(train_df) < 5:
        print(f"⚠️ Skipping {target_col} — not enough rows to train ({len(train_df)} available).")
        continue

    # Without this guard, predicting on an empty frame raises instead of
    # simply leaving an already-complete column alone.
    if test_df.empty:
        print(f"ℹ️ No missing values found for {target_col}. Nothing to predict.")
        continue

    # Step 3: Handle categoricals
    # fillna must run BEFORE astype(str): astype(str) turns NaN into the literal
    # string "nan", after which fillna has nothing left to fill.
    for col in cat_features:
        train_df[col] = train_df[col].fillna("Unknown").astype(str)
        test_df[col] = test_df[col].fillna("Unknown").astype(str)

    # Categories unseen during fit are encoded as -1 instead of raising.
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    # Plain column assignment replaces the object columns with numeric ones
    # (a .loc[:, cols] assignment would write the codes into object-dtype columns).
    train_df[cat_features] = encoder.fit_transform(train_df[cat_features])
    test_df[cat_features] = encoder.transform(test_df[cat_features])

    # Step 4: Train model
    X_train = train_df[base_features]
    y_train = train_df[target_col]
    # NOTE(review): rows in test_df may still contain NaN in the non-categorical
    # base_features; whether predict() accepts them depends on the installed
    # scikit-learn version — confirm.
    X_test = test_df[base_features]

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Step 5: Predict and insert (test_df keeps df_cleaned's index labels)
    preds = model.predict(X_test)
    df_cleaned.loc[test_df.index, target_col] = preds

    print(f"✅ Missing after imputation for {target_col}: {df_cleaned[target_col].isnull().sum()}")

# Save results
df_cleaned.to_csv("/content/Canada_cleaned.csv", index=False)
print("\n💾 All imputations complete and saved to /content/Canada_cleaned.csv")



🚀 Processing Price ...
⚠️ Skipping Price — not enough rows to train (0 available).

🚀 Processing Valuation at IPO ...
⚠️ Skipping Valuation at IPO — not enough rows to train (0 available).

🚀 Processing IPqwery - Patents Granted ...
⚠️ Skipping IPqwery - Patents Granted — not enough rows to train (2 available).

🚀 Processing G2 Stack - Total Products Active ...
✅ Missing after imputation for G2 Stack - Total Products Active: 0

💾 All imputations complete and saved to /content/Canada_cleaned.csv


In [None]:
# Per-column null tally, plus the share of rows it represents (in percent)
missing_info = df_cleaned.isnull().sum().to_frame("Missing Count")
missing_info["Missing_Percentage"] = (missing_info["Missing Count"] / len(df_cleaned)) * 100
missing_info = missing_info.sort_values(by="Missing Count", ascending=False)

# Keep only the columns that still have gaps, then show them
has_missing = missing_info["Missing Count"] > 0
missing_info = missing_info[has_missing]
print(missing_info)



                           Missing Count  Missing_Percentage
Actively Hiring                       49          100.000000
Number of Exits                       49          100.000000
Valuation at IPO                      49          100.000000
Price                                 49          100.000000
IPqwery - Patents Granted             47           95.918367


In [None]:
import pandas as pd

# Location of the cleaned CSV whose header we want to list
file_path = '/content/Canada_cleaned.csv'

try:
    # Load the file, then emit a heading followed by one column name per line
    df = pd.read_csv(file_path)
    report_lines = ["Column names in the CSV file:"]
    report_lines.extend(str(col) for col in df.columns)
    print("\n".join(report_lines))

except FileNotFoundError:
    # Reported separately so a wrong path is obvious
    print(f"Error: The file '{file_path}' was not found.")
except Exception as exc:
    # Any other failure is reported, not raised
    print(f"An error occurred: {exc}")

Column names in the CSV file:
Founded Date
Number of Founders
Company Type
Number of Employees
Industries
Headquarters Location
Headquarters Regions
Number of Investors
Actively Hiring
Number of Funding Rounds
Last Funding Amount
Funding Status
Last Funding Type
Estimated Revenue Range
IPqwery - Patents Granted
SEMrush - Monthly Visits
SEMrush - Visit Duration
SEMrush - Page Views / Visit
SEMrush - Bounce Rate
Number of Events
BuiltWith - Active Tech Count
G2 Stack - Total Products Active
Number of Articles
CB Rank (Company)
Total Funding Amount
Valuation at IPO
Price
Number of Exits
Industry Groups
Apptopia - Downloads Last 30 Days
Apptopia - Number of Apps


In [None]:
# import pandas as pd

# # Load cleaned CSV
# file_path = '/content/Canada_cleaned.csv'
# df = pd.read_csv(file_path)

# # Columns to keep
# columns_to_keep = [
#     'Founded Date', 'Number of Founders', 'Number of Employees', 'Industries',
#     'Headquarters Location', 'Headquarters Regions', 'Number of Investors',
#     'Actively Hiring', 'Number of Funding Rounds', 'Last Funding Amount',
#     'Funding Status', 'Last Funding Type', 'Estimated Revenue Range',
#     'Number of Events', 'BuiltWith - Active Tech Count', 'Total Funding Amount'
# ]

# # Keep only the required columns
# df = df[columns_to_keep]

# # Save back to the same file
# df.to_csv(file_path, index=False)

# print(f"✅ File saved with only {len(columns_to_keep)} columns at {file_path}")


In [None]:
# numeric_cols_to_round = [
#         "Number of Founders", "Number of Employees", "Number of Investors",
#         "Number of Funding Rounds", "IPqwery - Patents Granted",
#         "SEMrush - Monthly Visits", "SEMrush - Visit Duration", "SEMrush - Page Views / Visit",
#         "SEMrush - Bounce Rate", "Number of Events", "BuiltWith - Active Tech Count",
#         "G2 Stack - Total Products Active", "Number of Articles",
#         "Number of Exits", "Apptopia - Downloads Last 30 Days", "Apptopia - Number of Apps"
#     ]

# for col in numeric_cols_to_round:
#     if col in df_cleaned.columns:
#         # Check if the column is of a float type before rounding
#         if pd.api.types.is_numeric_dtype(df_cleaned[col]):
#             df_cleaned[col] = df_cleaned[col].round(0).astype('Int64')
# print("✅ Rounded specified numeric columns to the nearest whole number.")

In [None]:
# # 🧼 Check missing values count and percentage
# missing_info = (
#     df_cleaned.isnull().sum()
#     .to_frame("Missing Count")
#     .assign(
#         Missing_Percentage=lambda x: (x["Missing Count"] / len(df_cleaned)) * 100
#     )
#     .sort_values(by="Missing Count", ascending=False)
# )

# # 🖨️ Show only columns with missing values
# missing_info = missing_info[missing_info["Missing Count"] > 0]
# print(missing_info)

