### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

### Final Model (Logistic Regression Classifier)

In [2]:
# Prepare the Data
# Only the two categorical columns are used as model inputs.
existing_df_3 = pd.read_csv("Datasets/Existing_Companies.csv", encoding='unicode_escape')
company_categories = existing_df_3[['Country', 'Industry']]

# Encoding Country: one 0/1 indicator column per country, named "Country_<value>"
enc_country = OneHotEncoder()
country_encoded = enc_country.fit_transform(company_categories[['Country']]).toarray()
country_encoded_df = pd.DataFrame(country_encoded, columns=enc_country.get_feature_names_out(['Country']))

# Encoding Industry: one 0/1 indicator column per industry, named "Industry_<value>"
enc_industry = OneHotEncoder()
industry_encoded = enc_industry.fit_transform(company_categories[['Industry']]).toarray()
industry_encoded_df = pd.DataFrame(industry_encoded, columns=enc_industry.get_feature_names_out(['Industry']))

# Combine encoded country and industry into one dataset (country columns first, then industry columns)
existing_unicorn_data = pd.concat([country_encoded_df, industry_encoded_df], axis=1)

# Analyze the distribution of industries and countries:
# summing an indicator column gives the number of companies in that category.
industry_distribution = industry_encoded_df.sum().to_dict()
country_distribution = country_encoded_df.sum().to_dict()

# Define industry_country_counts as {industry_column: {country_column: number_of_companies}}.
# Both frames share the same row index, so (industry indicators)^T . (country indicators)
# counts, for every pair at once, the rows where both flags are 1. This replaces one
# full-frame boolean scan per (industry, country) pair with a single matrix product.
industry_country_matrix = industry_encoded_df.T.dot(country_encoded_df).astype(int)
industry_country_counts = industry_country_matrix.to_dict(orient='index')

# Define a function to assign likelihood labels dynamically based on distribution
def assign_likelihood_dynamic(row, industry_country_counts, high_threshold=40, medium_threshold=5):
    """Label one one-hot encoded row as low (0), medium (1) or high (2) likelihood.

    The first industry flag equal to 1 in ``row`` is paired with the first
    country flag equal to 1, and the stored count for that pair decides the label.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Indexable by every industry and country key that appears in
        ``industry_country_counts``; a value of 1 marks the active category.
    industry_country_counts : dict
        ``{industry_key: {country_key: count}}``.
    high_threshold : number, default 40
        A count strictly greater than this yields 2 (high chance).
    medium_threshold : number, default 5
        A count strictly greater than this (but not above ``high_threshold``)
        yields 1 (medium chance).

    Returns
    -------
    int
        2, 1 or 0. Rows with no active industry/country pair get 0.
    """
    for industry, country_count_dict in industry_country_counts.items():
        if row[industry] != 1:
            continue
        for country, count in country_count_dict.items():
            if row[country] != 1:
                continue
            # First active (industry, country) pair decides the label.
            if count > high_threshold:
                return 2  # High chance
            if count > medium_threshold:
                return 1  # Medium chance
            return 0  # Low chance

    # No active pair found: default to low chance.
    return 0

# Label every company row with its likelihood class (0 = low, 1 = medium, 2 = high)
existing_unicorn_data['Likelihood'] = existing_unicorn_data.apply(
    lambda company_row: assign_likelihood_dynamic(company_row, industry_country_counts),
    axis=1,
)

# Train the Models
# NOTE(review): the label is computed from the same one-hot columns used as features,
# so the scores below measure how well the model reproduces that thresholding rule.
y = existing_unicorn_data['Likelihood']
X = existing_unicorn_data.drop(columns='Likelihood')

# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the Logistic Regression classifier on the training rows
clf_lr = LogisticRegression(random_state=42).fit(X_train, y_train)

# Class predictions for the held-out rows
y_pred_lr = clf_lr.predict(X_test)

# Calculate Metrics (each score is averaged over classes, weighted by class support)
# accuracy = accuracy_score(y_test, y_pred_lr)
weighted_average = {'average': 'weighted'}
f1 = f1_score(y_test, y_pred_lr, **weighted_average)
recall = recall_score(y_test, y_pred_lr, **weighted_average)
precision = precision_score(y_test, y_pred_lr, **weighted_average)

# Print Metrics
print("Metrics for Logistic Regression Classifier:")
# print("Accuracy:", accuracy)
for metric_label, metric_value in (("F1 Score:", f1), ("Recall:", recall), ("Precision:", precision)):
    print(metric_label, metric_value)

Metrics for Logistic Regression Classifier:
F1 Score: 0.9464561273698677
Recall: 0.9462616822429907
Precision: 0.9468558405474292


### Export the existing_unicorn_data dataframe

In [3]:
# Save the one-hot features together with the 'Likelihood' label column as an Excel sheet.
# index=False leaves the row index out of the file.
# NOTE(review): assumes the "Clusters" directory already exists - confirm before running.
existing_unicorn_data.to_excel("Clusters/Likelihood.xlsx", index=False)

### Checking the created model with unseen data

In [4]:
# Load the emerging-company dataset, decoded with 'unicode_escape' like the existing-company file.
emerging_df_3 = pd.read_csv("Datasets/Emerging_Companies.csv", encoding='unicode_escape')

In [5]:
# Bare expression: the notebook renders the loaded frame as this cell's output.
emerging_df_3

Unnamed: 0,Company,Post Money Value,Total Equity Funding,Lead Investors Include,Country,Continent,Industry
0,Nxtra Data,986,237,The Carlyle Group,India,Asia,Enterprise Tech
1,Butternut Box,985,467,General Atlantic,United Kingdom,Europe,Consumer & Retail
2,Delphix,981,120,Fidelity,United States,North America,Enterprise Tech
3,Bizongo,980,290,Schroder Adveq,India,Asia,Consumer & Retail
4,OpenStore,970,137,Lux Capital,United States,North America,Consumer & Retail
...,...,...,...,...,...,...,...
378,Ursa Major,500,272,Explorer 1 Fund,United States,North America,Enterprise Tech
379,Chia Network,500,70,Multiple Lead Investors,United States,North America,Financial Services
380,Overtime,500,215,Multiple Lead Investors,United States,North America,Media & Entertainment
381,Vox Media,500,408,Penske Media Corporation,United States,North America,Media & Entertainment


In [6]:
# Keep only the two categorical columns that the encoders were fitted on.
# Note: the prediction cell further down rebinds `unseen_data` to a NumPy array.
unseen_data = emerging_df_3[['Country', 'Industry']]

In [7]:
# Bare expression: the notebook renders the Country/Industry frame as this cell's output.
unseen_data

Unnamed: 0,Country,Industry
0,India,Enterprise Tech
1,United Kingdom,Consumer & Retail
2,United States,Enterprise Tech
3,India,Consumer & Retail
4,United States,Consumer & Retail
...,...,...
378,United States,Enterprise Tech
379,United States,Financial Services
380,United States,Media & Entertainment
381,United States,Media & Entertainment


In the cell below, try the industry "Enterprise Tech" with the countries "Brazil", "Israel" and "United States" to see how the predicted likelihood changes.

Likewise, try the industry "Agricultural" with the countries "Canada" and "United States" to see how the predicted likelihood changes.

In [8]:
# Score hand-written (country, industry) pairs with the fitted model.
# Requires clf_lr, enc_country and enc_industry from the training cell above.

# One row per company to score: [country, industry]
unseen_data = np.array([["Israel", "Enterprise Tech"]])

# Perform one-hot encoding for the unseen data.
# The encoders were fitted on DataFrames with the column names 'Country' / 'Industry';
# passing frames with the same names (instead of bare arrays) avoids scikit-learn's
# "X does not have valid feature names" warning. A category not seen during fitting
# still raises ValueError here, because the encoders use the default handle_unknown='error'.
encoded_country = enc_country.transform(pd.DataFrame(unseen_data[:, [0]], columns=['Country'])).toarray()
encoded_industry = enc_industry.transform(pd.DataFrame(unseen_data[:, [1]], columns=['Industry'])).toarray()

# Concatenate encoded features (country block first, then industry block, as in training)
encoded_features = np.concatenate((encoded_country, encoded_industry), axis=1)

# Attach the training column names so the classifier can check that the feature
# names and their order match what it was fitted with.
feature_columns = np.concatenate((
    enc_country.get_feature_names_out(['Country']),
    enc_industry.get_feature_names_out(['Industry']),
))
encoded_features_df = pd.DataFrame(encoded_features, columns=feature_columns)

# Make predictions
# Note: this rebinds y_pred_lr, which previously held the test-set predictions.
y_pred_lr = clf_lr.predict(encoded_features_df)
print(y_pred_lr)

# Map predicted values to output messages
output_messages = {
    0: "There is a LOW Chance to be a Unicorn Company",
    1: "There is a MEDIUM Chance to be a Unicorn Company",
    2: "There is a HIGH Chance to be a Unicorn Company"
}

# Output the corresponding message for each predicted value
for prediction in y_pred_lr:
    print(output_messages[prediction])

[1]
There is a MEDIUM Chance to be a Unicorn Company


