### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

### Logistic Regression classifier

In [5]:
# Prepare the Data
# Load the raw company table; only its 'Country' and 'Industry' columns are used below.
existing_df_3 = pd.read_csv("Datasets/Existing_Companies.csv", encoding='unicode_escape')

# Encoding Country: one 0/1 indicator column per distinct value, named "Country_<value>"
enc_country = OneHotEncoder()
country_encoded = enc_country.fit_transform(existing_df_3[['Country']]).toarray()
country_encoded_df = pd.DataFrame(country_encoded, columns=enc_country.get_feature_names_out(['Country']))

# Encoding Industry: same scheme, columns named "Industry_<value>"
enc_industry = OneHotEncoder()
industry_encoded = enc_industry.fit_transform(existing_df_3[['Industry']]).toarray()
industry_encoded_df = pd.DataFrame(industry_encoded, columns=enc_industry.get_feature_names_out(['Industry']))

# Combine encoded country and industry into one dataset (country columns first)
existing_unicorn_data = pd.concat([country_encoded_df, industry_encoded_df], axis=1)

# Analyze the distribution of industries and countries:
# column sums of the indicators, i.e. the number of rows per industry / per country
industry_distribution = existing_unicorn_data.filter(like='Industry_').sum().to_dict()
country_distribution = existing_unicorn_data.filter(like='Country_').sum().to_dict()

# Define industry_country_counts: {industry column: {country column: number of rows}}
# The product of the transposed industry indicators with the country indicators
# counts, for every pair at once, the rows in which both indicators equal 1.
# This replaces one boolean scan of the whole frame per (industry, country) pair.
industry_indicators = existing_unicorn_data[list(industry_distribution)]
country_indicators = existing_unicorn_data[list(country_distribution)]
industry_country_counts = (
    industry_indicators.T.dot(country_indicators)
    .astype(int)  # sums of 0.0/1.0 products are whole numbers; store them as ints
    .to_dict(orient='index')
)

# Define a function to assign likelihood labels dynamically based on distribution
def assign_likelihood_dynamic(row, industry_country_counts, high_threshold=35, medium_threshold=10):
    """Return the likelihood class for one one-hot encoded company row.

    The first industry column set to 1 in ``row`` that also has a country
    column set to 1 determines the result: the stored count for that
    (industry, country) pair is compared against the two thresholds.

    Parameters
    ----------
    row : mapping or pandas.Series
        Indexable by every industry and country column name that appears in
        ``industry_country_counts``; active indicators compare equal to 1.
    industry_country_counts : dict
        ``{industry_column: {country_column: count}}``.
    high_threshold : int, default 35
        Counts strictly greater than this value are labelled 2 (high chance).
    medium_threshold : int, default 10
        Counts strictly greater than this value (and not above
        ``high_threshold``) are labelled 1 (medium chance).

    Returns
    -------
    int
        2 (high), 1 (medium) or 0 (low). 0 is also returned when no
        industry/country pair is active in ``row``.
    """
    for industry, country_counts in industry_country_counts.items():
        if row[industry] != 1:
            continue
        for country, count in country_counts.items():
            if row[country] != 1:
                continue
            # First active pair decides the label; nothing after it is inspected.
            if count > high_threshold:
                return 2  # High chance
            if count > medium_threshold:
                return 1  # Medium chance
            return 0  # Low chance

    # No active (industry, country) pair was found
    return 0

# Label every row with the likelihood class of its (industry, country) pair
existing_unicorn_data['Likelihood'] = existing_unicorn_data.apply(
    assign_likelihood_dynamic,
    axis=1,
    industry_country_counts=industry_country_counts,
)

# Features are the one-hot indicator columns; the target is the derived label
y = existing_unicorn_data['Likelihood']
X = existing_unicorn_data.drop(columns='Likelihood')

# Hold out 30% of the rows for evaluation; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the Logistic Regression classifier (fit() returns the estimator itself)
clf_lr = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predictions for the held-out rows
y_pred_lr = clf_lr.predict(X_test)

# Support-weighted averages of the per-class scores.
# Accuracy is not reported separately: support-weighted recall is the same quantity.
f1, recall, precision = (
    metric(y_test, y_pred_lr, average='weighted')
    for metric in (f1_score, recall_score, precision_score)
)

# Report
print("Metrics for Logistic Regression Classifier:")
print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)

Metrics for Logistic Regression Classifier:
F1 Score: 0.936586453281677
Recall: 0.9369158878504673
Precision: 0.9362984203652466


### Support Vector Machines (SVM) classifier

In [6]:
# Prepare the Data
# Load the raw company table; only its 'Country' and 'Industry' columns are used below.
existing_df_3 = pd.read_csv("Datasets/Existing_Companies.csv", encoding='unicode_escape')

# Encoding Country: one 0/1 indicator column per distinct value, named "Country_<value>"
enc_country = OneHotEncoder()
country_encoded = enc_country.fit_transform(existing_df_3[['Country']]).toarray()
country_encoded_df = pd.DataFrame(country_encoded, columns=enc_country.get_feature_names_out(['Country']))

# Encoding Industry: same scheme, columns named "Industry_<value>"
enc_industry = OneHotEncoder()
industry_encoded = enc_industry.fit_transform(existing_df_3[['Industry']]).toarray()
industry_encoded_df = pd.DataFrame(industry_encoded, columns=enc_industry.get_feature_names_out(['Industry']))

# Combine encoded country and industry into one dataset (country columns first)
existing_unicorn_data = pd.concat([country_encoded_df, industry_encoded_df], axis=1)

# Analyze the distribution of industries and countries:
# column sums of the indicators, i.e. the number of rows per industry / per country
industry_distribution = existing_unicorn_data.filter(like='Industry_').sum().to_dict()
country_distribution = existing_unicorn_data.filter(like='Country_').sum().to_dict()

# Define industry_country_counts: {industry column: {country column: number of rows}}
# The product of the transposed industry indicators with the country indicators
# counts, for every pair at once, the rows in which both indicators equal 1.
# This replaces one boolean scan of the whole frame per (industry, country) pair.
industry_indicators = existing_unicorn_data[list(industry_distribution)]
country_indicators = existing_unicorn_data[list(country_distribution)]
industry_country_counts = (
    industry_indicators.T.dot(country_indicators)
    .astype(int)  # sums of 0.0/1.0 products are whole numbers; store them as ints
    .to_dict(orient='index')
)

# Define a function to assign likelihood labels dynamically based on distribution
def assign_likelihood_dynamic(row, industry_country_counts, high_threshold=35, medium_threshold=10):
    """Return the likelihood class for one one-hot encoded company row.

    The first industry column set to 1 in ``row`` that also has a country
    column set to 1 determines the result: the stored count for that
    (industry, country) pair is compared against the two thresholds.

    Parameters
    ----------
    row : mapping or pandas.Series
        Indexable by every industry and country column name that appears in
        ``industry_country_counts``; active indicators compare equal to 1.
    industry_country_counts : dict
        ``{industry_column: {country_column: count}}``.
    high_threshold : int, default 35
        Counts strictly greater than this value are labelled 2 (high chance).
    medium_threshold : int, default 10
        Counts strictly greater than this value (and not above
        ``high_threshold``) are labelled 1 (medium chance).

    Returns
    -------
    int
        2 (high), 1 (medium) or 0 (low). 0 is also returned when no
        industry/country pair is active in ``row``.
    """
    for industry, country_counts in industry_country_counts.items():
        if row[industry] != 1:
            continue
        for country, count in country_counts.items():
            if row[country] != 1:
                continue
            # First active pair decides the label; nothing after it is inspected.
            if count > high_threshold:
                return 2  # High chance
            if count > medium_threshold:
                return 1  # Medium chance
            return 0  # Low chance

    # No active (industry, country) pair was found
    return 0

# Derive the target: one likelihood class per row from its (industry, country) pair
existing_unicorn_data['Likelihood'] = existing_unicorn_data.apply(
    assign_likelihood_dynamic, axis=1, industry_country_counts=industry_country_counts
)

# Split the frame into the indicator features and the label column
X = existing_unicorn_data.drop(columns=['Likelihood'])
y = existing_unicorn_data['Likelihood']

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Linear-kernel Support Vector Machines (SVM) classifier
clf_svm = SVC(kernel='linear', random_state=42)
clf_svm.fit(X_train, y_train)

# Class predictions for the held-out rows
y_pred_svm = clf_svm.predict(X_test)

# Support-weighted F1, recall and precision on the held-out rows
f1 = f1_score(y_true=y_test, y_pred=y_pred_svm, average='weighted')
recall = recall_score(y_true=y_test, y_pred=y_pred_svm, average='weighted')
precision = precision_score(y_true=y_test, y_pred=y_pred_svm, average='weighted')

# Report
print("Metrics for Support Vector Machines (SVM) classifier:")
print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)

Metrics for Support Vector Machines (SVM) classifier:
F1 Score: 0.9489401991326125
Recall: 0.9485981308411215
Precision: 0.9550031417682909


### Cross-Validation (n_splits = 5)

In [12]:
# Prepare the Data
# Load the raw company table; only its 'Country' and 'Industry' columns are used below.
existing_df_3 = pd.read_csv("Datasets/Existing_Companies.csv", encoding='unicode_escape')

# Encoding Country: one 0/1 indicator column per distinct value, named "Country_<value>"
enc_country = OneHotEncoder()
country_encoded = enc_country.fit_transform(existing_df_3[['Country']]).toarray()
country_encoded_df = pd.DataFrame(country_encoded, columns=enc_country.get_feature_names_out(['Country']))

# Encoding Industry: same scheme, columns named "Industry_<value>"
enc_industry = OneHotEncoder()
industry_encoded = enc_industry.fit_transform(existing_df_3[['Industry']]).toarray()
industry_encoded_df = pd.DataFrame(industry_encoded, columns=enc_industry.get_feature_names_out(['Industry']))

# Combine encoded country and industry into one dataset (country columns first)
existing_unicorn_data = pd.concat([country_encoded_df, industry_encoded_df], axis=1)

# Analyze the distribution of industries and countries:
# column sums of the indicators, i.e. the number of rows per industry / per country
industry_distribution = existing_unicorn_data.filter(like='Industry_').sum().to_dict()
country_distribution = existing_unicorn_data.filter(like='Country_').sum().to_dict()

# Define industry_country_counts: {industry column: {country column: number of rows}}
# The product of the transposed industry indicators with the country indicators
# counts, for every pair at once, the rows in which both indicators equal 1.
# This replaces one boolean scan of the whole frame per (industry, country) pair.
industry_indicators = existing_unicorn_data[list(industry_distribution)]
country_indicators = existing_unicorn_data[list(country_distribution)]
industry_country_counts = (
    industry_indicators.T.dot(country_indicators)
    .astype(int)  # sums of 0.0/1.0 products are whole numbers; store them as ints
    .to_dict(orient='index')
)

# Define a function to assign likelihood labels dynamically based on distribution
def assign_likelihood_dynamic(row, industry_country_counts, high_threshold=35, medium_threshold=10):
    """Return the likelihood class for one one-hot encoded company row.

    The first industry column set to 1 in ``row`` that also has a country
    column set to 1 determines the result: the stored count for that
    (industry, country) pair is compared against the two thresholds.

    Parameters
    ----------
    row : mapping or pandas.Series
        Indexable by every industry and country column name that appears in
        ``industry_country_counts``; active indicators compare equal to 1.
    industry_country_counts : dict
        ``{industry_column: {country_column: count}}``.
    high_threshold : int, default 35
        Counts strictly greater than this value are labelled 2 (high chance).
    medium_threshold : int, default 10
        Counts strictly greater than this value (and not above
        ``high_threshold``) are labelled 1 (medium chance).

    Returns
    -------
    int
        2 (high), 1 (medium) or 0 (low). 0 is also returned when no
        industry/country pair is active in ``row``.
    """
    for industry, country_counts in industry_country_counts.items():
        if row[industry] != 1:
            continue
        for country, count in country_counts.items():
            if row[country] != 1:
                continue
            # First active pair decides the label; nothing after it is inspected.
            if count > high_threshold:
                return 2  # High chance
            if count > medium_threshold:
                return 1  # Medium chance
            return 0  # Low chance

    # No active (industry, country) pair was found
    return 0

# Label every row with the likelihood class of its (industry, country) pair
existing_unicorn_data['Likelihood'] = existing_unicorn_data.apply(
    assign_likelihood_dynamic,
    axis=1,
    industry_country_counts=industry_country_counts,
)

# Target column and indicator features
y = existing_unicorn_data['Likelihood']
X = existing_unicorn_data.drop(columns='Likelihood')

# 70/30 split; only the training part is used for cross-validation below
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Both classifiers use scikit-learn's default settings here, so this SVC is not
# the kernel='linear' model fitted in the standalone SVM cell.
# cross_val_score refits a clone of the estimator on every fold, so these fits do
# not influence the scores; they only leave fitted models in the kernel.
clf_lr = LogisticRegression().fit(X_train, y_train)
clf_svm = SVC().fit(X_train, y_train)

# Five shuffled, class-stratified folds with a fixed seed
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold accuracy for each classifier, evaluated on the same folds
cv_scores_lr, cv_scores_svm = (
    cross_val_score(model, X_train, y_train, cv=kf, scoring='accuracy')
    for model in (clf_lr, clf_svm)
)

# Mean accuracy over the folds
print("Cross-Validation Accuracy Logistict Regression:", cv_scores_lr.mean())
print("Cross-Validation Accuracy SVM:", cv_scores_svm.mean())

Cross-Validation Accuracy Logistict Regression: 0.9287889447236182
Cross-Validation Accuracy SVM: 0.9979949748743719


### Cross-Validation (n_splits = 10)

In [13]:
# This cell defines neither `existing_unicorn_data`, `assign_likelihood_dynamic`
# nor `industry_country_counts`; it uses whatever an earlier cell left in the kernel.
# Recompute the likelihood label for every row
existing_unicorn_data['Likelihood'] = existing_unicorn_data.apply(
    assign_likelihood_dynamic, axis=1, industry_country_counts=industry_country_counts
)

# Separate the indicator features from the label
X = existing_unicorn_data.drop(columns=['Likelihood'])
y = existing_unicorn_data['Likelihood']

# 70/30 split with a fixed seed; cross-validation runs on the training part only
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Default-parameter classifiers, fitted once on the full training part
clf_lr = LogisticRegression()
clf_lr.fit(X_train, y_train)

clf_svm = SVC()
clf_svm.fit(X_train, y_train)

# Ten shuffled, class-stratified folds with a fixed seed
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Accuracy of each classifier on every fold
cv_scores_lr = cross_val_score(estimator=clf_lr, X=X_train, y=y_train, scoring='accuracy', cv=kf)
cv_scores_svm = cross_val_score(estimator=clf_svm, X=X_train, y=y_train, scoring='accuracy', cv=kf)

# Mean accuracy over the ten folds
print("Cross-Validation Accuracy Logistict Regression:", cv_scores_lr.mean())
print("Cross-Validation Accuracy SVM:", cv_scores_svm.mean())

Cross-Validation Accuracy Logistict Regression: 0.9288080808080809
Cross-Validation Accuracy SVM: 0.998
