## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Prepare the Data
# Read the existing-companies CSV and keep only the two categorical columns
# ('Country' and 'Industry') that are used further down.
existing_unicorn_data = (
    pd.read_csv('Datasets/Existing_Companies.csv', encoding='unicode_escape')
    .loc[:, ['Country', 'Industry']]
)

# Perform encoding on categorical variables (one-hot encoding).
# Each generated indicator column is named '<source column>_<value>',
# i.e. 'Industry_<value>' or 'Country_<value>'.
existing_unicorn_data = pd.get_dummies(existing_unicorn_data, columns=['Industry', 'Country'])

# Pick the indicator columns by exact prefix rather than by substring, so a
# category value that merely contains 'Industry_' or 'Country_' cannot be
# assigned to the wrong group.
industry_columns = [col for col in existing_unicorn_data.columns if col.startswith('Industry_')]
country_columns = [col for col in existing_unicorn_data.columns if col.startswith('Country_')]

# Analyze the distribution of industries and countries:
# {indicator column name: number of rows where the indicator is set}
industry_distribution = existing_unicorn_data[industry_columns].sum().to_dict()
country_distribution = existing_unicorn_data[country_columns].sum().to_dict()

# Define industry_country_counts:
# {industry column: {country column: number of rows having both indicators set}}.
# The indicators are 0/1, so (industries^T . countries) yields every pair count
# in a single pass instead of re-filtering the whole frame once per pair.
industry_indicators = existing_unicorn_data[industry_columns].astype(int)
country_indicators = existing_unicorn_data[country_columns].astype(int)
pair_counts = industry_indicators.T.dot(country_indicators)

industry_country_counts = {
    industry: {
        country: int(pair_counts.at[industry, country])  # plain Python int
        for country in country_columns
    }
    for industry in industry_columns
}

# Define a function to assign likelihood labels dynamically based on distribution
def assign_likelihood_dynamic(row, industry_country_counts, high_threshold=35, medium_threshold=10):
    """Assign a likelihood class to one one-hot encoded row.

    The first industry indicator set to 1 in ``row`` is paired with the first
    country indicator (in that industry's count mapping) that is also set to 1;
    the count stored for that pair decides the label.

    Parameters
    ----------
    row : mapping-like (e.g. pandas Series or dict)
        Indexable by every industry and country column name appearing in
        ``industry_country_counts``; indicator values are compared with 1.
    industry_country_counts : dict
        ``{industry column: {country column: count}}``.
    high_threshold : int, optional
        A count strictly greater than this gives label 2. Defaults to 35.
    medium_threshold : int, optional
        A count strictly greater than this (but not above ``high_threshold``)
        gives label 1. Defaults to 10.

    Returns
    -------
    int
        2 (high chance), 1 (medium chance) or 0 (low chance). 0 is also
        returned when no industry/country pair is active in ``row``.
    """
    for industry, country_count_dict in industry_country_counts.items():
        if row[industry] != 1:
            continue  # this industry is not the row's industry

        for country, count in country_count_dict.items():
            if row[country] != 1:
                continue  # this country is not the row's country

            # Active combination found: its count alone decides the label.
            if count > high_threshold:
                return 2  # High chance
            if count > medium_threshold:
                return 1  # Medium chance
            return 0  # Low chance

    # No active industry/country combination: default to low chance.
    return 0

# Label every row with its likelihood class (0 = low, 1 = medium, 2 = high),
# looked up from the industry/country pair counts.
existing_unicorn_data['Likelihood'] = existing_unicorn_data.apply(
    lambda row: assign_likelihood_dynamic(row, industry_country_counts),
    axis=1,
)

# Build the target vector and the feature matrix.
# NOTE(review): 'Likelihood' is computed from the same indicator columns that
# remain in X, so the target is a deterministic function of the features;
# scores obtained on this split should be interpreted with that in mind.
y = existing_unicorn_data['Likelihood']
X = existing_unicorn_data.drop(columns='Likelihood')

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Logistic Regression Classifier

In [3]:
# Fit a logistic-regression model on the training split
# (fit() returns the estimator, so construction and training are chained).
clf_lr = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predicted likelihood classes for the held-out rows
y_pred_lr = clf_lr.predict(X_test)

# Weighted-average metrics on the test split, evaluated in the order
# F1, recall, precision.
f1, recall, precision = (
    score(y_test, y_pred_lr, average='weighted')
    for score in (f1_score, recall_score, precision_score)
)

# Report
print("Metrics for Logistic Regression Classifier:")
print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)

Metrics for Logistic Regression Classifier:
F1 Score: 0.936586453281677
Recall: 0.9369158878504673
Precision: 0.9362984203652466


## Support Vector Machine

### Support Vector Classifier

In [4]:
# Fit a support-vector classifier with the default kernel on the training
# split (fit() returns the estimator, so construction and training are chained).
clf_svm = SVC(random_state=42).fit(X_train, y_train)

# Predicted likelihood classes for the held-out rows
y_pred_svm = clf_svm.predict(X_test)

# Weighted-average metrics on the test split, evaluated in the order
# F1, recall, precision.
f1, recall, precision = (
    score(y_test, y_pred_svm, average='weighted')
    for score in (f1_score, recall_score, precision_score)
)

# Report
print("Metrics for Support Vector Machines (SVM) classifier:")
print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)

Metrics for Support Vector Machines (SVM) classifier:
F1 Score: 1.0
Recall: 1.0
Precision: 1.0


### Support Vector Classifier (Linear Kernel)

In [5]:
# Fit a support-vector classifier with a linear kernel on the training split
# (fit() returns the estimator, so construction and training are chained).
clf_svm_li = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Predicted likelihood classes for the held-out rows
y_pred_svm_li = clf_svm_li.predict(X_test)

# Weighted-average metrics on the test split, evaluated in the order
# F1, recall, precision.
f1, recall, precision = (
    score(y_test, y_pred_svm_li, average='weighted')
    for score in (f1_score, recall_score, precision_score)
)

# Report
print("Metrics for Support Vector Machines (SVM) classifier Linear:")
print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)

Metrics for Support Vector Machines (SVM) classifier Linear:
F1 Score: 0.9489401991326125
Recall: 0.9485981308411215
Precision: 0.9550031417682909


## k-Nearest Neighbors (KNN) Classifier

In [6]:
# Fit a 5-nearest-neighbours classifier on the training split; change
# n_neighbors to tune the neighbourhood size.
# (fit() returns the estimator, so construction and training are chained.)
clf_knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predicted likelihood classes for the held-out rows
y_pred_knn = clf_knn.predict(X_test)

# Weighted-average metrics on the test split, evaluated in the order
# F1, recall, precision.
f1_knn, recall_knn, precision_knn = (
    score(y_test, y_pred_knn, average='weighted')
    for score in (f1_score, recall_score, precision_score)
)

# Report
print("Metrics for k-Nearest Neighbors Classifier:")
print("F1 Score:", f1_knn)
print("Recall:", recall_knn)
print("Precision:", precision_knn)

Metrics for k-Nearest Neighbors Classifier:
F1 Score: 0.9284045316623573
Recall: 0.9322429906542056
Precision: 0.9378855335233616


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
