In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Prepare the data: load the company list and keep only the two categorical
# columns that the rest of the notebook works with.
existing_unicorn_data = pd.read_csv('Datasets/Existing_Companies.csv', encoding='unicode_escape')
existing_unicorn_data = existing_unicorn_data[['Country', 'Industry']]

# One-hot encode both columns; get_dummies names the new columns
# 'Industry_<value>' and 'Country_<value>'.
existing_unicorn_data = pd.get_dummies(existing_unicorn_data, columns=['Industry', 'Country'])

# Pick the indicator columns by *prefix*. A substring match (filter(like=...))
# would also pick up any column that merely contains the text somewhere in a
# category value.
industry_columns = [col for col in existing_unicorn_data.columns if col.startswith('Industry_')]
country_columns = [col for col in existing_unicorn_data.columns if col.startswith('Country_')]

# Number of companies per industry and per country (column name -> count).
industry_distribution = existing_unicorn_data[industry_columns].sum().to_dict()
country_distribution = existing_unicorn_data[country_columns].sum().to_dict()

# Co-occurrence table: industry_country_counts[industry][country] is the number
# of rows where both indicators are set. A single matrix product of the two
# indicator blocks replaces one full-frame boolean mask per (industry, country)
# pair. The cast to int matters because get_dummies may return boolean columns,
# and a boolean product would not count.
industry_indicators = existing_unicorn_data[industry_columns].astype(int)
country_indicators = existing_unicorn_data[country_columns].astype(int)
industry_country_counts = industry_indicators.T.dot(country_indicators).to_dict(orient='index')

# Define a function to assign likelihood labels dynamically based on distribution
def assign_likelihood_dynamic(row, industry_country_counts, high_threshold=35, medium_threshold=10):
    """Return a likelihood label for one one-hot encoded row.

    The row's active industry and country indicators are looked up in
    ``industry_country_counts`` and the count found there is bucketed.

    Parameters
    ----------
    row : mapping
        Anything indexable by column name (e.g. a pandas Series) whose
        industry/country indicator entries equal 1 when active.
    industry_country_counts : dict
        Nested mapping ``{industry_column: {country_column: count}}``.
    high_threshold : int, optional
        A count strictly greater than this yields label 2. Defaults to 35.
    medium_threshold : int, optional
        A count strictly greater than this (but not above ``high_threshold``)
        yields label 1. Defaults to 10.

    Returns
    -------
    int
        2 (high), 1 (medium) or 0 (low). 0 is also returned when no
        industry/country pair is active in the row.
    """
    for industry, country_counts in industry_country_counts.items():
        if row[industry] != 1:
            continue
        for country, count in country_counts.items():
            if row[country] != 1:
                continue
            # The first active (industry, country) pair decides the label.
            if count > high_threshold:
                return 2
            if count > medium_threshold:
                return 1
            return 0

    # No active pair was found: fall back to the low label.
    return 0

# Label every row by looking up its industry/country pair in the count table.
# NOTE(review): the label is computed from the same one-hot columns that are
# later used as features, so the classifiers are learning a deterministic rule.
existing_unicorn_data['Likelihood'] = existing_unicorn_data.apply(
    assign_likelihood_dynamic,
    axis=1,
    args=(industry_country_counts,),
)

# Separate the target from the feature matrix
y = existing_unicorn_data['Likelihood']
X = existing_unicorn_data.drop(columns='Likelihood')

# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
# NOTE(review): the split is not stratified on y — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [3]:
# Build and fit each candidate model on the training split
# (fit() returns the estimator itself, so construction and fitting can be chained).
clf_lr = LogisticRegression().fit(X_train, y_train)

# SVC with its default settings
clf_svm = SVC().fit(X_train, y_train)

# SVC with a linear kernel
clf_svm_li = SVC(kernel='linear').fit(X_train, y_train)

# k-nearest neighbours with k = 5; the bare fit() call is kept as the cell's
# last expression so the fitted estimator is still displayed as the cell output.
clf_knn = KNeighborsClassifier(n_neighbors=5)
clf_knn.fit(X_train, y_train)

KNeighborsClassifier()

### Cross-Validation (n_splits = 5)

In [4]:
# Stratified 5-fold splitter; shuffling is seeded so the folds are repeatable
kf_5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold macro-F1 for each classifier, scored in the same order with the same splitter
cv_scores_lr, cv_scores_svm, cv_scores_svm_li, cv_scores_knn = [
    cross_val_score(estimator, X_train, y_train, cv=kf_5, scoring='f1_macro')
    for estimator in (clf_lr, clf_svm, clf_svm_li, clf_knn)
]

# Report the mean F1 score over the 5 folds
print("Cross-Validation F1 Score Logistic Regression:", np.mean(cv_scores_lr))
print("Cross-Validation F1 Score SVM:", np.mean(cv_scores_svm))
print("Cross-Validation F1 Score SVM Linear:", np.mean(cv_scores_svm_li))
print("Cross-Validation F1 Score KNN:", np.mean(cv_scores_knn))

Cross-Validation F1 Score Logistic Regression: 0.8652311195640578
Cross-Validation F1 Score SVM: 0.9960881474754949
Cross-Validation F1 Score SVM Linear: 0.9142263953228313
Cross-Validation F1 Score KNN: 0.8666273391245285


### Cross-Validation (n_splits = 10)

In [5]:
# Stratified 10-fold splitter; shuffling is seeded so the folds are repeatable
kf_10 = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold macro-F1 for each classifier, scored in the same order with the same splitter
cv_scores_lr, cv_scores_svm, cv_scores_svm_li, cv_scores_knn = [
    cross_val_score(estimator, X_train, y_train, cv=kf_10, scoring='f1_macro')
    for estimator in (clf_lr, clf_svm, clf_svm_li, clf_knn)
]

# Report the mean F1 score over the 10 folds
print("Cross-Validation F1 Score Logistic Regression:", np.mean(cv_scores_lr))
print("Cross-Validation F1 Score SVM:", np.mean(cv_scores_svm))
print("Cross-Validation F1 Score SVM Linear:", np.mean(cv_scores_svm_li))
print("Cross-Validation F1 Score KNN:", np.mean(cv_scores_knn))

Cross-Validation F1 Score Logistic Regression: 0.8646335294028413
Cross-Validation F1 Score SVM: 0.996170319979844
Cross-Validation F1 Score SVM Linear: 0.9157914025663663
Cross-Validation F1 Score KNN: 0.8817399287320754


### Cross-Validation (n_splits = 15)

In [6]:
# Stratified 15-fold splitter; shuffling is seeded so the folds are repeatable
kf_15 = StratifiedKFold(n_splits=15, shuffle=True, random_state=42)

# Per-fold macro-F1 for each classifier, scored in the same order with the same splitter
cv_scores_lr, cv_scores_svm, cv_scores_svm_li, cv_scores_knn = [
    cross_val_score(estimator, X_train, y_train, cv=kf_15, scoring='f1_macro')
    for estimator in (clf_lr, clf_svm, clf_svm_li, clf_knn)
]

# Report the mean F1 score over the 15 folds
print("Cross-Validation F1 Score Logistic Regression:", np.mean(cv_scores_lr))
print("Cross-Validation F1 Score SVM:", np.mean(cv_scores_svm))
print("Cross-Validation F1 Score SVM Linear:", np.mean(cv_scores_svm_li))
print("Cross-Validation F1 Score KNN:", np.mean(cv_scores_knn))

Cross-Validation F1 Score Logistic Regression: 0.8686238677283377
Cross-Validation F1 Score SVM: 0.9961329732627047
Cross-Validation F1 Score SVM Linear: 0.9133839987369335
Cross-Validation F1 Score KNN: 0.8717036873858943
