### Evaluating 4 Models

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Load the raw unicorn records from disk
existing_unicorn_data = pd.read_csv('existing_unicorn_data.csv')

# Replace the categorical 'Industry' and 'Country' columns with one-hot indicator columns
existing_unicorn_data = pd.get_dummies(existing_unicorn_data, columns=['Industry', 'Country'])

# Column-wise totals of the indicator columns, keyed by column name
industry_distribution = existing_unicorn_data.filter(like='Industry_').sum().to_dict()
country_distribution = existing_unicorn_data.filter(like='Country_').sum().to_dict()

# Boolean row masks, built once per indicator column and reused for every pair below
industry_masks = {name: existing_unicorn_data[name] == 1 for name in industry_distribution}
country_masks = {name: existing_unicorn_data[name] == 1 for name in country_distribution}

# industry_country_counts[industry][country] = number of rows flagged for both columns
industry_country_counts = {}
for industry, in_industry in industry_masks.items():
    counts_for_industry = {}
    for country, in_country in country_masks.items():
        counts_for_industry[country] = int((in_industry & in_country).sum())
    industry_country_counts[industry] = counts_for_industry

# Define a function to assign likelihood labels dynamically based on distribution
def assign_likelihood_dynamic(row, industry_country_counts, high_threshold=35, medium_threshold=15):
    """Label one row by how common its industry/country pairing is.

    The first (industry, country) pair for which both ``row[industry] == 1``
    and ``row[country] == 1`` decides the label; pairs are visited in the
    iteration order of ``industry_country_counts``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (a ``pandas.Series`` row, a dict, ...)
        holding the indicator values for every industry and country key used
        in ``industry_country_counts``.
    industry_country_counts : dict
        ``{industry_column: {country_column: count}}``.
    high_threshold : number, default 35
        A pair whose count is strictly greater than this is labelled 2.
    medium_threshold : number, default 15
        A pair whose count is strictly greater than this (but not greater than
        ``high_threshold``) is labelled 1. Expected to be <= ``high_threshold``;
        otherwise the medium label can never be produced.

    Returns
    -------
    int
        2 (high chance), 1 (medium chance) or 0 (low chance). 0 is also
        returned when no industry/country pair is flagged in ``row``.
    """
    for industry, country_count_dict in industry_country_counts.items():
        if row[industry] == 1:
            for country, count in country_count_dict.items():
                if row[country] == 1:
                    # First flagged pair wins: classify it and stop searching.
                    if count > high_threshold:
                        return 2  # High chance
                    if count > medium_threshold:
                        return 1  # Medium chance
                    return 0  # Low chance

    # No flagged industry/country pair was found for this row.
    return 0

# Label every row with its likelihood class (0 = low, 1 = medium, 2 = high)
existing_unicorn_data['Likelihood'] = existing_unicorn_data.apply(
    assign_likelihood_dynamic,
    axis=1,
    args=(industry_country_counts,),
)

# Split into features and target.
# NOTE(review): 'Likelihood' is computed purely from the one-hot industry/country
# columns, and those columns remain in X. The scores below therefore measure how
# well each model recovers that labelling rule, not out-of-sample predictive power.
y = existing_unicorn_data['Likelihood']
X = existing_unicorn_data.drop(columns='Likelihood')

# Hold out 30% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# The four candidate classifiers
clf_rf = RandomForestClassifier(n_estimators=100, random_state=42)
clf_gb = GradientBoostingClassifier(random_state=42)
clf_lr = LogisticRegression(random_state=42)
clf_svm = SVC(kernel='linear', random_state=42)

# Fit each classifier on the training split
for estimator in (clf_rf, clf_gb, clf_lr, clf_svm):
    estimator.fit(X_train, y_train)

# Report per-class precision/recall/F1 on the held-out split, one model at a time
evaluated_models = [
    ("Random Forest Classifier:", clf_rf),
    ("Gradient Boosting Classifier:", clf_gb),
    ("Logistic Regression Classifier:", clf_lr),
    ("Support Vector Machines (SVM) Classifier:", clf_svm),
]
test_predictions = []
for heading, estimator in evaluated_models:
    print(heading)
    test_predictions.append(estimator.predict(X_test))
    print(classification_report(y_test, test_predictions[-1]))

# Keep the per-model prediction arrays available under their individual names
y_pred_rf, y_pred_gb, y_pred_lr, y_pred_svm = test_predictions

Random Forest Classifier:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       143
           1       1.00      1.00      1.00        41
           2       1.00      1.00      1.00       244

    accuracy                           1.00       428
   macro avg       1.00      1.00      1.00       428
weighted avg       1.00      1.00      1.00       428

Gradient Boosting Classifier:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       143
           1       1.00      1.00      1.00        41
           2       1.00      1.00      1.00       244

    accuracy                           1.00       428
   macro avg       1.00      1.00      1.00       428
weighted avg       1.00      1.00      1.00       428

Logistic Regression Classifier:
              precision    recall  f1-score   support

           0       1.00      0.91      0.95       143
           1       0.84      0.76      0.

In [2]:
# Destination of the labelled dataset (relative to the notebook's working directory)
csv_file_path = 'Likelihood_new.csv'

# Write the frame, including the 'Likelihood' column, without the row index
existing_unicorn_data.to_csv(path_or_buf=csv_file_path, index=False)

print(f"Dataframe has been exported to {csv_file_path}.")

Dataframe has been exported to Likelihood_new.csv.


### Implementing Lazy Predict

In [3]:
! pip install lazypredict



In [4]:
import lazypredict

In [5]:
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

# Rebuild features/target and the same 70/30 split used for the hand-picked models
y = existing_unicorn_data['Likelihood']
X = existing_unicorn_data.drop(columns='Likelihood')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit LazyPredict's classifier suite and collect its score table
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

 97%|█████████▋| 28/29 [00:03<00:00,  8.75it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001042 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 39
[LightGBM] [Info] Number of data points in the train set: 997, number of used features: 13
[LightGBM] [Info] Start training from score -1.184439
[LightGBM] [Info] Start training from score -2.642071
[LightGBM] [Info] Start training from score -0.473420


100%|██████████| 29/29 [00:04<00:00,  6.85it/s]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
XGBClassifier                      1.00               1.00    None      1.00   
ExtraTreeClassifier                1.00               1.00    None      1.00   
ExtraTreesClassifier               1.00               1.00    None      1.00   
RandomForestClassifier             1.00               1.00    None      1.00   
DecisionTreeClassifier             1.00               1.00    None      1.00   
BaggingClassifier                  1.00               1.00    None      1.00   
SVC                                0.98               0.98    None      0.98   
LabelPropagation                   0.98               0.98    None      0.98   
LabelSpreading                     0.98               0.98    None      0.98   
PassiveAggressiveClassifier        0.96               0.96    None      0.96   
KNeighborsClassifier               0.96 


