In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Prepare the Data
existing_unicorn_data = pd.read_csv('existing_unicorn_data.csv')

# Perform encoding on categorical variables (one-hot encoding)
existing_unicorn_data = pd.get_dummies(existing_unicorn_data, columns=['Industry', 'Country'])

# Analyze the distribution of industries and countries
industry_distribution = existing_unicorn_data.filter(like='Industry_').sum().to_dict()
country_distribution = existing_unicorn_data.filter(like='Country_').sum().to_dict()

# Define a function to assign likelihood labels dynamically based on distribution
def assign_likelihood_dynamic(row):
    likelihood = 0  # Default: low chance

    # Assign likelihood based on industry
    for industry, count in industry_distribution.items():
        if row[industry] == 1:
            if count > 40:  # Example threshold for high chance
                likelihood = 2  # High chance
            elif count > 20:  # Example threshold for medium chance
                likelihood = 1  # Medium chance
            break  # If industry found, break loop

    # Assign likelihood based on country
    for country, count in country_distribution.items():
        if row[country] == 1:
            if count > 80:  # Example threshold for high chance
                likelihood = 2  # High chance
            elif count > 30:  # Example threshold for medium chance
                likelihood = 1  # Medium chance
            break  # If country found, break loop

    return likelihood

# Apply the function to assign likelihood labels
existing_unicorn_data['Likelihood'] = existing_unicorn_data.apply(assign_likelihood_dynamic, axis=1)

# Train the Model
X = existing_unicorn_data.drop('Likelihood', axis=1)
y = existing_unicorn_data['Likelihood']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate the Model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       1.00      0.92      0.96        36
           2       0.99      1.00      0.99       249

    accuracy                           0.99       285
   macro avg       0.99      0.96      0.98       285
weighted avg       0.99      0.99      0.99       285



In [16]:
# Step 5: Predict on New Data

# Assuming you have new data in a DataFrame called 'new_data'
new_data = pd.read_csv('existing_unicorn_data.csv')
# Perform the same preprocessing steps as above
new_data_encoded = pd.get_dummies(new_data, columns=['Industry', 'Country'])

# Ensure the new_data_encoded DataFrame has the same columns as X_train
# Add missing columns with zeros if necessary
missing_cols = set(X_train.columns) - set(new_data_encoded.columns)
for col in missing_cols:
    new_data_encoded[col] = 0

# Make predictions
predictions = clf.predict(new_data_encoded)

# Print the predictions
predictions


array([2, 2, 2, ..., 2, 2, 2], dtype=int64)

In [19]:
import joblib

# Step 6: Export the Model
# Save the trained model to a file
joblib.dump(clf, 'unicorn_classifier_model.pkl')

# Later, you can load the model using:
# loaded_model = joblib.load('unicorn_classifier_model.pkl')



['unicorn_classifier_model.pkl']

In [20]:
import pandas as pd
import joblib

# Load the trained model
loaded_model = joblib.load('unicorn_classifier_model.pkl')

# Assuming you have new data in a DataFrame called 'new_data'
# Apply the same preprocessing steps as in the training data
# For example:
# new_data = pd.read_csv('new_data.csv')
# new_data = pd.get_dummies(new_data, columns=['Industry', 'Country'])

# Predict the likelihood of being a unicorn
new_data['Likelihood'] = loaded_model.predict(new_data.drop('Likelihood', axis=1))

# Now new_data will have a new column 'Likelihood' with predicted values
# You can then further process or analyze the predictions as needed



KeyError: "['Likelihood'] not found in axis"

In [27]:
# Specify the file path where you want to save the CSV file
csv_file_path = 'Likelihood.csv'

# Export the dataframe to a CSV file
existing_unicorn_data.to_csv(csv_file_path, index=False)

print(f"Dataframe has been exported to {csv_file_path}.")

Dataframe has been exported to Likelihood.csv.


In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Prepare the Data
existing_unicorn_data = pd.read_csv('existing_unicorn_data.csv')

# Perform encoding on categorical variables (one-hot encoding)
existing_unicorn_data = pd.get_dummies(existing_unicorn_data, columns=['Industry', 'Country'])

# Analyze the distribution of industries and countries
industry_distribution = existing_unicorn_data.filter(like='Industry_').sum().to_dict()
country_distribution = existing_unicorn_data.filter(like='Country_').sum().to_dict()

# Define a function to assign likelihood labels dynamically based on distribution
def assign_likelihood_dynamic(row):
    likelihood = 0  # Default: low chance

    # Assign likelihood based on industry
    for industry, count in industry_distribution.items():
        if row[industry] == 1:
            if count > 40:  # Example threshold for high chance
                likelihood = 2  # High chance
            elif count > 20:  # Example threshold for medium chance
                likelihood = 1  # Medium chance
            break  # If industry found, break loop

    # Assign likelihood based on country
    for country, count in country_distribution.items():
        if row[country] == 1:
            if count > 80:  # Example threshold for high chance
                likelihood = 2  # High chance
            elif count > 30:  # Example threshold for medium chance
                likelihood = 1  # Medium chance
            break  # If country found, break loop

    return likelihood

# Apply the function to assign likelihood labels
existing_unicorn_data['Likelihood'] = existing_unicorn_data.apply(assign_likelihood_dynamic, axis=1)

# Train the Models
X = existing_unicorn_data.drop('Likelihood', axis=1)
y = existing_unicorn_data['Likelihood']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest classifier
clf_rf = RandomForestClassifier(n_estimators=100, random_state=42)
clf_rf.fit(X_train, y_train)

# Train a Gradient Boosting classifier
clf_gb = GradientBoostingClassifier(random_state=42)
clf_gb.fit(X_train, y_train)

# Train a Logistic Regression classifier
clf_lr = LogisticRegression(random_state=42)
clf_lr.fit(X_train, y_train)

# Train a Support Vector Machines (SVM) classifier
clf_svm = SVC(kernel='linear', random_state=42)
clf_svm.fit(X_train, y_train)

# Evaluate the Models
print("Random Forest Classifier:")
y_pred_rf = clf_rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

print("Gradient Boosting Classifier:")
y_pred_gb = clf_gb.predict(X_test)
print(classification_report(y_test, y_pred_gb))

print("Logistic Regression Classifier:")
y_pred_lr = clf_lr.predict(X_test)
print(classification_report(y_test, y_pred_lr))

print("Support Vector Machines (SVM) Classifier:")
y_pred_svm = clf_svm.predict(X_test)
print(classification_report(y_test, y_pred_svm))


Random Forest Classifier:
              precision    recall  f1-score   support

           1       1.00      0.92      0.96        36
           2       0.99      1.00      0.99       249

    accuracy                           0.99       285
   macro avg       0.99      0.96      0.98       285
weighted avg       0.99      0.99      0.99       285

Gradient Boosting Classifier:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.92      0.96        36
           2       0.99      1.00      0.99       249

    accuracy                           0.99       285
   macro avg       0.66      0.64      0.65       285
weighted avg       0.99      0.99      0.99       285

Logistic Regression Classifier:
              precision    recall  f1-score   support

           1       1.00      0.92      0.96        36
           2       0.99      1.00      0.99       249

    accuracy                           0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
