In [1]:
# %%capture
%pip install cudf-cu11 cuml-cu11 --extra-index-url=https://pypi.ngc.nvidia.com


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting cudf-cu11
  Downloading cudf_cu11-24.6.1.tar.gz (2.6 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × Preparing metadata (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [56 lines of output]
        File "C:\Users\paulo\AppData\Local\Temp\pip-build-env-bu089ex6\overlay\Lib\site-packages\nvidia_stub\wheel.py", line 147, in download_wheel
          return download_manual(wheel_directory, distribution, version)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "C:\Users\paulo\AppData\Local\Temp\pip-build-env-bu089ex6\overlay\Lib\site-packages\nvidia_stub\wheel.py", line 114, in download_manual
          raise RuntimeError(f"Didn't find wheel for {distribution} {version}")
      Traceback (most recent call last):
        File "C:\Users\paulo\AppData\Local\Temp\pip-build-env-bu089ex6\overlay\Lib\site-packages\nvidia_stub\wheel.py", line 147, in download_wheel
          return download_manual(wheel_directory, distribution, version)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [5]:
import cudf
import numpy as np
from cuml.preprocessing import StandardScaler
from cuml.cluster import KMeans
from flaml import AutoML
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Read both CSV files into GPU dataframes, using the 'id' column as the index.
train_df, test_df = (
    cudf.read_csv(csv_path, index_col='id') for csv_path in ("train.csv", "test.csv")
)

# Recode the two text-valued yes/no style columns as 0/1 integers.
gender_codes = {'Male': 1, 'Female': 0}
damage_codes = {'Yes': 1, 'No': 0}
train_df['Gender'] = train_df['Gender'].map(gender_codes)
train_df['Vehicle_Damage'] = train_df['Vehicle_Damage'].map(damage_codes)

# Names of the binary columns (kept for reference). Driving_License is removed
# from the frame because, per the original author, it has limited variability.
binary = ['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Damage', 'Response']
train_df = train_df.drop(columns=['Driving_License'])

# Group rare categories in categorical variables
def group_rare_categories(df, column, threshold=0.01):
    """Collapse infrequent values of ``df[column]`` into a single 'Other' label.

    A value is considered rare when its share of the rows (as reported by
    ``value_counts(normalize=True)``) is strictly below ``threshold``.

    Parameters
    ----------
    df : DataFrame
        Frame to update. The column is replaced in place on this object.
    column : str
        Name of the categorical column to regroup.
    threshold : float, default 0.01
        Minimum relative frequency a value needs in order to be kept.

    Returns
    -------
    DataFrame
        The same ``df`` object, for call-chaining convenience.

    Notes
    -----
    When at least one rare value exists the column is cast to strings so that
    the 'Other' label can live alongside numeric codes (a GPU column cannot mix
    numbers and text). Missing values follow whatever ``astype(str)`` does in
    the dataframe library in use. If nothing is rare the column is left
    untouched, including its dtype.
    """
    category_freq = df[column].value_counts(normalize=True)
    rare_categories = category_freq[category_freq < threshold].index
    if len(rare_categories) == 0:
        return df

    values = df[column]
    is_rare = values.isin(rare_categories)
    # Vectorised replacement: Series has no element-wise `applymap`, and a Python
    # lambda returning a string cannot be applied row by row on the GPU.
    df[column] = values.astype(str).where(~is_rare, 'Other')
    return df

# Multi-valued categorical columns; values covering less than 1% of the rows
# in each of them are regrouped by group_rare_categories.
categorical = ['Region_Code', 'Vehicle_Age', 'Policy_Sales_Channel']
for col in categorical:
    train_df = group_rare_categories(df=train_df, column=col, threshold=0.01)

# Handle continuous variables
continuous_numeric = ['Age', 'Vintage', 'Annual_Premium']

# Tukey fences on Annual_Premium: keep rows within 1.5 * IQR of the quartiles.
Q1 = train_df['Annual_Premium'].quantile(0.25)
Q3 = train_df['Annual_Premium'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# The former 'Outlier_Annual_Premium' flag column was created, made all-zero by
# this very filter and then dropped again, so it is no longer materialised.
within_bounds = (train_df['Annual_Premium'] >= lower_bound) & (train_df['Annual_Premium'] <= upper_bound)
train_df = train_df[within_bounds]

# Standardize the continuous variables
scaler = StandardScaler()
scaled_continuous_vars = scaler.fit_transform(train_df[continuous_numeric])

# Apply KMeans clustering
optimal_clusters = 4
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
clusters = kmeans.fit_predict(scaled_continuous_vars)

# Column assignment from a Series aligns on the index, and train_df carries the
# (filtered) 'id' index. Pin the labels to that index explicitly so that row i
# of the clustering input always receives label i, whatever index the
# estimator's output happens to carry.
# NOTE(review): whether cuML returns a fresh positional index here depends on
# the installed version - confirm; the explicit re-index is safe either way.
cluster_labels = cudf.Series(clusters)
cluster_labels.index = train_df.index
train_df['Cluster'] = cluster_labels

# Ordinal Encoding for Vehicle_Age and One-Hot Encoding for other categorical variables
train_df['Vehicle_Age'] = train_df['Vehicle_Age'].map({'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2})

# Vehicle_Age is already numeric at this point and is read by name in the
# feature-engineering step, so it must not be one-hot encoded (which would
# replace the column with dummies).
one_hot_columns = [name for name in categorical if name != 'Vehicle_Age']
columns_before_encoding = set(train_df.columns)
train_df = cudf.get_dummies(train_df, columns=one_hot_columns)

# Emulate drop_first=True by hand: remove the first dummy generated for each
# encoded feature so the remaining indicators are not perfectly collinear.
# NOTE(review): cudf.get_dummies documents `drop_first` as not functional in
# some releases - confirm for the installed version; this works either way.
first_dummy_columns = []
for feature in one_hot_columns:
    prefix = f"{feature}_"
    created = [
        name for name in train_df.columns
        if name not in columns_before_encoding and str(name).startswith(prefix)
    ]
    if created:
        first_dummy_columns.append(created[0])
train_df = train_df.drop(first_dummy_columns, axis=1)

# Feature engineering
def feature_engineering(df):
    """Append interaction, squared and ratio features to ``df``.

    Reads the 'Age', 'Vehicle_Age', 'Previously_Insured', 'Vehicle_Damage' and
    'Annual_Premium' columns and writes seven derived columns onto the same
    frame, which is also returned.
    """
    age = df['Age']
    vehicle_age = df['Vehicle_Age']
    insured = df['Previously_Insured']
    damage = df['Vehicle_Damage']

    # Built in insertion order so the new columns land in a fixed sequence.
    derived = {
        'Age_Vehicle_Age': age * vehicle_age,
        'Age_Previously_Insured': age * insured,
        'Vehicle_Age_Damage': vehicle_age * damage,
        'Previously_Insured_Damage': insured * damage,
        'Age_squared': age ** 2,
        'Vehicle_Age_squared': vehicle_age ** 2,
        # +1 in the denominator, as in the original formula
        'Annual_Premium_per_Age': df['Annual_Premium'] / (age + 1),
    }
    for feature_name, feature_values in derived.items():
        df[feature_name] = feature_values
    return df

train_df = feature_engineering(train_df)

# Target column on one side, every remaining column on the other.
y = train_df['Response']
X = train_df.drop(columns=['Response'])

# FLAML is fed pandas objects here, so copy both pieces off the GPU once and
# make a stratified 80/20 split from those copies.
features_host = X.to_pandas()
target_host = y.to_pandas()
X_train, X_test, y_train, y_test = train_test_split(
    features_host,
    target_host,
    test_size=0.2,
    random_state=42,
    stratify=target_host,
)

# Search configuration handed to AutoML.fit
settings = dict(
    time_budget=1800,           # search budget (30 minutes per the original note)
    metric='roc_auc',           # metric the search optimises
    task='classification',      # task type
    log_file_name="automl.log", # where the search log is written
    seed=42,                    # random seed
)

# Create the searcher and run it on the training split
automl = AutoML()
automl.fit(X_train=X_train, y_train=y_train, **settings)

# Print the best model and its parameters
print(f"Best model: {automl.best_estimator}")
print(f"Best hyperparameters: {automl.best_config}")
# FLAML minimises a loss; for roc_auc that loss is 1 - AUC, so convert it back
# before printing it under a "ROC-AUC" label.
print(f"Best ROC-AUC on validation data: {1 - automl.best_loss}")

# Score both splits with the fitted AutoML object; column index 1 of
# predict_proba is used as the score passed to roc_auc_score.
y_train_pred_prob = automl.predict_proba(X_train)[:, 1]
y_test_pred_prob = automl.predict_proba(X_test)[:, 1]

train_roc_auc = roc_auc_score(y_train, y_train_pred_prob)
test_roc_auc = roc_auc_score(y_test, y_test_pred_prob)

# Report the two ROC-AUC values side by side
print("Train ROC-AUC Score:", train_roc_auc)
print("Test ROC-AUC Score:", test_roc_auc)

# ROC curve of the held-out split, with its area reported in the legend
fpr, tpr, _ = roc_curve(y_test, y_test_pred_prob)
roc_auc = auc(fpr, tpr)

plt.figure()
roc_axes = plt.gca()
roc_axes.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1)
roc_axes.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_axes.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Receiver Operating Characteristic',
)
roc_axes.legend(loc="lower right")
plt.show()

# Refit the best model found by the search.
# `automl.best_estimator` is only the *name* of the winning learner (a string),
# so it has no fit/predict_proba; the fitted model object is `automl.model`.
# The refit uses the training split only, so the held-out split stays unseen
# and the score below remains an honest estimate.
best_model = automl.model
best_model.fit(X_train, y_train)

# Evaluate the refitted best model on the test set
y_test_pred_prob_full = best_model.predict_proba(X_test)[:, 1]
test_roc_auc_full = roc_auc_score(y_test, y_test_pred_prob_full)

# Print the test ROC-AUC score for the refitted best model
print("Test ROC-AUC Score after refitting on the training split:", test_roc_auc_full)


ModuleNotFoundError: No module named 'cudf'