Importing Necessary Libraries
The libraries are imported for data manipulation (pandas, numpy), machine learning (scikit-learn models, train/test splitting, and feature scaling), and performance evaluation (mean_squared_error, r2_score, accuracy_score). The plotting libraries (matplotlib, seaborn) are imported later, in the visualization cells.

In [None]:
# Import necessary libraries for data processing, machine learning, and evaluation
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.preprocessing import StandardScaler


Loading data
The dataset is loaded into the enhanced_dataset variable using pandas.read_csv() for subsequent analysis and model training.




In [None]:
# Location of the raw CSV and the DataFrame parsed from it.
enhanced_dataset_path = '/content/Drug_overdose_death_rates_by_drug_typesexageraceand_Hispanic_origin_United_States.csv'
enhanced_dataset = pd.read_csv(filepath_or_buffer=enhanced_dataset_path)

# Sanity check: header message followed by the leading rows of the frame.
print("First few rows of the enhanced dataset:", enhanced_dataset.head(), sep="\n")

Checking for missing values and displaying the missing values


In [None]:
# Count the null entries in every column of the dataset.
missing_values = enhanced_dataset.isna().sum()

# Report the per-column totals under a heading.
print("Missing values per column:", missing_values, sep="\n")

Handling the missing values in the 'ESTIMATE' and 'Unemployment Rate' columns with their respective median values to ensure no missing data affects model training. The lines for 'drug_involved' and 'death_month' are commented out as those columns are not present in the dataset. Finally, the code rechecks and prints the missing values in the dataset after handling the missing data.

In [None]:
# Median-impute the target ('ESTIMATE') and 'Unemployment Rate', in that order, so
# that neither column contains missing values afterwards.
for median_filled_column in ('ESTIMATE', 'Unemployment Rate'):
    column_median = enhanced_dataset[median_filled_column].median()
    enhanced_dataset[median_filled_column] = enhanced_dataset[median_filled_column].fillna(column_median)

# 'drug_involved' and 'death_month' were not found in the dataset, so their
# imputation lines remain commented out.
# enhanced_dataset['drug_involved'] = enhanced_dataset['drug_involved'].fillna('Unknown')
# enhanced_dataset['death_month'] = enhanced_dataset['death_month'].fillna(enhanced_dataset['death_month'].mode()[0])  # Fill with most frequent value

# Re-count the nulls to confirm the effect of the imputation.
missing_values_after = enhanced_dataset.isnull().sum()
print("\nMissing values after handling:", missing_values_after, sep="\n")

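The 'Unemployment-to-Overdose Ratio' is created by dividing Unemployment Rate by Overdose Death Rate (ESTIMATE, the target variable). Because this ratio is computed from the target, using it as a model input alongside Unemployment Rate would leak the target into the features.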

Infinite values (e.g., from division by zero) in the new ratio are replaced with NaN and then filled with the median of the column.

Finally, the enhanced dataset with the new feature is displayed, and missing values are rechecked to ensure proper data handling.

In [None]:
import numpy as np

# New feature: unemployment rate divided by the overdose death rate (ESTIMATE).
ratio_column = 'Unemployment_to_Overdose_Ratio'
raw_ratio = enhanced_dataset['Unemployment Rate'] / enhanced_dataset['ESTIMATE']

# The 'drug_involved' column was not found in the dataset, so no category codes
# are derived from it.

# Division by zero produces +/-inf: treat those entries as missing, then fill
# every missing ratio with the median of the remaining values.
finite_ratio = raw_ratio.replace([np.inf, -np.inf], np.nan)
enhanced_dataset[ratio_column] = finite_ratio.fillna(finite_ratio.median())

# Preview the new feature and confirm that no missing values are left in it.
ratio_view = enhanced_dataset[[ratio_column]]
print("\nEnhanced Dataset with New Features:", ratio_view.head(), sep="\n")
print("\nMissing values in ratio columns after handling inf/NaN:", ratio_view.isnull().sum(), sep="\n")

This code creates a scatter plot to visualize the relationship between Unemployment Rate and Overdose Death Rates in the dataset.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Unemployment rate (x axis) against overdose death rate (y axis), drawn on an
# explicitly created Axes instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    x=enhanced_dataset["Unemployment Rate"],
    y=enhanced_dataset["ESTIMATE"],
    color='blue',
    ax=ax,
)
ax.set_title("Unemployment Rate vs Overdose Death Rates")
ax.set_xlabel("Unemployment Rate (%)")
ax.set_ylabel("Drug Overdose Death Rate (per 100,000 population)")
plt.show()

This code generates a heatmap of the correlation matrix to visualize the relationships between different numerical features in the dataset.

In [None]:
# Pairwise correlations between the numeric columns, rendered as an annotated heatmap.
correlation_matrix = enhanced_dataset.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Correlation Matrix of Features")
plt.show()

This code defines the features (X) and target variable (y) for the machine learning model and then displays them.

In [None]:
# Define Features (X) and Target (y)

# Columns removed because they are the target itself ('ESTIMATE') or are treated by
# this notebook as irrelevant / non-numeric descriptors.
NON_FEATURE_COLUMNS = ['ESTIMATE', 'YEAR', 'PANEL', 'STUB_NAME', 'STUB_LABEL', 'INDICATOR', 'UNIT', 'AGE']

# 'Unemployment_to_Overdose_Ratio' is computed as Unemployment Rate / ESTIMATE. Keeping
# it next to 'Unemployment Rate' would let any model recover the target from its inputs
# (target leakage), so it is excluded. errors='ignore' keeps this cell usable even when
# the ratio column has not been created.
TARGET_DERIVED_COLUMNS = ['Unemployment_to_Overdose_Ratio']

X = (
    enhanced_dataset
    .drop(columns=NON_FEATURE_COLUMNS)
    .drop(columns=TARGET_DERIVED_COLUMNS, errors='ignore')
)
y = enhanced_dataset['ESTIMATE']  # Target variable (overdose death rate)

# Display the feature set and target variable.
# The target name is passed to print() itself; writing `print("Target (y):"), y.name`
# would only build a discarded tuple and never show the name.
print("Features (X):", X.columns)
print("Target (y):", y.name)

This code splits the dataset into training and testing sets, a necessary step in preparing data for machine learning model training and evaluation.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the dimensions of each partition.
for split_name, split_data in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(f'{split_name} shape: {split_data.shape}')

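This code trains and evaluates multiple regression models (Decision Tree, Random Forest, KNN, and Gradient Boosting) to predict drug overdose death rates from the selected features. Categorical feature columns (such as a 'drug_involved' column, when one is present) are one-hot encoded first so that the models can use them, and each model is scored with MSE and R².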

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
import pandas as pd # Import pandas for get_dummies

# Seed shared by every estimator that accepts one, so repeated runs give the same scores.
RANDOM_STATE = 42

# One-hot encode the training features. Leaving `columns` unset makes get_dummies encode
# whichever object/string/category columns exist, so this cell does not depend on a
# specific column name (e.g. 'drug_involved') being present in the dataset.
X_train = pd.get_dummies(X_train, drop_first=True)

# Encode the test split WITHOUT drop_first and then project it onto the training columns.
# Dropping the first level independently on each split could remove a different baseline
# category in the test set; reindexing instead discards categories unseen in training
# and adds training-only dummy columns filled with 0, in the training column order.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

# Initialize the models (KNeighborsRegressor has no random component to seed).
models = {
    'Decision Tree': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'Random Forest': RandomForestRegressor(random_state=RANDOM_STATE),
    'KNN': KNeighborsRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(random_state=RANDOM_STATE)
}

# Train each model on the training split and score it on the held-out split.
# model_results maps model name -> {'MSE': ..., 'R2': ...}.
model_results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    model_results[name] = {
        'MSE': mean_squared_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred),
    }

# Display the evaluation metrics for each model
print("Model Performance Results:")
for model_name, metrics in model_results.items():
    print(f"{model_name}: MSE = {metrics['MSE']}, R² = {metrics['R2']}")


This code is designed to perform binary classification on the drug overdose death rates dataset to predict whether the overdose death rate is "High" or "Low". Several classification models are trained, evaluated, and compared based on performance metrics such as precision, recall, F1-score, and accuracy.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# This cell reuses the already cleaned `enhanced_dataset`; the CSV is not reloaded.

# ESTIMATE values strictly above this threshold are labelled 'High', all others 'Low'.
HIGH_RATE_THRESHOLD = 20
# Seed shared by the split and by every estimator that accepts one.
RANDOM_STATE = 42

# Feature matrix: drop the target and the descriptor columns, plus the ratio feature.
# 'Unemployment_to_Overdose_Ratio' is Unemployment Rate / ESTIMATE, and the class label
# below is derived from ESTIMATE, so keeping it would leak the label into the features.
# errors='ignore' keeps the cell usable when the ratio column has not been created.
X_classification = (
    enhanced_dataset
    .drop(columns=['ESTIMATE', 'YEAR', 'PANEL', 'STUB_NAME', 'STUB_LABEL', 'INDICATOR', 'UNIT', 'AGE'])
    .drop(columns=['Unemployment_to_Overdose_Ratio'], errors='ignore')
)
y_classification = enhanced_dataset['ESTIMATE']

# Convert the continuous target into a binary label ('High' vs 'Low' overdose death rate).
y_class = np.where(y_classification > HIGH_RATE_THRESHOLD, "High", "Low")

# Split data into training and testing sets (80% train, 20% test)
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_classification, y_class, test_size=0.2, random_state=RANDOM_STATE
)

# One-hot encode the training features. Leaving `columns` unset encodes whichever
# object/string/category columns exist, so no specific column name (e.g. 'drug_involved')
# is required to be present.
X_train_cls = pd.get_dummies(X_train_cls, drop_first=True)

# Encode the test split without drop_first, then project it onto the training columns:
# categories unseen in training are discarded, training-only dummies are added as 0,
# and the column order matches the training matrix.
X_test_cls = pd.get_dummies(X_test_cls).reindex(columns=X_train_cls.columns, fill_value=0)

# Initialize the models (KNeighborsClassifier has no random component to seed).
models_cls = {
    'Logistic Regression': LogisticRegression(solver='liblinear', random_state=RANDOM_STATE),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'KNN': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE)
}

# model_results_cls maps model name -> {'Precision', 'Recall', 'F1-Score', 'Accuracy'}.
model_results_cls = {}

# Train each model and score it on the held-out split; 'High' is the positive class
# for precision, recall and F1.
for name, model in models_cls.items():
    model.fit(X_train_cls, y_train_cls)
    y_pred_cls = model.predict(X_test_cls)

    model_results_cls[name] = {
        'Precision': precision_score(y_test_cls, y_pred_cls, average='binary', pos_label='High'),
        'Recall': recall_score(y_test_cls, y_pred_cls, average='binary', pos_label='High'),
        'F1-Score': f1_score(y_test_cls, y_pred_cls, average='binary', pos_label='High'),
        'Accuracy': accuracy_score(y_test_cls, y_pred_cls)
    }

# Display the evaluation metrics for each model
print("Model Performance Results (Classification):")
for model_name, metrics in model_results_cls.items():
    print(f"\n{model_name}:")
    print(f"  Precision: {metrics['Precision']:.2f}")
    print(f"  Recall: {metrics['Recall']:.2f}")
    print(f"  F1-Score: {metrics['F1-Score']:.2f}")
    print(f"  Accuracy: {metrics['Accuracy']:.2f}")


This code performs hyperparameter tuning for the RandomForestRegressor model using GridSearchCV. The goal is to find the best combination of hyperparameters for improving the model's performance.

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for Random Forest: 2 x 2 x 2 = 8 candidate combinations.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 4]
}

# 3-fold cross-validated search scored by negative MSE (higher is better).
# The estimator is seeded so that the selected parameters are reproducible; an unseeded
# forest can rank near-tied candidates differently from one run to the next.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Display the best parameters found and the matching cross-validated error.
# best_score_ is the mean negative MSE, so its sign is flipped to report an MSE.
print(f"Best Parameters for Random Forest: {grid_search.best_params_}")
print(f"Best cross-validated MSE: {-grid_search.best_score_}")

Creating a bar chart to compare the R² (R-squared) values of different models, visualizing how well each model fits the data based on this evaluation metric.

In [None]:
import matplotlib.pyplot as plt

# Model names in insertion order, with the R² recorded for each of them.
model_names = [*model_results]
r2_scores = [model_results[label]['R2'] for label in model_names]

# Horizontal bar chart of R² per model, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(model_names, r2_scores, color='skyblue')
ax.set_xlabel('R² Score')
ax.set_title('Model Performance Comparison (R²)')
plt.show()

Generating a set of 4 bar charts to compare the performance of different classification models across 4 different evaluation metrics: Precision, Recall, F1-Score, and Accuracy.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# One row per model, one column per metric; the index is labelled 'Model'.
metrics_df = pd.DataFrame(model_results_cls).transpose().rename_axis('Model')

metrics_to_plot = ['Precision', 'Recall', 'F1-Score', 'Accuracy']

# One distinct colour per model.
colors = sns.color_palette('viridis', n_colors=len(metrics_df.index))

# 2x2 grid of panels, flattened so each panel can be paired with one metric.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.flatten()

for panel, metric in zip(axes, metrics_to_plot):
    # hue mirrors x (with the legend suppressed) so that `palette` can be applied
    # without triggering seaborn's FutureWarning.
    sns.barplot(
        x=metrics_df.index,
        y=metrics_df[metric],
        hue=metrics_df.index,
        ax=panel,
        palette=colors,
        legend=False,
    )
    panel.set_title(f'{metric} Comparison Across Models')
    panel.set_ylabel(metric)
    panel.set_xlabel('Model')
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()