In [None]:
!pip install ucimlrepo

In [None]:
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Step 1: Download the dataset with ucimlrepo (repository id 462)
drug_reviews_drugs_com = fetch_ucirepo(
    id=462,
)

In [None]:
# Step 2: Pull the feature table out of the fetched dataset and take a first look
features_df = drug_reviews_drugs_com.data.features
print("Features DataFrame:\n", features_df.head())

# List the columns so the target variable can be identified
print("Columns in the dataset:\n", features_df.columns)

# 'rating' is assumed to be the target column; change the name here if the
# dataset uses a different one
target_column = 'rating'  # Adjust if the actual target is different

# Separate the target series from the remaining feature columns
y = features_df[target_column]
X = features_df.drop(columns=[target_column])

# Show the dataset metadata and the per-variable descriptions
print(drug_reviews_drugs_com.metadata)
print(drug_reviews_drugs_com.variables)

In [None]:
# Step 3: Preprocess the data
# Handle text data: the free-text column (assumed to be 'review') is vectorized using TF-IDF
textual_data_column = 'review'  # Adjust this if the actual text column has a different name

# Text Vectorization using TF-IDF
# NOTE(review): the vectorizer is fitted on every row before the train/test split
# performed later in the notebook, so statistics from the held-out rows influence
# the features. Moving the vectorizer inside the model pipeline would avoid that.
if textual_data_column in X.columns:
    tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
    # Treat missing reviews as empty documents; TfidfVectorizer raises on NaN input
    review_text = X[textual_data_column].fillna('').astype(str)
    X_text = tfidf.fit_transform(review_text).toarray()
    # - The 'tfidf_' prefix keeps a vocabulary token (e.g. "condition") from
    #   duplicating the name of a column that already exists in X.
    # - Reusing X's index makes the concat below align row-for-row even when X
    #   does not carry a default RangeIndex (otherwise NaN rows would appear).
    X_text_df = pd.DataFrame(
        X_text,
        columns=[f"tfidf_{token}" for token in tfidf.get_feature_names_out()],
        index=X.index,
    )
    X = X.drop(columns=[textual_data_column])
    X = pd.concat([X, X_text_df], axis=1)

# Convert categorical variables to dummy variables
X = pd.get_dummies(X, drop_first=True)

# Visualize distribution of the target variable
plt.figure(figsize=(8, 6))
sns.histplot(y, kde=True, bins=20)
plt.title("Target Variable Distribution")
plt.show()

In [5]:
# Step 4: Split the data into training and testing sets
# 20% of the rows are held out for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Helper function to create a model pipeline
def build_model(model, degree=2):
    """Wrap a regressor in a scale -> polynomial-expand -> regress pipeline.

    Parameters
    ----------
    model : estimator
        Regressor used as the final ('regressor') step of the pipeline.
    degree : int, default=2
        Degree passed to PolynomialFeatures. The default reproduces the
        original hard-coded behaviour; the number of generated columns grows
        quickly with both the degree and the input width, so lower it (e.g. 1)
        for wide feature matrices.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps 'scaler', 'poly' and 'regressor'.
    """
    return Pipeline([
        ('scaler', StandardScaler()),  # Standardize features
        ('poly', PolynomialFeatures(degree=degree, include_bias=False)),  # Polynomial features
        ('regressor', model),  # Regression model
    ])

# Regressors to compare, keyed by the display name used in plot titles and reports
models = {}
models["Lasso Regression"] = Lasso(alpha=0.1)
models["ElasticNet Regression"] = ElasticNet(alpha=0.1, l1_ratio=0.5)

In [None]:
# Step 5: Train models and evaluate metrics
for name, model in models.items():
    # Train the model
    pipeline = build_model(model)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    cross_val_r2 = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2').mean()

    # Residuals Plot
    residuals = y_test - y_pred
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=y_pred, y=residuals)
    plt.hlines(y=0, xmin=min(y_pred), xmax=max(y_pred), colors='red', linestyles='dashed')
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title(f"Residuals Plot for {name}")
    plt.show()

    # Prediction vs Actual Plot
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=y_test, y=y_pred)
    plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', lw=2)
    plt.xlabel("Actual Values")
    plt.ylabel("Predicted Values")
    plt.title(f"Actual vs Predicted Values for {name}")
    plt.show()

    # Learning Curve Plot
    train_sizes, train_scores, test_scores = learning_curve(
        pipeline, X_train, y_train, cv=5, scoring='r2', n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 50))

    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)

    plt.figure(figsize=(10, 6))
    plt.plot(train_sizes, train_scores_mean, 'o-', color='blue', label='Training score')
    plt.plot(train_sizes, test_scores_mean, 'o-', color='red', label='Cross-validation score')
    plt.title(f"Learning Curve for {name}")
    plt.xlabel("Training Set Size")
    plt.ylabel("R^2 Score")
    plt.legend(loc="best")
    plt.show()

In [None]:
# Print metrics
print(f"{name} Metrics:")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R^2 Score: {r2}")
print(f"Cross-Validation R^2 Score: {cross_val_r2}")
print("-" * 40)