<a href="https://colab.research.google.com/github/monahafez-tmu/-EE8230-701E_Machine-Learning-for-Engineers-Project-7/blob/main/Term_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install ucimlrepo

from ucimlrepo import fetch_ucirepo

# fetch dataset
wine_quality = fetch_ucirepo(id=186)

# data (as pandas dataframes)
X = wine_quality.data.features
y = wine_quality.data.targets

# metadata
print(wine_quality.metadata)

# variable information
print(wine_quality.variables)


Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
{'uci_id': 186, 'name': 'Wine Quality', 'repository_url': 'https://archive.ics.uci.edu/dataset/186/wine+quality', 'data_url': 'https://archive.ics.uci.edu/static/public/186/data.csv', 'abstract': 'Two datasets are included, related to red and white vinho verde wine samples, from the north of Portugal. The goal is to model wine quality based on physicochemical tests (see [Cortez et al., 2009], http://www3.dsi.uminho.pt/pcortez/wine/).', 'area': 'Business', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate'], 'num_instances': 4898, 'num_features': 11, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['quality'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2009, 'last_updat

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo

# fetch dataset
wine_quality = fetch_ucirepo(id=186)
# data (as pandas dataframes)
X = wine_quality.data.features
y = wine_quality.data.targets

# metadata
print(wine_quality.metadata)


red_wine_mask = (wine_quality.data.original['color'] == 'red')

X = X[red_wine_mask]
y = y[red_wine_mask]

print(f"Shape of features for red wine (X): {X.shape}")
print(f"Shape of target for red wine (y): {y.shape}")

# --- Splitting the Data ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Standardize Features ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Model Training and Evaluation ---
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(random_state=42),
    "Lasso Regression": Lasso(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
}

results = {}

for name, model in models.items():
    # Train the model
    if name in ["Linear Regression", "Ridge Regression", "Lasso Regression"]:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results[name] = {
        "MAE": mae,
        "MSE": mse,
        "R2": r2
    }

# Print results (or save to a file for later use)
print("\nModel Performance Summary:")
for name, metrics in results.items():
    print(f"--- {name} ---")
    print(f"  MAE: {metrics['MAE']:.4f}")
    print(f"  MSE: {metrics['MSE']:.4f}")
    print(f"  R2: {metrics['R2']:.4f}")

# Feature Importance for Random Forest
if "Random Forest Regressor" in models:
    rf_model = models["Random Forest Regressor"]
    importances = rf_model.feature_importances_
    feature_names = X.columns
    sorted_indices = np.argsort(importances)[::-1]

    print("\nRandom Forest Feature Importance:")
    for i in sorted_indices:
        print(f"  {feature_names[i]}: {importances[i]:.4f}")

# Create a combined DataFrame for red wine data for plotting
red_wine_df = pd.concat([X, y], axis=1)

# --- Create Visualizations ---
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (16, 12)

fig = plt.figure(figsize=(16, 12))

# Distribution of Alcohol (Feature Variable)
ax1 = plt.subplot(3, 3, 1)
sns.histplot(red_wine_df['alcohol'], bins=30, kde=True, color='darkred', ax=ax1)
ax1.set_title('1. Distribution of Alcohol Content', fontsize=12, fontweight='bold')
ax1.set_xlabel('Alcohol (%)')
ax1.set_ylabel('Frequency')

# Correlation Matrix
ax2 = plt.subplot(3, 3, 2)
corr_matrix = red_wine_df.corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, annot=False, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8}, ax=ax2)
ax2.set_title('2. Correlation Matrix', fontsize=12, fontweight='bold')

# Alcohol vs Density
ax3 = plt.subplot(3, 3, 3)
sns.scatterplot(data=red_wine_df, x='density', y='alcohol', alpha=0.5, color='darkblue', ax=ax3)
ax3.set_title('3. Alcohol vs Density', fontsize=12, fontweight='bold')
ax3.set_xlabel('Density')
ax3.set_ylabel('Alcohol (%)')

# Alcohol vs Quality
ax4 = plt.subplot(3, 3, 4)
sns.boxplot(data=red_wine_df, x='quality', y='alcohol', palette='Reds', ax=ax4)
ax4.set_title('4. Alcohol vs Quality', fontsize=12, fontweight='bold')
ax4.set_xlabel('Quality Score')
ax4.set_ylabel('Alcohol (%)')

# Alcohol vs Volatile Acidity
ax5 = plt.subplot(3, 3, 5)
sns.scatterplot(data=red_wine_df, x='volatile_acidity', y='alcohol', alpha=0.5, color='orange', ax=ax5)
ax5.set_title('5. Alcohol vs Volatile Acidity', fontsize=12, fontweight='bold')
ax5.set_xlabel('Volatile Acidity')
ax5.set_ylabel('Alcohol (%)')

# Alcohol vs Chlorides
ax6 = plt.subplot(3, 3, 6)
sns.scatterplot(data=red_wine_df, x='chlorides', y='alcohol', alpha=0.5, color='teal', ax=ax6)
ax6.set_title('6. Alcohol vs Chlorides', fontsize=12, fontweight='bold')
ax6.set_xlabel('Chlorides')
ax6.set_ylabel('Alcohol (%)')

# Alcohol vs Total Sulfur Dioxide
ax7 = plt.subplot(3, 3, 7)
sns.scatterplot(data=red_wine_df, x='total_sulfur_dioxide', y='alcohol', alpha=0.5, color='brown', ax=ax7)
ax7.set_title('7. Alcohol vs Total Sulfur Dioxide', fontsize=12, fontweight='bold')
ax7.set_xlabel('Total Sulfur Dioxide')
ax7.set_ylabel('Alcohol (%)')

plt.tight_layout()
output_filename = 'eda_visualizations.png'
plt.savefig(output_filename, dpi=300, bbox_inches='tight')
#########print(f"Visualizations saved to {output_filename}")

[31mERROR: Operation cancelled by user[0m[31m
[0mTraceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/commands/install.py", line 447, in run
    conflicts = self._determine_conflicts(to_install)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/commands/install.py", line 578, in _determine_conflicts
    return check_install_conflicts(to_install)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/operations/check.py", line 101, in check_install_conflicts
    package_set, _

KeyboardInterrupt: 