## Project 2
#### Group: Manas Gandhi, Neeya Devanagondi, Rahul Kasibhatla

In [1]:
# =========================
# 1. Load & Inspect Data
# =========================

import pandas as pd
import numpy as np

# Load Excel file.
# NOTE: reading this .xls workbook needs the optional `xlrd` package
# (xlrd >= 1.0.0); without it read_excel raises ImportError.
df = pd.read_excel("Concrete_Data.xls")

# Preview
print("First 5 rows:")
display(df.head())

print("\nDataset Shape:", df.shape)

print("\nData Types and Null Values:")
# DataFrame.info() prints its report itself and returns None, so it is
# called directly; wrapping it in display() would also render "None".
df.info()

print("\nSummary Statistics:")
display(df.describe())

# Check NA values
print("\nMissing Values per Column:")
print(df.isna().sum())

# Check for exact duplicate rows
print("\nNumber of Duplicate Rows:", df.duplicated().sum())

# Check for zero values (sometimes zeros are placeholders for missing data)
print("\nZero Counts per Column:")
print((df == 0).sum())


ImportError: Missing optional dependency 'xlrd'. Install xlrd >= 1.0.0 for Excel support Use pip or conda to install xlrd.

In [None]:
# Preprocess Data

# Record the row count on either side of de-duplication so the number of
# dropped rows can be reported.
n_rows_before = len(df)
df = df.drop_duplicates()
n_rows_after = len(df)

print(f"✅ Removed {n_rows_before - n_rows_after} duplicate rows. Clean dataset shape: {df.shape}")

# Re-check missing values after de-duplication; no imputation is applied here.
remaining_missing = df.isna().sum()
print("\nRemaining Missing Values:")
print(remaining_missing)

print("\nZero values are kept as valid ingredient absences.")
df.head()

In [None]:
# Train and Test Split 70/30, with seed = 598
from sklearn.model_selection import train_test_split

# Strip stray whitespace from the headers so the target column can be
# addressed by its exact name below.
df.columns = df.columns.str.strip()

# Separate the response column from the predictors.
target_col = "Concrete compressive strength(MPa, megapascals)"
X = df.drop(columns=[target_col])
y = df[target_col]

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=598,
)

print("✅ Train/Test Split Complete")
print("Training Set Shape:", X_train.shape, y_train.shape)
print("Testing Set Shape:", X_test.shape, y_test.shape)

In [None]:
# Polynomial regression model
#
# Tunes the polynomial degree of a (PolynomialFeatures -> StandardScaler ->
# LinearRegression) pipeline with 5-fold cross-validation on the training
# split, then reports the test-set MSE of the best pipeline.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt

# Shared list of (model name, best params, test MSE) tuples accumulated
# across the model cells; create it only if no earlier cell has done so.
try:
    results
except NameError:
    results = []

# Shuffled 5-fold CV with a fixed seed so the folds are repeatable.
cv = KFold(n_splits=5, shuffle=True, random_state=598)

poly_pipe = Pipeline([
    ("poly", PolynomialFeatures(include_bias=False)),
    ("scaler", StandardScaler()),
    ("reg", LinearRegression())
])

# Only the polynomial degree is tuned.
param_grid_poly = {
    "poly__degree": [1, 2, 3, 4]
}

gs_poly = GridSearchCV(
    estimator=poly_pipe,
    param_grid=param_grid_poly,
    scoring="neg_mean_squared_error",
    cv=cv,
    n_jobs=-1,
    refit=True
)

gs_poly.fit(X_train, y_train)

# Evaluate on test set
y_pred_poly = gs_poly.best_estimator_.predict(X_test)
mse_poly = mean_squared_error(y_test, y_pred_poly)

print("Best params (Polynomial):", gs_poly.best_params_)
# The scorer is the negated MSE, so flip the sign back for reporting.
print("CV MSE (best):", -gs_poly.best_score_)
print("Test MSE (Polynomial):", mse_poly)

# Save to results list for later comparison table.
# Drop any earlier entry for this model first so that re-running the cell
# replaces the row instead of appending a duplicate. The slice assignment
# keeps the same list object.
results[:] = [row for row in results if row[0] != "Polynomial Regression"]
results.append(("Polynomial Regression", gs_poly.best_params_, float(mse_poly)))

# Optional: quick diagnostic plot of predicted vs. actual strength
fig, ax = plt.subplots(figsize=(5, 4))
ax.scatter(y_test, y_pred_poly, alpha=0.6)
ax.set_xlabel("Actual Strength (MPa)")
ax.set_ylabel("Predicted Strength (MPa)")
ax.set_title("Polynomial Regression: Predicted vs Actual")
# Common axis limits so the y=x reference line spans both data ranges.
lims = [min(y_test.min(), y_pred_poly.min()), max(y_test.max(), y_pred_poly.max())]
ax.plot(lims, lims)  # y=x line
fig.tight_layout()
plt.show()


In [None]:
# Splines model

In [None]:
# Regression Tree model

In [None]:
# Random Forest model

Model Interpretations and Comparison: