In [None]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
# NOTE(review): this installs a blanket 'ignore' filter, so every warning
# category (deprecations, numerical RuntimeWarnings, ...) is silenced for the
# rest of the session. Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')


# --- Evaluate with RMSLE ---
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of y_pred against y_true.

    Predictions below zero are raised to zero before the transform, so
    ``log1p`` is never evaluated on a negative prediction. ``y_true`` is
    passed to ``log1p`` unchanged.
    """
    log_true = np.log1p(y_true)
    log_pred = np.log1p(np.maximum(y_pred, 0))
    mean_sq_log_error = mean_squared_error(log_true, log_pred)
    return np.sqrt(mean_sq_log_error)


# Read the training CSV from the Kaggle input directory and preview the first rows.
TRAIN_CSV_PATH = '/kaggle/input/playground-series-s4e4/train.csv'
df_train = pd.read_csv(TRAIN_CSV_PATH)
df_train.head()

In [None]:
# Replace spaces and dots in the column names with underscores.
# regex=False forces a literal match: "." is a regular-expression
# metacharacter and the default value of `regex` differs between pandas versions.
df_train.columns = (
    df_train.columns
    .str.replace(" ", "_", regex=False)
    .str.replace(".", "_", regex=False)
)

# Model 1 features: every column except the identifier, 'Sex' and the target.
X1 = df_train.drop(columns=['id', 'Sex', 'Rings'])
y = df_train['Rings']

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X1_train, X1_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=55)

# Fit on the training part and score the held-out part with RMSLE.
model1 = LinearRegression().fit(X1_train, y_train)
rmsle1 = rmsle(y_test, model1.predict(X1_test))
print(f"Model 1 RMSLE (no Sex): {rmsle1:.4f}")

In [None]:
# --- Assumption Checks for Model ---
# statsmodels' OLS does not include an intercept on its own, so a constant
# column is added to the training features first.
X1_train_const = sm.add_constant(X1_train)
ols_spec1 = sm.OLS(endog=y_train, exog=X1_train_const)
ols_model1 = ols_spec1.fit()
print(ols_model1.summary())

In [None]:
# Residuals of the statsmodels fit on the training rows
fitted1 = ols_model1.predict(X1_train_const)
residuals1 = y_train - fitted1

# Linearity & Homoscedasticity: residuals plotted against the fitted values,
# with a dashed reference line at zero
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(x=fitted1, y=residuals1, ax=ax)
ax.axhline(0, color='red', linestyle='--')
ax.set_title("Residuals vs Fitted (1)")
ax.set_xlabel("Fitted Values")
ax.set_ylabel("Residuals")
fig.tight_layout()
plt.show()

In [None]:
# Normality of residuals: histogram with a kernel density estimate overlaid
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(residuals1, kde=True, ax=ax)
ax.set_title("Histogram of Residuals (1)")
fig.tight_layout()
plt.show()

In [None]:
# Multicollinearity (VIF)
# variance_inflation_factor regresses each column on the remaining ones and
# does not add an intercept itself, so the design matrix must carry its own
# constant column; without it the reported VIFs are uncentered and inflated.
vif_exog1 = sm.add_constant(X1_train)

# Report the VIF of the real features only (the constant is not a feature);
# columns are looked up by name so the position of the constant does not matter.
vif_data1 = pd.DataFrame({
    "Feature": X1_train.columns,
    "VIF": [
        variance_inflation_factor(vif_exog1.values, vif_exog1.columns.get_loc(col))
        for col in X1_train.columns
    ],
})
print(vif_data1)

In [None]:
# --- Load test data ---
test_df = pd.read_csv("/kaggle/input/playground-series-s4e4/test.csv")
# Keep the identifiers for the submission file.
test_ids = test_df["id"]

# Clean column names with the same literal replacements used for the training
# frame. regex=False forces a literal match: "." is a regular-expression
# metacharacter and the default value of `regex` differs between pandas versions.
test_df.columns = (
    test_df.columns
    .str.replace(" ", "_", regex=False)
    .str.replace(".", "_", regex=False)
)

# --- Predict using Model 1 (no Sex) ---
# Select exactly the columns model1 was fitted on, in the same order, so the
# feature matrix cannot drift from the training layout (this leaves out
# 'id' and 'Sex', which were not training features).
X_test_model1 = test_df[X1_train.columns]
preds_model1 = model1.predict(X_test_model1)
preds_model1 = np.maximum(0, preds_model1)  # Ensure no negative predictions

# --- Save submission files ---
submission1 = pd.DataFrame({'id': test_ids, 'Rings': preds_model1})
submission1.to_csv("/kaggle/working/submission_model1.csv", index=False)

print("Submissions saved: submission_model1.csv")