In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Pre-bind df so the name exists at module level even if loading fails.
df = None

# Load the dataset. The CSV has no header row, so pandas assigns default
# integer column labels (0, 1, 2, ...); the last column is assumed to be the
# target.
try:
    # Keep only the read inside the try so the handlers below report genuine
    # loading failures and nothing else.
    df = pd.read_csv('data.csv', header=None)
except FileNotFoundError:
    print("Error: data.csv not found. Please ensure the file is in the correct directory.")
    print("Exiting due to data loading error.")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # added by the `site` module for interactive sessions and Python's docs
    # advise against using it in programs. The non-zero status also marks the
    # run as failed.
    raise SystemExit(1)
except Exception as e:
    # Top-level boundary: report any other failure of the read and stop.
    print(f"An unexpected error occurred while loading the dataset: {e}")
    print("Exiting due to data loading error.")
    raise SystemExit(1)
else:
    print("Dataset loaded successfully without header.")

    # Dynamically determine the number of columns
    num_columns = df.shape[1]

    # Rename the columns to 'feature_0', 'feature_1', ..., 'feature_{N-2}',
    # with the last column (index num_columns - 1) named 'price'.
    new_column_names = [f'feature_{i}' for i in range(num_columns - 1)] + ['price']
    df.columns = new_column_names
    print(f"Columns renamed to: {new_column_names}")

# Quick inspection of the loaded data: a preview, the schema, then summary
# statistics.

# Preview the leading rows, now carrying the renamed columns.
print("\nFirst 5 rows of the dataset:")
head_rows = df.head()
print(head_rows)

# DataFrame.info() prints its own report, so it is not wrapped in print().
print("\nDataset Info:")
df.info()

# Summary statistics as returned by DataFrame.describe().
print("\nDescriptive Statistics:")
summary_stats = df.describe()
print(summary_stats)

# --- Data Preprocessing ---

# Handle missing values.
# Numerical columns: fill gaps with the column median.
for column in df.select_dtypes(include=np.number).columns:
    if df[column].isnull().any():
        median_val = df[column].median()
        # Assign the filled column back instead of calling
        # df[column].fillna(..., inplace=True): that form is chained in-place
        # assignment on an intermediate object, which pandas warns about and
        # which does not update df when Copy-on-Write is enabled.
        df[column] = df[column].fillna(median_val)
        print(f"Filled missing values in numerical column '{column}' with median: {median_val}")

# Categorical (object-dtype) columns: fill gaps with the most frequent value.
# The dataset this script was written for is described as all-numerical, so
# this loop is kept for robustness with other datasets.
for column in df.select_dtypes(include='object').columns:
    if df[column].isnull().any():
        modes = df[column].mode()
        if modes.empty:
            # No non-null values at all, so there is no mode to fill with.
            print(f"Skipped categorical column '{column}': no non-missing values to compute a mode.")
            continue
        mode_val = modes.iloc[0]
        df[column] = df[column].fillna(mode_val)
        print(f"Filled missing values in categorical column '{column}' with mode: {mode_val}")

# Report the remaining missing-value count per column.
print("\nMissing values after handling:")
print(df.isnull().sum())

# Encode categorical features.
# Every object-dtype column is replaced by the integer codes produced by a
# fresh LabelEncoder. When there are no such columns, a notice is printed and
# the loop below simply has nothing to iterate over.
categorical_cols = df.select_dtypes(include='object').columns
if categorical_cols.empty:
    print("\nNo categorical columns found to encode.")

for col in categorical_cols:
    le = LabelEncoder()
    encoded_values = le.fit_transform(df[col])
    df[col] = encoded_values
    print(f"Encoded categorical column: {col}")


# Define features (X) and target (y).
# The loading step names the last column 'price'; this guard is a safety net
# that stops the run if that column is missing for any reason.
if 'price' not in df.columns:
    print("Critical Error: 'price' column not found after renaming. Please check data loading logic.")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # added by the `site` module for interactive sessions and Python's docs
    # advise against using it in programs. The non-zero status marks the run
    # as failed.
    raise SystemExit(1)

# Every column except 'price' is a feature; 'price' is the target.
X = df.drop('price', axis=1)
y = df['price']
print("\nFeatures (X) and Target (y) defined.")

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("\nData split into training (80%) and testing (20%) sets.")

# Report the shapes of both partitions.
for part_name, part_X, part_y in (
    ("train", X_train, y_train),
    ("test", X_test, y_test),
):
    print(f"X_{part_name} shape: {part_X.shape}, y_{part_name} shape: {part_y.shape}")

# --- Model Training ---

# Hyperparameters for the random forest: 100 trees, a fixed seed for
# reproducibility, and n_jobs=-1 for parallel fitting. These can be tuned.
rf_params = {
    "n_estimators": 100,
    "random_state": 42,
    "n_jobs": -1,
}
model = RandomForestRegressor(**rf_params)
print("\nRandomForestRegressor model initialized.")

# Fit on the training partition only.
print("Training the model...")
model.fit(X_train, y_train)
print("Model training complete.")

# --- Model Evaluation ---

# Predict on the held-out test partition.
y_pred = model.predict(X_test)
print("\nPredictions made on the test set.")

# Compute the error metrics; RMSE is the square root of MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print every metric with two decimal places.
print("\n--- Model Evaluation Results ---")
for metric_label, metric_value in (
    ("Mean Absolute Error (MAE)", mae),
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("R-squared (R2) Score", r2),
):
    print(f"{metric_label}: {metric_value:.2f}")

# Feature importances (optional): label the fitted model's importances with
# the feature column names and list them from highest to lowest.
print("\n--- Feature Importances ---")
importance_by_feature = pd.Series(model.feature_importances_, index=X.columns)
feature_importances = importance_by_feature.sort_values(ascending=False)
print(feature_importances)

# Example of a single prediction (optional).
# For demonstration the first row of the test set stands in for a new house.
# Real inputs must have the same feature order and types as the training data.
if X_test.empty:
    print("\nNo test data available for example prediction.")
else:
    # Double brackets keep the row as a one-row DataFrame, which is the
    # 2-D input that predict() is given here.
    sample_house = X_test.iloc[[0]]
    actual_price = y_test.iloc[0]
    predicted_price = model.predict(sample_house)[0]

    print("\n--- Example Prediction ---")
    print(f"Sample House Features:\n{sample_house.to_string()}")
    print(f"Actual Price: {actual_price:.2f}")
    print(f"Predicted Price: {predicted_price:.2f}")


Dataset loaded successfully without header.
Columns renamed to: ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11', 'feature_12', 'price']

First 5 rows of the dataset:
   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0    0.00632       18.0       2.31          0      0.538      6.575   
1    0.02731        0.0       7.07          0      0.469      6.421   
2    0.02729        0.0       7.07          0      0.469      7.185   
3    0.03237        0.0       2.18          0      0.458      6.998   
4    0.06905        0.0       2.18          0      0.458      7.147   

   feature_6  feature_7  feature_8  feature_9  feature_10  feature_11  \
0       65.2     4.0900          1        296        15.3      396.90   
1       78.9     4.9671          2        242        17.8      396.90   
2       61.1     4.9671          2        242        17.8      392.83   
3    