## House Price Prediction - Linear Regression

Public Score: 0.54

In [22]:
# Optional one-off setup: uncomment to install the packages imported below
# (pandas, numpy, scikit-learn) if they are not already available.
#!pip install pandas numpy scikit-learn

In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error

In [14]:
# Read the training and test sets from the working directory
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Keep the target and the test identifiers before they are dropped below
y = train['SalePrice']
test_ids = test['Id']

# Remove the identifier (and, for train, the target) from the feature frames
train = train.drop(columns=['Id', 'SalePrice'])
test = test.drop(columns=['Id'])

In [15]:
# Handle missing values.
#   * object (categorical) columns: fill with the explicit level "Missing"
#   * all other (numerical) columns: fill with the median of the TRAINING column,
#     so the same value is used for train and test and no test statistic is used.
#
# The filled Series is assigned back to the frame instead of calling
# `frame[col].fillna(..., inplace=True)`: an inplace call on a column selection
# is chained assignment, which warns on pandas 2.x and does not update the frame
# when Copy-on-Write is enabled.
for col in train.columns:
    # Columns with no gaps in either frame need no work
    if not (train[col].isnull().any() or test[col].isnull().any()):
        continue

    if train[col].dtype == "object":
        train[col] = train[col].fillna("Missing")
        test[col] = test[col].fillna("Missing")
    else:
        median = train[col].median()
        train[col] = train[col].fillna(median)
        test[col] = test[col].fillna(median)

In [16]:
# Label-encode every object-dtype column. Each encoder is fitted on the train
# and test values together so both frames share one category-to-integer mapping.
categorical_cols = train.select_dtypes(include=['object']).columns
for col in categorical_cols:
    combined = pd.concat([train[col], test[col]], axis=0)
    le = LabelEncoder().fit(combined)
    train[col], test[col] = le.transform(train[col]), le.transform(test[col])

In [17]:
# Standardise the features: the scaler is fitted on the training frame only,
# then the same fitted scaler transforms both train and test.
scaler = StandardScaler().fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

In [18]:
# Hold out 20% of the training rows for validation (fixed seed for repeatability)
X_train, X_valid, y_train, y_valid = train_test_split(
    train,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Fit a linear regression on the training split
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [20]:
# Score the hold-out split: root of the mean squared error between the
# actual and predicted targets
y_pred = model.predict(X_valid)
mse = mean_squared_error(y_valid, y_pred)
rmse = np.sqrt(mse)
print("Validation RMSE:", rmse)

# Refit the same model on every training row, then predict the test set
model.fit(train, y)
test_predictions = model.predict(test)

Validation RMSE: 34806.054616981295


In [23]:
# Assemble the submission table (test Ids alongside their predicted prices)
# and write it to disk without the DataFrame index
submission_columns = {'Id': test_ids, 'SalePrice': test_predictions}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission_linear_regression.csv', index=False)
