## House Price Prediction

### Public Scores: 
- Linear Regression: 0.543
- Decision Tree: 0.216
- Random Forest: 0.150
- Gradient Boosting Machine: 0.138
- Extreme Gradient Boosting: 0.142
- Neural Network: 0.634
- Stacking Techniques: 0.142

In [22]:
#!pip install pandas numpy scikit-learn

In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error

In [14]:
# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Target variable
y = train['SalePrice']

# Drop irrelevant features
train = train.drop(['Id', 'SalePrice'], axis=1)
test_ids = test['Id']
test = test.drop(['Id'], axis=1)

In [15]:
# Handle missing values (simple strategy: fill with median for numerical, mode for categorical)
for col in train.columns:
    if train[col].isnull().any() or test[col].isnull().any():
        if train[col].dtype == "object":
            # Fill categorical missing values with "Missing"
            train[col].fillna("Missing", inplace=True)
            test[col].fillna("Missing", inplace=True)
        else:
            # Fill numerical missing values with the median
            median = train[col].median()
            train[col].fillna(median, inplace=True)
            test[col].fillna(median, inplace=True)

In [16]:
# Encode categorical variables using Label Encoding
for col in train.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    combined = pd.concat([train[col], test[col]], axis=0)  # Combine to ensure consistency
    le.fit(combined)
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [17]:
# Feature Scaling
scaler = StandardScaler()
train = scaler.fit_transform(train)
test = scaler.transform(test)

In [18]:
# Train/test split
X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=0.2, random_state=42)

### Linear Regression

In [19]:
# Train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

In [20]:
# Predict on the validation set and calculate RMSE
y_pred = model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print("Validation RMSE:", rmse)

# Train on the full dataset and predict on the test set
model.fit(train, y)
test_predictions = model.predict(test)

Validation RMSE: 34806.054616981295


In [23]:
# Prepare the submission file
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': test_predictions
})
submission.to_csv('submission_linear_regression.csv', index=False)


### Decision Tree

In [24]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor(max_depth=5)  # Limit depth to prevent overfitting
model.fit(X_train, y_train)
test_predictions = model.predict(test)

# Prepare the submission file
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': test_predictions
})
submission.to_csv('submission_decision_tree.csv', index=False)

### Random Forest

In [27]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model.fit(X_train, y_train)
test_predictions = model.predict(test)

# Prepare the submission file
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': test_predictions
})
submission.to_csv('submission_random_forest.csv', index=False)

### Gradient Boosting Machines (GBM)

In [28]:
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
model.fit(X_train, y_train)
test_predictions = model.predict(test)

# Prepare the submission file
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': test_predictions
})
submission.to_csv('submission_gbm.csv', index=False)

### XGBoost (Extreme Gradient Boosting)

In [30]:
#!pip install xgboost
from xgboost import XGBRegressor

model = XGBRegressor(n_estimators=100, learning_rate=0.05, max_depth=4, random_state=42)
model.fit(X_train, y_train)
test_predictions = model.predict(test)

# Prepare the submission file
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': test_predictions
})
submission.to_csv('submission_xgb.csv', index=False)

### Neural Networks

In [36]:
#!pip install tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)
])

# y_train = y_train.values.squeeze()

model.compile(optimizer='adam', loss='mse')
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)
test_predictions = model.predict(test).squeeze()

# Prepare the submission file
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': test_predictions
})
submission.to_csv('submission_nn.csv', index=False)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

### Stacking Techniques

In [38]:
from sklearn.ensemble import StackingRegressor

estimators = [
    ('rf', RandomForestRegressor(n_estimators=50, random_state=42)),
    ('gb', GradientBoostingRegressor(n_estimators=50, random_state=42)),
    ('xgb', XGBRegressor(n_estimators=50, random_state=42))
]
model = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())
model.fit(X_train, y_train)
test_predictions = model.predict(test)

# Prepare the submission file
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': test_predictions
})
submission.to_csv('submission_stacking.csv', index=False)
