Load the dataset and separate it into training and test rows using the `DataType` column.

In [None]:
import pandas as pd

# Read the combined dataset; rows are routed to the training or test frame
# according to the value of the 'DataType' column ('train' / 'test').
data = pd.read_csv('data.csv')

# Take explicit copies: a boolean-mask selection may be treated by pandas as a
# slice of `data`, and modifying such a slice later triggers
# SettingWithCopyWarning and ambiguous write-through behaviour.
train_data = data[data['DataType'] == 'train'].copy()
test_data = data[data['DataType'] == 'test'].copy()

Visualize the distribution of SalePrice in the training data.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the training-set SalePrice values with a KDE curve overlaid,
# drawn on an explicitly created 10x6 figure.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_data['SalePrice'], kde=True, ax=ax)
ax.set_title('Distribution of SalePrice')
plt.show()

Handle missing values by filling them with the mean of the respective columns.

In [None]:
# Impute missing values with per-column means.
# numeric_only=True is required because the frame still contains string
# columns (e.g. 'DataType'); on pandas >= 2.0 a plain DataFrame.mean() raises
# on them. Non-numeric columns are therefore left unfilled.
# Reassigning (rather than inplace=True) keeps the cell safe to re-run and
# avoids mutating a slice of `data`.
train_data = train_data.fillna(train_data.mean(numeric_only=True))

Visualize the correlation between features using a heatmap.

In [None]:
# Pairwise correlation between numeric columns only: string columns such as
# 'DataType' cannot be correlated, and DataFrame.corr() raises on them in
# pandas >= 2.0 unless numeric_only=True is passed.
correlation_matrix = train_data.corr(numeric_only=True)

# Annotated heatmap of the correlation coefficients.
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation Matrix')
plt.show()

Remove any unwanted features from the dataset.

In [None]:
# Drop the columns that should not be used as features.
# The result is reassigned instead of using inplace=True so that a slice of
# `data` is never mutated, and errors='ignore' makes the cell idempotent:
# re-running it after the columns are already gone no longer raises KeyError.
train_data = train_data.drop(
    columns=['UnwantedFeature1', 'UnwantedFeature2'],
    errors='ignore',
)

Standardize the features so that each has zero mean and unit variance.

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# StandardScaler only accepts numeric input, but the frame still holds string
# columns (e.g. 'DataType', which contains 'train'), so passing every
# non-target column would make fit_transform raise. Keep the numeric
# predictors only; the target 'SalePrice' is excluded from the features.
numeric_features = train_data.drop(columns='SalePrice').select_dtypes(include='number')
data_scaled = scaler.fit_transform(numeric_features)

Encode categorical variables using one-hot encoding.

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the single 'CategoricalFeature' column of the training frame.
# NOTE(review): `categorical_features` is not referenced by any later cell
# visible in this notebook -- confirm whether it should be merged into X.
encoder = OneHotEncoder()
categorical_column = train_data[['CategoricalFeature']]
categorical_features = encoder.fit_transform(categorical_column)

Split the dataset into features (X) and target (Y).

In [None]:
# Features are the scaled matrix; the target is the SalePrice column.
X, Y = data_scaled, train_data['SalePrice']

Further split the training data into a training subset and a held-out subset (20%) for evaluating the model.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

Train a linear regression model on the training data.

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares regressor on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, Y_train)
model

Evaluate model performance using mean squared error.

In [None]:
from sklearn.metrics import mean_squared_error

# Predict on the held-out split and score with mean squared error.
predictions = model.predict(X_test)
error = mean_squared_error(Y_test, predictions)

# Surface the metric: previously it was computed but never shown, so the
# evaluation cell produced no visible result.
error

Make predictions on the test dataset.

In [None]:
# The model was trained on scaled features, so the test rows must go through
# the same preprocessing before prediction. Feeding the raw test frame (extra
# columns, unscaled values, possible NaNs) gives a shape mismatch or
# meaningless predictions.

# Use exactly the columns the scaler was fitted on, in the same order
# (feature_names_in_ is recorded when the scaler is fitted on a DataFrame).
feature_columns = list(scaler.feature_names_in_)
test_features = test_data[feature_columns]

# Impute with the *training* means so no test-set statistics leak in.
test_features = test_features.fillna(train_data[feature_columns].mean(numeric_only=True))

# Reuse the already-fitted scaler (transform only, never re-fit on test data).
final_predictions = model.predict(scaler.transform(test_features))

Prepare the submission file and save predictions to a CSV.

In [None]:
# Pair each test-row Id with its predicted SalePrice and write the result
# to 'submission.csv' without the index column.
submission_columns = {'Id': test_data['Id'], 'SalePrice': final_predictions}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)