# Housing Analysis Script

## Data Exploration

```python
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

# Absolute Windows path of the housing CSV
FILE_PATH = "C:\\Users\\mahlo\\OneDrive\\Documents\\HousingAnalysis\\Housing.csv"

# Read the CSV into the DataFrame that the exploration cells work on
df = pd.read_csv(filepath_or_buffer=FILE_PATH)

# Preview the top five rows (five is the default for head)
df.head(n=5)


## Exploratory Data Analysis (EDA)

In [None]:
# Summary statistics of df; the trailing bare name makes the notebook display it
summary_stats = df.describe()
summary_stats


In [None]:
# Number of missing entries per column (isna is the alias of isnull)
df.isna().sum()


In [None]:
# One histogram with a KDE overlay per numerical column, each shown as its own figure
numerical_features = ['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking']
for column, values in df[numerical_features].items():
    sns.histplot(values, kde=True)
    plt.title(f'Distribution of {column}')
    plt.show()


In [None]:
# Bar chart of category counts for every categorical column, one figure each
categorical_features = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
    'furnishingstatus',
]
for column in categorical_features:
    sns.countplot(data=df, x=column)
    plt.title(f'Count of Houses by {column}')
    plt.show()


In [None]:
# Visualize relationships between features and the target variable.
# 'price' is the y-axis of every panel, so it is left out of the x-axis
# variables: a price-vs-price panel would only show the identity line.
predictor_features = [name for name in numerical_features if name != 'price']
sns.pairplot(df, x_vars=predictor_features, y_vars=['price'], height=5, aspect=0.7)
# y > 1 lifts the figure title above the axes so it does not overlap the panels
plt.suptitle('Relationships between Numerical Features and Price', y=1.02)
plt.show()


In [None]:
# Price distribution within each level of every categorical column, one figure each
for column in categorical_features:
    sns.boxplot(data=df, x=column, y='price')
    plt.title(f'Boxplot of Price by {column}')
    plt.show()


## Simple Linear Regression

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import matplotlib.pyplot as plt

# Load the dataset.
# Reuse the FILE_PATH constant defined in the Data Exploration cell rather than
# spelling the same CSV location a second time, so the path only needs to be
# changed in one place. `file_path` is kept as a name for anything that reads it.
file_path = FILE_PATH
housing_data = pd.read_csv(file_path)


In [None]:
# Predictor and target: the list selection keeps X a one-column DataFrame
# rather than a Series; y is the 'price' column.
predictor_columns = ['area']
X = housing_data[predictor_columns]
y = housing_data['price']

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Build the estimator and train it in one step (fit returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predicted prices for the held-out rows
y_pred = model.predict(X_test)

# Test-set error: mean squared error, and its square root rounded to two decimals
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = round(mse ** 0.5, 2)


In [None]:
# Held-out observations in black with the model's predictions drawn over them in blue
plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, y_pred, linewidth=3, color='blue')
plt.xlabel('Area')
plt.ylabel('Price')
plt.title('Simple Linear Regression')
plt.show()


In [None]:
# Report the fitted parameters and the test-set error, one item per line
print(
    f'Coefficients: {model.coef_}',
    f'Intercept: {model.intercept_}',
    f'Mean Squared Error (MSE): {mse}',
    f'Root Mean Squared Error (RMSE): {rmse}',
    sep='\n',
)


## Multiple Linear Regression

In [None]:
# Multiple Linear Regression Model
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import numpy as np  # Import NumPy

# Predictor columns and target for the multi-feature model.
# `df` is the DataFrame loaded from the CSV in the Data Exploration cell.
features = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
X = df.loc[:, features]
y = df.loc[:, 'price']


In [None]:
# 80/20 train/test split with a fixed random_state so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build and train the multi-feature model in one step (fit returns the estimator)
multiple_model = LinearRegression().fit(X_train, y_train)


In [None]:
# Predicted prices for the held-out rows
y_pred_multiple = multiple_model.predict(X_test)

# Bind each metric to a name first, then report them
mae_multiple = metrics.mean_absolute_error(y_test, y_pred_multiple)
mse_multiple = metrics.mean_squared_error(y_test, y_pred_multiple)
rmse_multiple = np.sqrt(mse_multiple)

print('Multiple Linear Regression Metrics:')
print('Mean Absolute Error:', mae_multiple)
print('Mean Squared Error:', mse_multiple)
print('Root Mean Squared Error:', rmse_multiple)


In [None]:
# Fitted parameters of the multi-feature model
coefficients = multiple_model.coef_
intercept = multiple_model.intercept_
print('\nCoefficients:', coefficients)
print('Intercept:', intercept)


In [None]:
# Scatter of the model's predictions against the true test-set prices
plt.scatter(x=y_test, y=y_pred_multiple)
plt.title('Actual vs. Predicted Prices (Multiple Linear Regression)')
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.show()


In [None]:
# Display multiple linear regression results and evaluation metrics.
# Each error metric is computed once and reused. R^2 (coefficient of
# determination) is reported as well, giving a scale-free measure of fit
# alongside the price-unit errors.
mae_summary = metrics.mean_absolute_error(y_test, y_pred_multiple)
mse_summary = metrics.mean_squared_error(y_test, y_pred_multiple)
print('Multiple Linear Regression Metrics:')
print('Mean Absolute Error:', mae_summary)
print('Mean Squared Error:', mse_summary)
print('Root Mean Squared Error:', np.sqrt(mse_summary))
print('R-squared:', metrics.r2_score(y_test, y_pred_multiple))


In [None]:
# Print the fitted coefficients and intercept, one labelled line each
for label, value in (
    ('\nCoefficients:', multiple_model.coef_),
    ('Intercept:', multiple_model.intercept_),
):
    print(label, value)
