In [None]:
Step 1: Understand the Data
Look at your dataset and identify columns.
Identify target variable (what you want to predict) → price.
Identify features (variables that may influence price) → e.g., bedrooms, bathrooms, sqft_living, floors, yr_built, etc.

Step 2: Clean the Data
Check for missing values or errors and decide how to handle them (remove rows, fill with average, etc.).
Ensure all selected features are in numerical format.
Convert dates or categorical variables to numbers if needed.

Step 3: Select Features
Choose features that are likely to impact the house price.
Exclude irrelevant information like street, statezip, or country (unless you plan to encode them).

Step 4: Split Data
Divide the dataset into training data (to train the model) and testing data (to evaluate performance).
Usually, 70–80% for training and 20–30% for testing.

Step 5: Train Linear Regression Model
Fit a linear regression model using the training data.
The model finds the relationship between features and the price.

Step 6: Make Predictions
Use the trained model to predict prices on the testing data.
This helps you see how well the model generalizes to new data.

Step 7: Evaluate the Model
Check how accurate the predictions are:
Mean Squared Error (MSE) → average error size
R² score → how much variance in price is explained by your model
Higher R² and lower MSE indicate a better model.

Step 8: Interpret Results
Look at the coefficients for each feature.
Positive → increases price
Negative → decreases price
This tells you which features matter most in determining house price.

In [9]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error

In [10]:
file_path = os.path.join(path, 'data.csv')
df = pd.read_csv(file_path)
df

NameError: name 'os' is not defined

In [11]:
df.info()

NameError: name 'df' is not defined

In [12]:


# ==========================================================
# 3. REMOVE UNNECESSARY COLUMNS
# ==========================================================
columns_to_drop = ["id", "date", "street", "city", "statezip", "country"]

df = df.drop(columns=columns_to_drop, errors="ignore")

print("After dropping unnecessary columns:", df.shape)

# ==========================================================
# 4. CLEAN MISSING VALUES
# ==========================================================
df = df.dropna()   # Drop missing rows
print("After dropping missing values:", df.shape)

# ==========================================================
# 5. SELECT FEATURES + TARGET
# ==========================================================
target = "price"
features = df.columns.drop(target)

X = df[features]
y = df[target]

print("Features used for training:", list(features))

# ==========================================================
# 6. TRAIN/TEST SPLIT
# ==========================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)

# ==========================================================
# 7. TRAIN LINEAR REGRESSION MODEL
# ==========================================================
model = LinearRegression()
model.fit(X_train, y_train)

# ==========================================================
# 8. MAKE PREDICTIONS
# ==========================================================
y_pred = model.predict(X_test)

# ==========================================================
# 9. EVALUATE MODEL
# ==========================================================
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nModel Evaluation:")
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

# ==========================================================
# 10. INTERPRET COEFFICIENTS
# ==========================================================
coef_df = pd.DataFrame({
    "Feature": features,
    "Coefficient": model.coef_
}).sort_values(by="Coefficient", ascending=False)

print("\nFeature Importance (Coefficients):")
print(coef_df)



NameError: name 'df' is not defined

In [5]:
df = df[df['price'] < df['price'].quantile(0.99)]
df

NameError: name 'df' is not defined

In [6]:
# ==========================================================
# 3. REMOVE UNNECESSARY COLUMNS
# ==========================================================
columns_to_drop = ["id", "date", "street", "city", "statezip", "country"]

df = df.drop(columns=columns_to_drop, errors="ignore")

print("After dropping unnecessary columns:", df.shape)

# ==========================================================
# 4. CLEAN MISSING VALUES
# ==========================================================
df = df.dropna()   # Drop missing rows
print("After dropping missing values:", df.shape)

# ==========================================================
# 5. SELECT FEATURES + TARGET
# ==========================================================
target = "price"
features = df.columns.drop(target)

X = df[features]
y = df[target]

print("Features used for training:", list(features))

# ==========================================================
# 6. TRAIN/TEST SPLIT
# ==========================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)

# ==========================================================
# 7. TRAIN LINEAR REGRESSION MODEL
# ==========================================================
model = LinearRegression()
model.fit(X_train, y_train)

# ==========================================================
# 8. MAKE PREDICTIONS
# ==========================================================
y_pred = model.predict(X_test)

# ==========================================================
# 9. EVALUATE MODEL
# ==========================================================
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nModel Evaluation:")
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

# ==========================================================
# 10. INTERPRET COEFFICIENTS
# ==========================================================
coef_df = pd.DataFrame({
    "Feature": features,
    "Coefficient": model.coef_
}).sort_values(by="Coefficient", ascending=False)

print("\nFeature Importance (Coefficients):")
print(coef_df)



NameError: name 'df' is not defined

In [7]:
from scipy import stats
import numpy as np

z_scores = np.abs(stats.zscore(df['price']))
outliers = df[z_scores > 3]
print("عدد القيم الشاذة:", outliers.shape[0])


NameError: name 'df' is not defined

In [None]:
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1

outliers = df[(df['price'] < Q1 - 1.5*IQR) | (df['price'] > Q3 + 1.5*IQR)]
print("عدد القيم الشاذة في السعر:", outliers.shape[0])


In [None]:
# Remove zero or negative prices before log-transform
df = df[df["price"] > 0]

# Log-transform price
y = np.log(df["price"])