In [1]:
#perform xgboost model to predic SalePrice......

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error



# fUNCTION TO HANDLE OUTLIERS
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df



# DATASET
train_df = pd.read_csv(r'C:\Users\nh013\Desktop\House Prices - Advanced Regression Techniques dataset\train.csv')
test_df = pd.read_csv(r'C:\Users\nh013\Desktop\House Prices - Advanced Regression Techniques dataset\test.csv')



# SEPERATE THE TARGET VARIABLE SALES PRICE FROM THE TRAIN DATASET 
target = train_df['SalePrice']




# TRANSFORM NEIGHBORHOOD INTO 0,1,2,3  BASED ON STATISTICS
neighborhood_stats = train_df.groupby('Neighborhood')['SalePrice'].median()
train_df['Neighborhood'] = train_df['Neighborhood'].map(lambda x: 0 if neighborhood_stats[x] < 150000 else (1 if neighborhood_stats[x] < 200000 else 2))

train_df.drop('SalePrice', axis=1, inplace=True)




# CONCATENATE TRAIN AND TEST  DATAFRAME  FOR PREPROCESS
all_data = pd.concat([train_df, test_df], ignore_index=True)



# IDENTIFY MISSINFG VALUES
missing_values = all_data.isnull().sum()


# SELECT_DTYPES() FUNCTION FROM PANDAS  TO  SELECT COLUMNS WITH  SPECIFIC DATA TYPE (INT64,FLOAT64) FROM THE ALL_DATA FRAME
#NUMERICAL

numeric_features = all_data.select_dtypes(include=['int64', 'float64']).columns



#SELECT_DTYPES() FUNCTION FROM PANDAS  TO  SELECT COLUMNS WITH  SPECIFIC DATA TYPE (INT64,FLOAT64) FROM THE ALL_DATA FRAME
#CATEGORICAL 

categorical_features = all_data.select_dtypes(include=['object']).columns



# FILL MISSING VALUES FOR NUMERIC FEATURES USING MEAN
all_data[numeric_features] = all_data[numeric_features].fillna(all_data[numeric_features].mean())



#  FILL MISSING VALUES FOR CATEGORICAL FEATURES USING MODE 
all_data[categorical_features] = all_data[categorical_features].fillna(all_data[categorical_features].mode().iloc[0])




# REMOVE DUPLICATES ROWS
all_data.drop_duplicates(inplace=True)


# FILL MISSING VALUES WITH FORWARD AND BACKWARD FILL
all_data.fillna(method='ffill', inplace=True)
all_data.fillna(method='bfill', inplace=True)



# HANDLE OUTLIERS IF SALE_PRICE IN NUMERIC CLUMN
if 'SalePrice' in numeric_features:
    all_data = handle_outliers(all_data, 'SalePrice')

    
    
# LET'S PERFORM FEATURE ENGINEERING  
all_data['IsGarage'] = all_data['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsFireplace'] = all_data['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsPool'] = all_data['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsSecondFloor'] = all_data['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsOpenPorch'] = all_data['OpenPorchSF'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsWoodDeck'] = all_data['WoodDeckSF'].apply(lambda x: 1 if x > 0 else 0)
all_data['TotalSqrtFeet'] = all_data['GrLivArea'] + all_data['TotalBsmtSF']
all_data['TotalBaths'] = all_data['BsmtFullBath'] + all_data['FullBath'] + all_data['BsmtHalfBath']/2 + all_data['HalfBath']/2


# ONE HOT ENCODING FOR CATEGORICAL DATA 
all_data = pd.get_dummies(all_data)


# NORMALIZE
scaler = StandardScaler()
all_data[numeric_features] = scaler.fit_transform(all_data[numeric_features])


# SPLIT THE DATA BACK  INTO TRAIN AND TEST DATAFRAME 
train_df = all_data.iloc[:len(train_df), :]
test_df = all_data.iloc[len(train_df):, :]


# SPLIT THE TRAIN DATASET INTO TRAIN AND VALIDATION SETS
X_train, X_val, y_train, y_val = train_test_split(train_df, target, test_size=0.2, random_state=42)


# MODELIZATION 
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)


# PREDICT USING TRAIN MODEL
train_predictions = xgb_model.predict(X_train)
val_predictions = xgb_model.predict(X_val)


# EVALUATE 
train_rmse = mean_squared_error(y_train, train_predictions, squared=False)
val_rmse = mean_squared_error(y_val, val_predictions, squared=False)


print("Training RMSE:", train_rmse)
print("Validation RMSE:", val_rmse)


# PREDICT SELES PRICE FOR THE TEST DATASET 
test_predictions = xgb_model.predict(test_df)


print("Test Predictions:")
print(test_predictions)


Training RMSE: 1182.705980776004
Validation RMSE: 31628.393675328505
Test Predictions:
[120744.86 143629.83 195179.44 ... 154669.16 114810.97 227237.33]


In [11]:
#apply GridSearchCV for overfitting reduce and hyperparameter tuning ..........

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from sklearn.metrics import mean_squared_error



# fUNCTION TO HANDLE OUTLIERS
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df




# DATASET
train_df = pd.read_csv(r'C:\Users\nh013\Desktop\House Prices - Advanced Regression Techniques dataset\train.csv')
test_df = pd.read_csv(r'C:\Users\nh013\Desktop\House Prices - Advanced Regression Techniques dataset\test.csv')




# SEPERATE THE TARGET VARIABLE SALES PRICE FROM THE TRAIN DATASET 
target = train_df['SalePrice']




# TRANSFORM NEIGHBORHOOD INTO 0,1,2,3  BASED ON STATISTICS
neighborhood_stats = train_df.groupby('Neighborhood')['SalePrice'].median()
train_df['Neighborhood'] = train_df['Neighborhood'].map(lambda x: 0 if neighborhood_stats[x] < 150000 else (1 if neighborhood_stats[x] < 200000 else 2))

train_df.drop('SalePrice', axis=1, inplace=True)




# CONCATENATE TRAIN AND TEST  DATAFRAME  FOR PREPROCESS
all_data = pd.concat([train_df, test_df], ignore_index=True)



# IDENTIFY MISSINFG VALUES
missing_values = all_data.isnull().sum()




# SELECT_DTYPES() FUNCTION FROM PANDAS  TO  SELECT COLUMNS WITH  SPECIFIC DATA TYPE (INT64,FLOAT64) FROM THE ALL_DATA FRAME
#NUMERICAL

numeric_features = all_data.select_dtypes(include=['int64', 'float64']).columns



#SELECT_DTYPES() FUNCTION FROM PANDAS  TO  SELECT COLUMNS WITH  SPECIFIC DATA TYPE (INT64,FLOAT64) FROM THE ALL_DATA FRAME
#CATEGORICAL 

categorical_features = all_data.select_dtypes(include=['object']).columns




# FILL MISSING VALUES FOR NUMERIC FEATURES USING MEAN
all_data[numeric_features] = all_data[numeric_features].fillna(all_data[numeric_features].mean())



#  FILL MISSING VALUES FOR CATEGORICAL FEATURES USING MODE 
all_data[categorical_features] = all_data[categorical_features].fillna(all_data[categorical_features].mode().iloc[0])




# REMOVE DUPLICATES ROWS
all_data.drop_duplicates(inplace=True)



# FILL MISSING VALUES WITH FORWARD AND BACKWARD FILL
all_data.fillna(method='ffill', inplace=True)
all_data.fillna(method='bfill', inplace=True)



# HANDLE OUTLIERS IF SALE_PRICE IN NUMERIC CLUMN
if 'SalePrice' in numeric_features:
    all_data = handle_outliers(all_data, 'SalePrice')

    
    
# LET'S PERFORM FEATURE ENGINEERING  
all_data['IsGarage'] = all_data['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsFireplace'] = all_data['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsPool'] = all_data['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsSecondFloor'] = all_data['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsOpenPorch'] = all_data['OpenPorchSF'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsWoodDeck'] = all_data['WoodDeckSF'].apply(lambda x: 1 if x > 0 else 0)
all_data['TotalSqrtFeet'] = all_data['GrLivArea'] + all_data['TotalBsmtSF']
all_data['TotalBaths'] = all_data['BsmtFullBath'] + all_data['FullBath'] + all_data['BsmtHalfBath']/2 + all_data['HalfBath']/2




# ONE HOT ENCODING FOR CATEGORICAL DATA 
all_data = pd.get_dummies(all_data)




# NORMALIZE
scaler = StandardScaler()
all_data[numeric_features] = scaler.fit_transform(all_data[numeric_features])




# SPLIT THE DATA BACK  INTO TRAIN AND TEST DATAFRAME 
train_df = all_data.iloc[:len(train_df), :]
test_df = all_data.iloc[len(train_df):, :]


# SPLIT THE TRAIN DATASET INTO TRAIN AND VALIDATION SETS
X_train, X_val, y_train, y_val = train_test_split(train_df, target, test_size=0.2, random_state=42)



# MODEL TRAINING WITH CROSS-VALIDATION
xgb_model = xgb.XGBRegressor()
params = {
    'alpha': [0.1, 0.5, 1.0],
    'lambda': [0.1, 0.5, 1.0]
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=5, scoring='neg_root_mean_squared_error')
grid_search.fit(X_train, y_train)




#THE BEST MODEL
best_model = grid_search.best_estimator_



#PREDICT USING TRAIN MODLE 
train_predictions = best_model.predict(X_train)
val_predictions = best_model.predict(X_val)



# EVALUATE 
train_rmse = mean_squared_error(y_train, train_predictions, squared=False)
val_rmse = mean_squared_error(y_val, val_predictions, squared=False)

print("Training RMSE:", train_rmse)
print("Validation RMSE:", val_rmse)



# PREDICT SELES PRICE FOR TEST DATASET 
test_predictions = best_model.predict(test_df)



#RETRIVE ORGINAL ID VALUSE FROM TRAINING DATASET 
train_ids = train_df['Id']


# GENERATE ID VALUSES  FOR THE TEST DATASET  STARTING FROM 1461
start_id = 1461
test_ids = pd.Series(range(start_id, start_id + len(test_df)))


# MAKE SUBMISSION DATAFRAME 
submission = pd.DataFrame()
submission['Id'] = test_ids
submission['SalePrice'] = test_predictions



# SET ID COL AS THE INDEX FOR SUBMISSION DATAFRAME 
submission.set_index('Id', inplace=True)



# SAVE SUBMISSION
submission.to_csv('submission.csv')

print("Submission file created successfully.")


Training RMSE: 773.034880294253
Validation RMSE: 29924.101487130258
Submission file created successfully.


In [13]:
#perform linear regression to predict sales price 

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# fUNCTION TO HANDLE OUTLIERS
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df




# DATASET
train_df = pd.read_csv(r'C:\Users\nh013\Desktop\House Prices - Advanced Regression Techniques dataset\train.csv')
test_df = pd.read_csv(r'C:\Users\nh013\Desktop\House Prices - Advanced Regression Techniques dataset\test.csv')




# SEPERATE THE TARGET VARIABLE SALES PRICE FROM THE TRAIN DATASET 
target = train_df['SalePrice']




# TRANSFORM NEIGHBORHOOD INTO 0,1,2,3  BASED ON STATISTICS
neighborhood_stats = train_df.groupby('Neighborhood')['SalePrice'].median()
train_df['Neighborhood'] = train_df['Neighborhood'].map(lambda x: 0 if neighborhood_stats[x] < 150000 else (1 if neighborhood_stats[x] < 200000 else 2))

train_df.drop('SalePrice', axis=1, inplace=True)




# CONCATENATE TRAIN AND TEST  DATAFRAME  FOR PREPROCESS
all_data = pd.concat([train_df, test_df], ignore_index=True)



# IDENTIFY MISSINFG VALUES
missing_values = all_data.isnull().sum()




# SELECT_DTYPES() FUNCTION FROM PANDAS  TO  SELECT COLUMNS WITH  SPECIFIC DATA TYPE (INT64,FLOAT64) FROM THE ALL_DATA FRAME
#NUMERICAL

numeric_features = all_data.select_dtypes(include=['int64', 'float64']).columns



#SELECT_DTYPES() FUNCTION FROM PANDAS  TO  SELECT COLUMNS WITH  SPECIFIC DATA TYPE (INT64,FLOAT64) FROM THE ALL_DATA FRAME
#CATEGORICAL 

categorical_features = all_data.select_dtypes(include=['object']).columns




# FILL MISSING VALUES FOR NUMERIC FEATURES USING MEAN
all_data[numeric_features] = all_data[numeric_features].fillna(all_data[numeric_features].mean())



#  FILL MISSING VALUES FOR CATEGORICAL FEATURES USING MODE 
all_data[categorical_features] = all_data[categorical_features].fillna(all_data[categorical_features].mode().iloc[0])




# REMOVE DUPLICATES ROWS
all_data.drop_duplicates(inplace=True)



# FILL MISSING VALUES WITH FORWARD AND BACKWARD FILL
all_data.fillna(method='ffill', inplace=True)
all_data.fillna(method='bfill', inplace=True)



# HANDLE OUTLIERS IF SALE_PRICE IN NUMERIC CLUMN
if 'SalePrice' in numeric_features:
    all_data = handle_outliers(all_data, 'SalePrice')

    
    
# LET'S PERFORM FEATURE ENGINEERING  
all_data['IsGarage'] = all_data['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsFireplace'] = all_data['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsPool'] = all_data['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsSecondFloor'] = all_data['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsOpenPorch'] = all_data['OpenPorchSF'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsWoodDeck'] = all_data['WoodDeckSF'].apply(lambda x: 1 if x > 0 else 0)
all_data['TotalSqrtFeet'] = all_data['GrLivArea'] + all_data['TotalBsmtSF']
all_data['TotalBaths'] = all_data['BsmtFullBath'] + all_data['FullBath'] + all_data['BsmtHalfBath']/2 + all_data['HalfBath']/2




# ONE HOT ENCODING FOR CATEGORICAL DATA 
all_data = pd.get_dummies(all_data)




# NORMALIZE
scaler = StandardScaler()
all_data[numeric_features] = scaler.fit_transform(all_data[numeric_features])




# SPLIT THE DATA BACK  INTO TRAIN AND TEST DATAFRAME 
train_df = all_data.iloc[:len(train_df), :]
test_df = all_data.iloc[len(train_df):, :]


# SPLIT THE TRAIN AND TEST DATASET 
X_train, X_test, y_train, y_test = train_test_split(train_df, target, test_size=0.2, random_state=42)


# MODELIZATION
model = LinearRegression()
model.fit(X_train, y_train)


# PREDICT ON TEST SET
y_pred = model.predict(X_test)

# EVALUATE 
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Mean Squared Error:', mse)
print('R^2 Score:', r2)


# PREDICT THE SALE PRICE FOR TEST SET
test_predictions = model.predict(test_df)

# MAKE SUBMISSION DATFRAME 
submission = pd.DataFrame()
submission['Id'] = range(1461, 1461 + len(test_df))
submission['SalePrice'] = test_predictions


submission.to_csv('submission.csv', index=False)

print("Submission file created successfully.")


Mean Squared Error: 4595568566.300028
R^2 Score: 0.4008639995005737
Submission file created successfully.


In [14]:
#Ridge regression is used instead of Linear regression to reduce overfitting

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# fUNCTION TO HANDLE OUTLIERS
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df




# DATASET
train_df = pd.read_csv(r'C:\Users\nh013\Desktop\House Prices - Advanced Regression Techniques dataset\train.csv')
test_df = pd.read_csv(r'C:\Users\nh013\Desktop\House Prices - Advanced Regression Techniques dataset\test.csv')




# SEPERATE THE TARGET VARIABLE SALES PRICE FROM THE TRAIN DATASET 
target = train_df['SalePrice']




# TRANSFORM NEIGHBORHOOD INTO 0,1,2,3  BASED ON STATISTICS
neighborhood_stats = train_df.groupby('Neighborhood')['SalePrice'].median()
train_df['Neighborhood'] = train_df['Neighborhood'].map(lambda x: 0 if neighborhood_stats[x] < 150000 else (1 if neighborhood_stats[x] < 200000 else 2))

train_df.drop('SalePrice', axis=1, inplace=True)




# CONCATENATE TRAIN AND TEST  DATAFRAME  FOR PREPROCESS
all_data = pd.concat([train_df, test_df], ignore_index=True)



# IDENTIFY MISSINFG VALUES
missing_values = all_data.isnull().sum()




# SELECT_DTYPES() FUNCTION FROM PANDAS  TO  SELECT COLUMNS WITH  SPECIFIC DATA TYPE (INT64,FLOAT64) FROM THE ALL_DATA FRAME
#NUMERICAL

numeric_features = all_data.select_dtypes(include=['int64', 'float64']).columns



#SELECT_DTYPES() FUNCTION FROM PANDAS  TO  SELECT COLUMNS WITH  SPECIFIC DATA TYPE (INT64,FLOAT64) FROM THE ALL_DATA FRAME
#CATEGORICAL 

categorical_features = all_data.select_dtypes(include=['object']).columns




# FILL MISSING VALUES FOR NUMERIC FEATURES USING MEAN
all_data[numeric_features] = all_data[numeric_features].fillna(all_data[numeric_features].mean())



#  FILL MISSING VALUES FOR CATEGORICAL FEATURES USING MODE 
all_data[categorical_features] = all_data[categorical_features].fillna(all_data[categorical_features].mode().iloc[0])




# REMOVE DUPLICATES ROWS
all_data.drop_duplicates(inplace=True)



# FILL MISSING VALUES WITH FORWARD AND BACKWARD FILL
all_data.fillna(method='ffill', inplace=True)
all_data.fillna(method='bfill', inplace=True)



# HANDLE OUTLIERS IF SALE_PRICE IN NUMERIC CLUMN
if 'SalePrice' in numeric_features:
    all_data = handle_outliers(all_data, 'SalePrice')

    
    
# LET'S PERFORM FEATURE ENGINEERING  
all_data['IsGarage'] = all_data['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsFireplace'] = all_data['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsPool'] = all_data['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsSecondFloor'] = all_data['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsOpenPorch'] = all_data['OpenPorchSF'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsWoodDeck'] = all_data['WoodDeckSF'].apply(lambda x: 1 if x > 0 else 0)
all_data['TotalSqrtFeet'] = all_data['GrLivArea'] + all_data['TotalBsmtSF']
all_data['TotalBaths'] = all_data['BsmtFullBath'] + all_data['FullBath'] + all_data['BsmtHalfBath']/2 + all_data['HalfBath']/2




# ONE HOT ENCODING FOR CATEGORICAL DATA 
all_data = pd.get_dummies(all_data)




# NORMALIZE
scaler = StandardScaler()
all_data[numeric_features] = scaler.fit_transform(all_data[numeric_features])




# SPLIT THE DATA BACK  INTO TRAIN AND TEST DATAFRAME 
train_df = all_data.iloc[:len(train_df), :]
test_df = all_data.iloc[len(train_df):, :]



# SPLIT TRAN AND TEST SET 
X_train, X_test, y_train, y_test = train_test_split(train_df, target, test_size=0.2, random_state=42)



# MAKE AND TRAIN RIDGES REGRESSION MODEL WITH THE HYPERPAREMETER TUNING
params = {'alpha': [0.01, 0.1, 1.0, 10.0]}
ridge = Ridge()
grid_search = GridSearchCV(ridge, params, cv=5)
grid_search.fit(X_train, y_train)
best_alpha = grid_search.best_params_['alpha']

# TRAIN MODEL WITH THE BEST ALPHA 
model = Ridge(alpha=best_alpha)
model.fit(X_train, y_train)

# PREDICT ON TEST SET 
y_pred = model.predict(X_test)

# EVALUATE 
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Mean Squared Error:', mse)
print('R^2 Score:', r2)

# PREDICT THE SALES PRICE FOR THE TEST DATFRAME 
test_predictions = model.predict(test_df)


# MAKE SUBMISSION DATFRAME 
submission = pd.DataFrame()
submission['Id'] = range(1461, 1461 + len(test_df))
submission['SalePrice'] = test_predictions


submission.to_csv('submission.csv', index=False)

print("Submission file created successfully.")


Mean Squared Error: 1000883098.3437394
R^2 Score: 0.8695123165158337
Submission file created successfully.


In [15]:
#perform gredient boosting regressor model ....to predict sales price 

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Function to handle outliers
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df


# DATASET
train_df = pd.read_csv(r'C:\Users\nh013\Desktop\House Prices - Advanced Regression Techniques dataset\train.csv')
test_df = pd.read_csv(r'C:\Users\nh013\Desktop\House Prices - Advanced Regression Techniques dataset\test.csv')

# SEPARATE THE TARGET VARIABLE 'SalePrice' FROM THE TRAIN DATASET
target = train_df['SalePrice']

# TRANSFORM 'Neighborhood' INTO 0, 1, 2, 3 BASED ON STATISTICS
neighborhood_stats = train_df.groupby('Neighborhood')['SalePrice'].median()
train_df['Neighborhood'] = train_df['Neighborhood'].map(lambda x: 0 if neighborhood_stats[x] < 150000 else (1 if neighborhood_stats[x] < 200000 else 2))

train_df.drop('SalePrice', axis=1, inplace=True)

# CONCATENATE TRAIN AND TEST DATAFRAMES FOR PREPROCESSING
all_data = pd.concat([train_df, test_df], ignore_index=True)

# IDENTIFY MISSING VALUES
missing_values = all_data.isnull().sum()

# SELECT COLUMNS WITH SPECIFIC DATA TYPES (int64, float64) FROM THE ALL_DATA DATAFRAME (NUMERIC)
numeric_features = all_data.select_dtypes(include=['int64', 'float64']).columns

# SELECT COLUMNS WITH SPECIFIC DATA TYPES (object) FROM THE ALL_DATA DATAFRAME (CATEGORICAL)
categorical_features = all_data.select_dtypes(include=['object']).columns

# FILL MISSING VALUES FOR NUMERIC FEATURES USING MEAN
all_data[numeric_features] = all_data[numeric_features].fillna(all_data[numeric_features].mean())

# FILL MISSING VALUES FOR CATEGORICAL FEATURES USING MODE
all_data[categorical_features] = all_data[categorical_features].fillna(all_data[categorical_features].mode().iloc[0])

# REMOVE DUPLICATE ROWS
all_data.drop_duplicates(inplace=True)

# FILL MISSING VALUES WITH FORWARD AND BACKWARD FILL
all_data.fillna(method='ffill', inplace=True)
all_data.fillna(method='bfill', inplace=True)

# HANDLE OUTLIERS IF 'SalePrice' IS IN NUMERIC FEATURES
if 'SalePrice' in numeric_features:
    all_data = handle_outliers(all_data, 'SalePrice')

# FEATURE ENGINEERING
all_data['IsGarage'] = all_data['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsFireplace'] = all_data['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsPool'] = all_data['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsSecondFloor'] = all_data['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsOpenPorch'] = all_data['OpenPorchSF'].apply(lambda x: 1 if x > 0 else 0)
all_data['IsWoodDeck'] = all_data['WoodDeckSF'].apply(lambda x: 1 if x > 0 else 0)
all_data['TotalSqrtFeet'] = all_data['GrLivArea'] + all_data['TotalBsmtSF']
all_data['TotalBaths'] = all_data['BsmtFullBath'] + all_data['FullBath'] + all_data['BsmtHalfBath'] / 2 + all_data['HalfBath'] / 2

# ONE-HOT ENCODING FOR CATEGORICAL DATA
all_data = pd.get_dummies(all_data)

# NORMALIZE
scaler = StandardScaler()
all_data[numeric_features] = scaler.fit_transform(all_data[numeric_features])

# SPLIT THE DATA BACK INTO TRAIN AND TEST DATAFRAMES
train_df = all_data.iloc[:len(train_df), :]
test_df = all_data.iloc[len(train_df):, :]

# PREPARE TRAIN AND TEST SETS
X_train, X_test, y_train, y_test = train_test_split(train_df, target, test_size=0.2, random_state=42)

# GRADIENT BOOSTING REGRESSOR
params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [3, 4, 5]
}

gb_regressor = GradientBoostingRegressor()

# GRID SEARCH FOR HYPERPARAMETER TUNING
grid_search = GridSearchCV(gb_regressor, params, cv=5)
grid_search.fit(X_train, y_train)

# BEST ESTIMATOR
best_gb_regressor = grid_search.best_estimator_

# PREDICT ON THE TEST SET
y_pred = best_gb_regressor.predict(X_test)

# EVALUATE THE MODEL
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Mean Squared Error:', mse)
print('R^2 Score:', r2)

# PREDICT THE SALE PRICES FOR THE TEST DATASET
test_predictions = best_gb_regressor.predict(test_df)

# CREATE SUBMISSION DATAFRAME
submission = pd.DataFrame()
submission['Id'] = range(1461, 1461 + len(test_df))
submission['SalePrice'] = test_predictions


submission.to_csv('submission.csv', index=False)

print("Submission file created successfully.")


Mean Squared Error: 722475960.4532174
R^2 Score: 0.9058089654940289
Submission file created successfully.
