In [1]:
import pandas as pd 
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import MaxAbsScaler,OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from math import sqrt
import concurrent.futures

In [2]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

## Handle Nan

In [4]:
print(train.isna().sum().sum())
print(test.isna().sum().sum())

6965
7000


### train_Nan

In [5]:
for c in train.columns:
    if(train[c].isna().sum()>0):
        print(train[c].name+" "+str(train[c].isna().mean()*100))

LotFrontage 17.73972602739726
Alley 93.76712328767123
MasVnrType 0.547945205479452
MasVnrArea 0.547945205479452
BsmtQual 2.5342465753424657
BsmtCond 2.5342465753424657
BsmtExposure 2.6027397260273974
BsmtFinType1 2.5342465753424657
BsmtFinType2 2.6027397260273974
Electrical 0.0684931506849315
FireplaceQu 47.26027397260274
GarageType 5.5479452054794525
GarageYrBlt 5.5479452054794525
GarageFinish 5.5479452054794525
GarageQual 5.5479452054794525
GarageCond 5.5479452054794525
PoolQC 99.52054794520548
Fence 80.75342465753424
MiscFeature 96.30136986301369


### test_Nan

In [6]:
for c in test.columns:
    if(test[c].isna().sum()>0):
        print(test[c].name+" "+str(test[c].isna().mean()*100))

MSZoning 0.2741603838245374
LotFrontage 15.558601782042494
Alley 92.66620973269363
Utilities 0.1370801919122687
Exterior1st 0.06854009595613435
Exterior2nd 0.06854009595613435
MasVnrType 1.0966415352981496
MasVnrArea 1.0281014393420151
BsmtQual 3.015764222069911
BsmtCond 3.0843043180260454
BsmtExposure 3.015764222069911
BsmtFinType1 2.878684030157642
BsmtFinSF1 0.06854009595613435
BsmtFinType2 2.878684030157642
BsmtFinSF2 0.06854009595613435
BsmtUnfSF 0.06854009595613435
TotalBsmtSF 0.06854009595613435
BsmtFullBath 0.1370801919122687
BsmtHalfBath 0.1370801919122687
KitchenQual 0.06854009595613435
Functional 0.1370801919122687
FireplaceQu 50.03427004797807
GarageType 5.20904729266621
GarageYrBlt 5.346127484578479
GarageFinish 5.346127484578479
GarageCars 0.06854009595613435
GarageArea 0.06854009595613435
GarageQual 5.346127484578479
GarageCond 5.346127484578479
PoolQC 99.7943797121316
Fence 80.12337217272105
MiscFeature 96.50445510623715
SaleType 0.06854009595613435


In [7]:
from sklearn.impute import SimpleImputer

train.drop(['PoolQC','MiscFeature','Alley'],axis=1,inplace=True)
test.drop(['PoolQC','MiscFeature','Alley'],axis=1,inplace=True)

non_numeric_train_columns = train.select_dtypes(exclude=['number']).columns
numeric_train_columns     = train.select_dtypes(exclude=['object']).columns

non_numeric_test_columns = test.select_dtypes(exclude=['number']).columns
numeric_test_columns     = test.select_dtypes(exclude=['object']).columns

Numeric_Imputer=SimpleImputer(strategy="mean")
Object_Imputer=SimpleImputer(strategy="most_frequent")

for c in non_numeric_train_columns:
    train[c]=Object_Imputer.fit_transform(train[c].values.reshape(-1,1))

for c in non_numeric_test_columns:
    test[c]=Object_Imputer.fit_transform(test[c].values.reshape(-1,1))    

for c in numeric_train_columns:
    train[c]=Numeric_Imputer.fit_transform(train[c].values.reshape(-1,1))

for c in numeric_test_columns:
    test[c]=Numeric_Imputer.fit_transform(test[c].values.reshape(-1,1))    
     

In [8]:
print("train_shape: "+str(train.shape))
print("test_shape: "+str(test.shape))

train_shape: (1460, 78)
test_shape: (1459, 77)


In [9]:
Object_columns_train = train.select_dtypes(exclude=['number']).columns
Object_columns_test  = test.select_dtypes(exclude=['number']).columns

Ohot=OneHotEncoder(sparse_output=False)

train_encoded=Ohot.fit_transform(train[Object_columns_train])
test_encoded=Ohot.transform(test[Object_columns_test])

ohot_train_df=pd.DataFrame(train_encoded,columns=Ohot.get_feature_names_out(Object_columns_train))
ohot_test_df=pd.DataFrame(test_encoded,columns=Ohot.get_feature_names_out(Object_columns_test))

train=pd.concat([train,ohot_train_df],axis=1)
test=pd.concat([test,ohot_test_df],axis=1)

train.drop(Object_columns_train,axis=1,inplace=True)
test.drop(Object_columns_test,axis=1,inplace=True)


In [10]:
print("train_shape: "+str(train.shape))
print("test_shape: "+str(test.shape))

train_shape: (1460, 281)
test_shape: (1459, 280)


In [11]:
train.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1.0,60.0,65.0,8450.0,7.0,5.0,2003.0,2003.0,196.0,706.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2.0,20.0,80.0,9600.0,6.0,8.0,1976.0,1976.0,0.0,978.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3.0,60.0,68.0,11250.0,7.0,5.0,2001.0,2002.0,162.0,486.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4.0,70.0,60.0,9550.0,7.0,5.0,1915.0,1970.0,0.0,216.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,5.0,60.0,84.0,14260.0,8.0,5.0,2000.0,2000.0,350.0,655.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [12]:
test.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461.0,20.0,80.0,11622.0,5.0,6.0,1961.0,1961.0,0.0,468.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1462.0,20.0,81.0,14267.0,6.0,6.0,1958.0,1958.0,108.0,923.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1463.0,60.0,74.0,13830.0,5.0,5.0,1997.0,1998.0,0.0,791.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1464.0,60.0,78.0,9978.0,6.0,6.0,1998.0,1998.0,20.0,602.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1465.0,120.0,43.0,5005.0,8.0,5.0,1992.0,1992.0,0.0,263.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## normalize

In [13]:
scaler=MaxAbsScaler()

X=train.drop(columns='SalePrice')
Y=train['SalePrice']


xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.2)


xtrain=scaler.fit_transform(xtrain)
xtest=scaler.transform(xtest)


## validation

In [14]:



model=GradientBoostingRegressor(
    loss='squared_error',n_estimators=500,max_depth=128,min_samples_split=2,min_samples_leaf=2,random_state=40
    )





In [15]:

def calculate_r2(x_train, y_train, x_test, y_test, model):
    # Fit the model
    model.fit(x_train, y_train)
    
    # Predictions
    predict_trainYn = model.predict(x_train)
    predict_testYn = model.predict(x_test)
    
    # Calculate R2 scores
    r_train = r2_score(y_train, predict_trainYn)
    r_test = r2_score(y_test, predict_testYn)
    
    return r_train, r_test


# Define the number of threads
num_threads = 2  # depend on cpu cores

# Create ThreadPoolExecutor
with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Submit the tasks to the ThreadPoolExecutor
    future_to_r2 = {executor.submit(calculate_r2, xtrain, ytrain, xtest, ytest, model): model for _ in range(num_threads)}

    # Get the results
    for future in concurrent.futures.as_completed(future_to_r2):
        model = future_to_r2[future]
        try:
            r_train, r_test = future.result()
            print(f"r2_score for train: "+str(r_train))
            print(f"r2_score for test data: "+str(r_test)) 
        except Exception as e:
            print(f"An error occurred: {e}")



An error occurred: 'NoneType' object has no attribute '_validate_X_predict'


r2_score for train: 1.0
r2_score for test data: 0.8737665080918413


## predict

In [16]:


def predict_and_evaluate(X, Y, test, model):
    # Fit the model
    model.fit(X, Y)
    
    # Predictions
    predict_trainY = model.predict(X)
    predict_testY = model.predict(test)
    
    # Calculate R2 score and RMSE
    r2 = r2_score(Y, predict_trainY)
    rmse = sqrt(mean_squared_error(Y, predict_trainY))
    
    return predict_testY, r2, rmse


# Define the number of threads
num_threads = 2  # depend on cpu cores

# Create ThreadPoolExecutor
with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Submit the task to the ThreadPoolExecutor
    future_to_result = {executor.submit(predict_and_evaluate, X, Y, test, model): model for _ in range(num_threads)}

    # Get the results
    for future in concurrent.futures.as_completed(future_to_result):
        model = future_to_result[future]
        try:
            predict_testY, r2, rmse = future.result()
            print(f"R2 score: {r2}")
            print(f"RMSE: {rmse}")
            
            # Write results to DataFrame
            result_dataFrame = pd.DataFrame({
                'Id': test['Id'],
                'SalePrice': predict_testY,
            })
            
            # Write DataFrame to CSV file
            result_dataFrame.to_csv('result.csv', index=False)
            
        except Exception as e:
            print(f"An error occurred: {e}")


An error occurred: 'NoneType' object has no attribute '_validate_X_predict'
R2 score: 1.0
RMSE: 8.622383788232688e-05
