In [42]:
import pandas as pd
import numpy as np
import seaborn as sns


In [43]:
# https://www.kaggle.com/competitions/home-data-for-ml-course/data
# Read the train and test CSV files; both paths are relative to the working directory.
train_df = pd.read_csv(filepath_or_buffer='train.csv')
test_df = pd.read_csv(filepath_or_buffer='test.csv')

In [None]:
def showCol():
    """Print the column labels of train_df, a 10-dash rule, then those of test_df."""

    def dump(title, frame):
        # One titled section: the heading line followed by the frame's column index.
        print(title)
        print(frame.keys())

    dump("train df:", train_df)
    print('-' * 10)
    dump('test df:', test_df)

def _count_dtype_families(frame):
    """Count the columns of ``frame`` per dtype family.

    Returns a dict with the keys 'int64', 'float64' and 'object' mapping to the
    number of integer, floating-point and object columns respectively.
    Columns of any other dtype (bool, datetime, category, ...) are not counted.
    """
    # pandas' own predicates are used instead of np.issubdtype because the
    # latter raises TypeError on pandas extension dtypes such as 'category'.
    dtype_checks = {
        'int64': pd.api.types.is_integer_dtype,
        'float64': pd.api.types.is_float_dtype,
        'object': pd.api.types.is_object_dtype,
    }
    type_counts = dict.fromkeys(dtype_checks, 0)
    for dtype in frame.dtypes:
        for family, matches in dtype_checks.items():
            if matches(dtype):
                type_counts[family] += 1
                break  # a column belongs to at most one family
    return type_counts


def dataType(train=None, test=None):
    """Print a dtype breakdown (int / float / object column counts) for two frames.

    Parameters
    ----------
    train : pandas.DataFrame, optional
        Frame reported under the "train df:" heading. Defaults to the
        notebook-level ``train_df``.
    test : pandas.DataFrame, optional
        Frame reported under the "test df:" heading. Defaults to the
        notebook-level ``test_df``.

    The two sections are separated by a rule of 30 dashes; each section ends
    with the total number of counted columns.
    """
    sections = (
        ("train df:", train_df if train is None else train),
        ("test df:", test_df if test is None else test),
    )
    for position, (heading, frame) in enumerate(sections):
        if position:
            print('-' * 30)
        print(heading)
        type_counts = _count_dtype_families(frame)
        for key, val in type_counts.items():
            print(f'{key} : {val}')
        print('Total:', sum(type_counts.values()))


# Column listing is disabled; uncomment the next line to print it.
# showCol()
# Print the dtype breakdown of the freshly loaded train and test frames.
dataType()

In [None]:

# Display summary statistics for the training frame.
train_df.describe()

In [None]:
# Display summary statistics for the test frame.
test_df.describe()

In [None]:
def checkNA():
    """Print, for train_df and then test_df, how many columns contain nulls and which ones."""

    def report(label, frame):
        # Boolean mask over the columns: True where the column has at least one null.
        null_mask = frame.isnull().any()
        null_columns = frame.columns[null_mask].tolist()
        print(label, len(null_columns))
        print(null_columns)

    report('train df :', train_df)
    print()
    report('test :', test_df)
# Report the columns with missing values before any imputation is applied.
checkNA()

In [49]:
from sklearn.preprocessing import LabelEncoder


def _median_label(values):
    """Return the category found at the median label code of ``values``.

    The non-null entries are label-encoded and the class at the
    integer-truncated median code is returned. This is the fill rule this
    notebook uses for categorical columns.
    NOTE(review): the result depends on the sort order of the category names;
    the most frequent category may be a more meaningful fill value.
    """
    observed = values.dropna()
    probe = LabelEncoder().fit(observed)
    median_code = pd.Series(probe.transform(observed)).median()
    return probe.classes_[int(median_code)]


# Impute and label-encode both frames in place. Fill values and encoders are
# learned from train_df and then applied to test_df, so a given category maps
# to the same integer in both frames (fitting a separate encoder per frame
# gives different codes whenever the two frames contain different categories).
for col in train_df.columns:
    in_test = col in test_df.columns
    if train_df[col].dtype == 'object':  # categorical column
        fill_value = _median_label(train_df[col])
        train_df[col] = train_df[col].fillna(fill_value)
        encoder = LabelEncoder().fit(train_df[col])
        train_df[col] = encoder.transform(train_df[col])
        if in_test:
            # LabelEncoder.transform raises on labels it was not fitted on, so
            # missing and never-seen test categories get the train fill value.
            known = test_df[col].isin(encoder.classes_)
            test_df[col] = encoder.transform(test_df[col].where(known, fill_value))
    else:  # numeric column
        has_gaps = train_df[col].isnull().any() or (in_test and test_df[col].isnull().any())
        if has_gaps:
            # The train median is used for both frames.
            fill_value = train_df[col].median()
            train_df[col] = train_df[col].fillna(fill_value)
            if in_test:
                test_df[col] = test_df[col].fillna(fill_value)

# Columns that exist only in test_df have no train statistics to borrow, so
# they are imputed and encoded from their own values.
for col in test_df.columns.difference(train_df.columns):
    if test_df[col].dtype == 'object':
        if test_df[col].isnull().any():
            test_df[col] = test_df[col].fillna(_median_label(test_df[col]))
        test_df[col] = LabelEncoder().fit_transform(test_df[col])
    elif test_df[col].isnull().any():
        test_df[col] = test_df[col].fillna(test_df[col].median())

In [None]:
# Re-check after imputation; both column lists are expected to be empty now.
checkNA()

In [None]:
# Re-check dtypes after encoding; the 'object' count is expected to be 0 now.
dataType()

In [52]:
# Work on copies so the slicing in later cells does not modify train_df / test_df.
x_train = train_df.copy(deep=True)
x_test = test_df.copy(deep=True)

In [53]:
# Target vector: the sale price column.
y_train = x_train.loc[:, 'SalePrice']
# Feature matrix: every column except the row identifier and the target.
x_train = x_train.drop(columns=['Id', 'SalePrice'])

In [54]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation. The fixed seed makes the split, and
# every score computed from it, repeatable between runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42
)

In [55]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

In [None]:
# random forest regressor
# Fixed seed so the forest, its validation score and the predictions made
# from it later in the notebook are reproducible between runs.
rfr = RandomForestRegressor(random_state=42)
rfr.fit(x_train, y_train)
prediction_rfr = rfr.predict(x_valid)
r2_score_rfr = r2_score(y_valid, prediction_rfr)
print(r2_score_rfr)

In [None]:
# Linear regression with default settings, scored with R^2 on the validation split.
lr = LinearRegression().fit(x_train, y_train)
prediction_lr = lr.predict(x_valid)
r2_score_lr = r2_score(y_true=y_valid, y_pred=prediction_lr)
print(r2_score_lr)

In [None]:
# Ridge regression with default settings, scored with R^2 on the validation split.
rid = Ridge().fit(x_train, y_train)
prediction_rid = rid.predict(x_valid)
r2_score_rid = r2_score(y_true=y_valid, y_pred=prediction_rid)
print(r2_score_rid)

In [None]:
# Lasso regression with default settings, scored with R^2 on the validation split.
prediction_lasso = (
    Lasso()
    .fit(x_train, y_train)
    .predict(x_valid)
)
r2_score_lasso = r2_score(y_true=y_valid, y_pred=prediction_lasso)
print(r2_score_lasso)

In [None]:
# k-nearest-neighbours regression with default settings, scored with R^2 on the validation split.
prediction_kn = (
    KNeighborsRegressor()
    .fit(x_train, y_train)
    .predict(x_valid)
)
r2_score_kn = r2_score(y_true=y_valid, y_pred=prediction_kn)
print(r2_score_kn)

In [None]:
# Decision tree regressor; the fixed seed makes the fitted tree and its
# validation score reproducible between runs.
prediction_dt = DecisionTreeRegressor(random_state=42).fit(x_train, y_train).predict(x_valid)
r2_score_dt = r2_score(y_valid, prediction_dt)
print(r2_score_dt)

In [None]:
# Support vector regression with default settings, scored with R^2 on the validation split.
# NOTE(review): the features are not scaled before fitting; confirm that is intended for SVR.
prediction_svr = (
    SVR()
    .fit(x_train, y_train)
    .predict(x_valid)
)
r2_score_svr = r2_score(y_true=y_valid, y_pred=prediction_svr)
print(r2_score_svr)

In [None]:
from pandas import DataFrame


# Rank the models by their validation R^2 score, best first.
models_performance = DataFrame(
    [
        ('RandomForestRegressor', r2_score_rfr),
        ('LinearRegression', r2_score_lr),
        ('Ridge', r2_score_rid),
        ('Lasso', r2_score_lasso),
        ('KNeighborsRegressor', r2_score_kn),
        ('DecisionTreeRegressor', r2_score_dt),
        ('SVR', r2_score_svr),
    ],
    columns=['Model', 'R2 Score'],
).sort_values(by='R2 Score', ascending=False)
print(models_performance)

In [64]:
# Predict with the random forest on the test features; 'Id' is dropped
# because it was also removed from the training features.
final_prediction = rfr.predict(x_test.drop(columns=['Id']))

In [67]:
# Two-column submission table: the test row Ids next to their predicted sale prices.
submission_df = x_test[['Id']].assign(SalePrice=final_prediction)

In [68]:
# Write the submission to the working directory without the DataFrame index column.
submission_df.to_csv(path_or_buf='submission.csv', index=False)