In [None]:
import seaborn as sns
import pandas as pd

In [None]:
# Read the labelled training file and the unlabelled hold-out file from the working directory.
train_df, test_df = (pd.read_csv(path) for path in ('./train.csv', './test.csv'))

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# Peek at the first three training rows.
train_df.head(3)

In [None]:
# (rows, columns) of the hold-out frame.
test_df.shape

In [None]:
# Peek at the first three hold-out rows.
test_df.head(3)

In [None]:
# Work on copies so the frames read from disk stay untouched.
train = train_df.copy()
test = test_df.copy().drop(columns=["Id"])

# Drop the "Id" column from the training features as well.
X = train.drop(columns=["Id"])

In [None]:
# Separate the target from the features.
# The target is read from `train` and removed from X with errors="ignore"
# (instead of X.pop) so this cell can be re-run safely: a second
# X.pop("SalePrice") would raise KeyError because X was mutated in place.
y = train["SalePrice"].copy()
X = X.drop(columns="SalePrice", errors="ignore")
print(X.shape)
print(y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Column labels grouped by dtype: numeric vs. everything else.
X_num = X.select_dtypes(include="number").columns
X_cat = X.select_dtypes(exclude="number").columns

# Numeric columns: fill missing values with the column mean.
num_transformer = make_pipeline(SimpleImputer(strategy="mean"))

# Non-numeric columns: fill missing values with the "N_A" placeholder,
# then one-hot encode (first level dropped, unseen levels ignored).
cat_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"),
)

# Route each column group through its own transformer.
# Each entry is (name, pipeline, column labels).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, X_num),
        ("cat", cat_transformer, X_cat),
    ]
)

In [None]:
# Display the (unfitted) ColumnTransformer.
preprocessor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Baseline estimator. The forest is seeded (same seed as the train/test split)
# so that the score below is reproducible across kernel restarts.
model = RandomForestRegressor(random_state=42)

In [None]:
# Chain preprocessing and the estimator so both are fitted together.
pipeline = make_pipeline(preprocessor,model)

In [None]:
# Fit preprocessing + model on the training split only.
pipeline.fit(X_train, y_train)

In [None]:
# The pipeline was fitted on the training split, so predict on the held-out
# split to check how well the model performs on rows it has not seen.
predition_test = pipeline.predict(X_test)
predition_test

In [None]:
from sklearn.metrics import r2_score

In [None]:
# R^2 of the baseline pipeline on the held-out split.
performances = r2_score(y_test, predition_test)
performances

In [None]:
## neg_mean_squared_log_error

`1st`

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold

# Column labels grouped by dtype: numeric vs. everything else.
X_num = X.select_dtypes(include="number").columns
X_cat = X.select_dtypes(exclude="number").columns

# Numeric columns: mean-impute, then min-max scale.
num_transformer = make_pipeline(
    SimpleImputer(strategy="mean"),
    MinMaxScaler(),
)

# Non-numeric columns: fill missing values with the "N_A" placeholder,
# then one-hot encode (first level dropped, unseen levels ignored).
cat_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"),
)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, X_num),
        ("cat", cat_transformer, X_cat),
    ]
)

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_selection import SelectKBest, f_regression

# Decision tree behind three feature-selection stages:
#   1. VarianceThreshold  - default threshold, removes constant features
#   2. SelectKBest        - keeps the 40 best features by f_regression score
#   3. SelectFromModel    - keeps features a helper tree considers important
# Both trees are seeded so the selected features and the predictions are
# reproducible across runs.
tree = make_pipeline(
    preprocessor,
    VarianceThreshold(),
    SelectKBest(score_func=f_regression, k=40),
    SelectFromModel(DecisionTreeRegressor(random_state=42)),
    DecisionTreeRegressor(random_state=42),
)

tree.fit(X_train, y_train)

# Predictions for the unlabelled hold-out frame (used for the submission file).
tree_pred = tree.predict(test)


In [None]:
# Score the decision-tree pipeline fitted for this experiment on the held-out split.
# The original cell called `pipeline.predict`, which re-scored the earlier
# random-forest baseline instead of `tree`.
prediction_test = tree.predict(X_test)
performance = r2_score(y_test, prediction_test)
performance

In [None]:
# New frame: the hold-out data plus the predicted sale price.
test_1 = test_df.assign(SalePrice=tree_pred)

In [None]:
# Keep only the two columns that go into the submission file.
submission = test_1.loc[:, ["Id", "SalePrice"]]
submission

In [None]:
# Write the submission without the pandas index column.
submission.to_csv("submission_1.csv", index=False)

`2nd`

In [None]:
# Rebuild features and target from a fresh copy of the training frame.
train = train_df.copy()
y = train["SalePrice"]
X = train.drop(columns=["Id", "SalePrice"])

# Hold out 20% of the rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Column labels grouped by dtype: numeric vs. everything else.
X_num = X.select_dtypes(include="number").columns
X_cat = X.select_dtypes(exclude="number").columns

# Numeric columns: mean-impute only (no scaling in this experiment).
num_transformer = make_pipeline(SimpleImputer(strategy="mean"))

# Non-numeric columns: constant-impute with "N_A", then one-hot encode.
cat_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"),
)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, X_num),
        ("cat", cat_transformer, X_cat),
    ]
)

# Decision tree behind SelectKBest (top 40 by f_regression) and a
# tree-based SelectFromModel. Both trees are seeded so the score is
# reproducible across runs.
tree = make_pipeline(
    preprocessor,
    SelectKBest(score_func=f_regression, k=40),
    SelectFromModel(DecisionTreeRegressor(random_state=42)),
    DecisionTreeRegressor(random_state=42),
)

tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)

# R^2 on the held-out split.
performance = r2_score(y_test, tree_pred)
performance

In [None]:
# Hold-out features: the raw hold-out frame without its "Id" column.
test = test_df.drop(columns=["Id"])

# Predict with the pipeline fitted for this experiment.
tree_pred = tree.predict(test)

# Attach the predictions to a new frame, keep the submission columns, and save.
test_2 = test_df.assign(SalePrice=tree_pred)
submission = test_2.loc[:, ["Id", "SalePrice"]]
submission.to_csv("submission_2.csv", index=False)

In [None]:
# Display the frame that was just written to submission_2.csv.
submission

`3rd`

In [None]:
# Rebuild features and target from a fresh copy of the training frame.
train = train_df.copy()
X = train.drop(columns="Id")
y = X.pop("SalePrice")

# Hold out 20% of the rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Column labels grouped by dtype: numeric vs. everything else.
X_num = X.select_dtypes(include="number").columns
X_cat = X.select_dtypes(exclude="number").columns

# Numeric columns: mean-impute, then min-max scale.
num_transformer = make_pipeline(
    SimpleImputer(strategy="mean"),
    MinMaxScaler(),
)

# Non-numeric columns: constant-impute with "N_A", then one-hot encode.
cat_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"),
)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, X_num),
        ("cat", cat_transformer, X_cat),
    ]
)

# Decision tree behind a tree-based SelectFromModel. Both trees are seeded
# so the selected features and the score are reproducible across runs.
tree = make_pipeline(
    preprocessor,
    SelectFromModel(DecisionTreeRegressor(random_state=42)),
    DecisionTreeRegressor(random_state=42),
)

tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)

# R^2 on the held-out split.
performance = r2_score(y_test, tree_pred)
performance

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE on both splits; a large train/test gap indicates overfitting.
# The root is taken explicitly because `squared=False` was deprecated in
# scikit-learn 1.4 and removed in 1.6; `mse ** 0.5` works on every version.
train_rmse = mean_squared_error(y_train, tree.predict(X_train)) ** 0.5
test_rmse = mean_squared_error(y_test, tree.predict(X_test)) ** 0.5
print(f"Train RMSE: {train_rmse:.2f}")
print(f"Test RMSE: {test_rmse:.2f}")

In [None]:
# Hold-out features: the raw hold-out frame without its "Id" column.
test = test_df.drop(columns=["Id"])

# Predict with the pipeline fitted for this experiment.
tree_pred = tree.predict(test)

# Attach the predictions to a new frame, keep the submission columns, and save.
test_3 = test_df.assign(SalePrice=tree_pred)
submission = test_3.loc[:, ["Id", "SalePrice"]]
submission.to_csv("submission_3.csv", index=False)

submission

`4th`

In [None]:
# Rebuild features and target from a fresh copy of the training frame.
train = train_df.copy()
X = train.drop(columns="Id")
y = X.pop("SalePrice")

# Hold out 20% of the rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Column labels grouped by dtype: numeric vs. everything else.
X_num = X.select_dtypes(include="number").columns
X_cat = X.select_dtypes(exclude="number").columns

# Numeric columns: mean-impute, then min-max scale.
num_transformer = make_pipeline(
    SimpleImputer(strategy="mean"),
    MinMaxScaler(),
)

# Non-numeric columns: constant-impute with "N_A", then one-hot encode.
cat_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"),
)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, X_num),
        ("cat", cat_transformer, X_cat),
    ]
)

# Decision tree on features selected by a random forest. Both estimators
# are seeded so the selected features and the scores are reproducible.
tree = make_pipeline(
    preprocessor,
    SelectFromModel(RandomForestRegressor(random_state=42)),
    DecisionTreeRegressor(random_state=42),
)

tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)

# R^2 score on the held-out split.
performance = r2_score(y_test, tree_pred)
print(performance)

# RMSE on both splits. The root is taken explicitly because `squared=False`
# was deprecated in scikit-learn 1.4 and removed in 1.6.
train_rmse = mean_squared_error(y_train, tree.predict(X_train)) ** 0.5
test_rmse = mean_squared_error(y_test, tree_pred) ** 0.5
print(f"Train RMSE: {train_rmse:.2f}")
print(f"Test RMSE: {test_rmse:.2f}")

In [None]:
# Hold-out features: the raw hold-out frame without its "Id" column.
test = test_df.drop(columns=["Id"])

# Predict with the pipeline fitted for this experiment.
tree_pred = tree.predict(test)

# Attach the predictions to a new frame, keep the submission columns, and save.
test_4 = test_df.assign(SalePrice=tree_pred)
submission = test_4.loc[:, ["Id", "SalePrice"]]
submission.to_csv("submission_4.csv", index=False)

submission

`5th`

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Rebuild features and target from a fresh copy of the training frame.
train = train_df.copy()
X = train.drop(columns="Id")
y = X.pop("SalePrice")

# Hold out 20% of the rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Column labels grouped by dtype: numeric vs. everything else.
X_num = X.select_dtypes(include="number").columns
X_cat = X.select_dtypes(exclude="number").columns

# Numeric columns: mean-impute, then min-max scale.
num_transformer = make_pipeline(
    SimpleImputer(strategy="mean"),
    MinMaxScaler(),
)

# Non-numeric columns: constant-impute with "N_A", then one-hot encode.
cat_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"),
)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, X_num),
        ("cat", cat_transformer, X_cat),
    ]
)

# Gradient boosting on features selected by a random forest.
# (The variable keeps the name `tree` because the following cells read it.)
# Both estimators are seeded so the scores are reproducible across runs.
tree = make_pipeline(
    preprocessor,
    SelectFromModel(RandomForestRegressor(random_state=42)),
    GradientBoostingRegressor(random_state=42),
)

tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)

# R^2 score on the held-out split.
performance = r2_score(y_test, tree_pred)
print(performance)

# RMSE on both splits. The root is taken explicitly because `squared=False`
# was deprecated in scikit-learn 1.4 and removed in 1.6.
train_rmse = mean_squared_error(y_train, tree.predict(X_train)) ** 0.5
test_rmse = mean_squared_error(y_test, tree_pred) ** 0.5
print(f"Train RMSE: {train_rmse:.2f}")
print(f"Test RMSE: {test_rmse:.2f}")

In [None]:
# Hold-out features: the raw hold-out frame without its "Id" column.
test = test_df.drop(columns=["Id"])

# Predict with the pipeline fitted for this experiment.
tree_pred = tree.predict(test)

# Attach the predictions to a new frame, keep the submission columns, and save.
test_5 = test_df.assign(SalePrice=tree_pred)
submission = test_5.loc[:, ["Id", "SalePrice"]]
submission.to_csv("submission_5.csv", index=False)

submission

`6th`

In [None]:
# Rebuild features and target from a fresh copy of the training frame.
train = train_df.copy()
X = train.drop(columns="Id")
y = X.pop("SalePrice")

# Hold out 20% of the rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Column labels grouped by dtype: numeric vs. everything else.
X_num = X.select_dtypes(include="number").columns
X_cat = X.select_dtypes(exclude="number").columns

# Numeric columns: mean-impute, then min-max scale.
num_transformer = make_pipeline(
    SimpleImputer(strategy="mean"),
    MinMaxScaler(),
)

# Non-numeric columns: constant-impute with "N_A", then one-hot encode.
cat_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"),
)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, X_num),
        ("cat", cat_transformer, X_cat),
    ]
)

# Random forest on features selected by gradient boosting.
# (The variable keeps the name `tree` because the following cells read it.)
# Both estimators are seeded so the scores are reproducible across runs.
tree = make_pipeline(
    preprocessor,
    SelectFromModel(GradientBoostingRegressor(random_state=42)),
    RandomForestRegressor(random_state=42),
)

tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)

# R^2 score on the held-out split.
performance = r2_score(y_test, tree_pred)
print(performance)

# RMSE on both splits. The root is taken explicitly because `squared=False`
# was deprecated in scikit-learn 1.4 and removed in 1.6.
train_rmse = mean_squared_error(y_train, tree.predict(X_train)) ** 0.5
test_rmse = mean_squared_error(y_test, tree_pred) ** 0.5
print(f"Train RMSE: {train_rmse:.2f}")
print(f"Test RMSE: {test_rmse:.2f}")

In [None]:
# Hold-out features: the raw hold-out frame without its "Id" column.
test = test_df.drop(columns=["Id"])

# Predict with the pipeline fitted for this experiment.
tree_pred = tree.predict(test)

# Attach the predictions to a new frame, keep the submission columns, and save.
test_6 = test_df.assign(SalePrice=tree_pred)
submission = test_6.loc[:, ["Id", "SalePrice"]]
submission.to_csv("submission_6.csv", index=False)

submission

`7th`

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Rebuild features and target from a fresh copy of the training frame.
train = train_df.copy()
X = train.drop(columns="Id")
y = X.pop("SalePrice")

# Hold out 20% of the rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Column labels grouped by dtype: numeric vs. everything else.
X_num = X.select_dtypes(include="number").columns
X_cat = X.select_dtypes(exclude="number").columns

# Numeric columns: mean-impute, then min-max scale.
num_transformer = make_pipeline(
    SimpleImputer(strategy="mean"),
    MinMaxScaler(),
)

# Non-numeric columns: constant-impute with "N_A", then one-hot encode.
# NOTE: the earlier `ordinal_cols` / `onehot_cols` lines were removed. They
# called `X_cat.columns`, but X_cat is already an Index (AttributeError),
# they named columns ("Cabin", "Sex", "Embarked") that this cell never uses,
# and their results were never read.
# TODO: if ordinal encoding is wanted, pick the ordered columns of this
# dataset and add an OrdinalEncoder branch to the ColumnTransformer.
cat_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"),
)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, X_num),
        ("cat", cat_transformer, X_cat),
    ]
)

# Random forest on features that survive a variance filter (constant
# features removed) and a gradient-boosting SelectFromModel.
# (The variable keeps the name `tree` for consistency with earlier cells.)
# Both estimators are seeded so the scores are reproducible across runs.
tree = make_pipeline(
    preprocessor,
    VarianceThreshold(),
    SelectFromModel(GradientBoostingRegressor(random_state=42)),
    RandomForestRegressor(random_state=42),
)

tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)

# R^2 score on the held-out split.
performance = r2_score(y_test, tree_pred)
print(performance)

# RMSE on both splits. The root is taken explicitly because `squared=False`
# was deprecated in scikit-learn 1.4 and removed in 1.6.
train_rmse = mean_squared_error(y_train, tree.predict(X_train)) ** 0.5
test_rmse = mean_squared_error(y_test, tree_pred) ** 0.5
print(f"Train RMSE: {train_rmse:.2f}")
print(f"Test RMSE: {test_rmse:.2f}")