In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures,OneHotEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import roc_curve, roc_auc_score, f1_score

In [2]:
# Load the training data from its hard-coded CSV path.
train_df = pd.read_csv('/content/train.csv')

# Exploratory data analysis: dtype of every column that is not stored as `object`.
exploring_data = train_df.dtypes.loc[lambda dtypes: dtypes != 'object']

def scatterplots(train_df, target='SalePrice'):
    """Draw one scatter plot per numeric feature against the target column.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame holding the feature columns and the ``target`` column.
    target : str, default 'SalePrice'
        Name of the column plotted on the y-axis.

    Returns
    -------
    None
        Every figure is rendered with ``plt.show()``.
    """
    # Select by numeric dtype rather than "anything that is not object":
    # bool, category and datetime columns are not object either, but they do
    # not belong on a numeric x-axis.
    numeric_columns = train_df.select_dtypes(include='number').columns
    for column in numeric_columns:
        if column == target:
            # The target plotted against itself is just a diagonal line.
            continue
        sns.scatterplot(x=column, y=target, data=train_df)
        plt.title(f'Scatter plot of {column} vs {target}')
        plt.show()

# Inspect candidate outliers.
# Only the last expression of a cell is rendered, so bare `query` calls in the
# middle of a cell display nothing. Report the matching Ids explicitly instead.
outlier_conditions = [
    'OverallQual==10',
    'YearBuilt<1900',
    'LowQualFinSF>550',
    'GrLivArea>4000',
    'BsmtFullBath>2.5',
    'BsmtHalfBath>1.75',
    'BedroomAbvGr>7',
    'TotRmsAbvGrd==14',
    'Fireplaces==3',
    'GarageArea>1200',
    'WoodDeckSF>650',
    'OpenPorchSF>400',
    'EnclosedPorch>400',
    'MiscVal>3000',
    'YrSold==2007.0 & SalePrice>600000',
]
for condition in outlier_conditions:
    matching_ids = train_df.query(condition)['Id'].tolist()
    print(f'{condition}: {len(matching_ids)} row(s), Ids {matching_ids}')

# Hand-picked Ids to exclude from the training data.
# NOTE(review): presumably collected from the conditions above — confirm.
# Several Ids appear more than once; that is harmless for `isin`.
values = [106, 185, 304, 583, 630, 716, 747, 809, 945, 991, 1132, 1137, 1292, 1349, 1416, 88, 250, 375, 378, 398, 676, 58, 70, 115, 297, 691, 808, 1169, 1289, 322, 1298, 224, 278, 477, 581, 678, 1267, 106, 185, 304, 583, 630, 716, 747, 809, 945, 991, 1132, 1137, 1292, 1349, 1416, 185, 523, 619, 1182, 1298, 738, 597, 954, 635, 166, 309, 605, 642, 1298, 581, 825, 1061, 1190, 1298, 55, 335, 1068, 1313, 1459, 495, 523, 583, 854, 1328, 197, 346, 705, 1230, 691, 1182]
# Keep only the rows whose Id is not in the exclusion list.
train_df = train_df[~train_df['Id'].isin(values)]

In [3]:
# Clean dataset
# Strip stray whitespace from the column labels. `rename` returns a new frame,
# so the frame this cell received is not mutated in place.
train_df = train_df.rename(columns=str.strip)
# errors='ignore' keeps this cell re-runnable: on a second run 'Id' and
# 'KitchenAbvGr' are already gone and a plain drop would raise KeyError.
train_df = train_df.drop(columns=['Id', 'KitchenAbvGr'], errors='ignore')

# Separate numeric and object features (the target column is not removed here).
numeric_features = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
object_features = train_df.select_dtypes(include=['object']).columns.tolist()

In [4]:
# Feature matrix and target vector
X = train_df.drop(columns=['SalePrice'])
y = train_df['SalePrice']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Column groups, chosen by dtype
numerical_columns = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_columns = list(X.select_dtypes(include=['object']).columns)

# Numeric features: mean-impute missing values, then min-max scale
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler()),
    ]
)

# Object/string features: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Column transformer; columns in neither group are passed through untouched
trf1 = make_column_transformer(
    (numeric_transformer, numerical_columns),
    (categorical_transformer, categorical_columns),
    remainder='passthrough',
)

# Full pipeline: preprocessing followed by ordinary least squares
pipe = make_pipeline(trf1, LinearRegression())

# Fit on the training split, predict the held-out split and score with R^2
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
result = r2_score(y_test, pred)
result

0.8358551343939287