In [96]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the data
data = pd.read_csv('train.csv', usecols=[
    'LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','FullBath','BedroomAbvGr','TotRmsAbvGrd','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','SalePrice'
])

In [97]:
data.shape

(1460, 20)

In [98]:
# Count 0 values in each column
zero_counts_per_column_train = data.apply(lambda col: (col == 0).sum())
print(zero_counts_per_column_train)

LotArea             0
MasVnrArea        861
BsmtFinSF1        467
BsmtFinSF2       1293
BsmtUnfSF         118
TotalBsmtSF        37
1stFlrSF            0
2ndFlrSF          829
LowQualFinSF     1434
GrLivArea           0
FullBath            9
BedroomAbvGr        6
TotRmsAbvGrd        0
GarageArea         81
WoodDeckSF        761
OpenPorchSF       656
EnclosedPorch    1252
3SsnPorch        1436
ScreenPorch      1344
SalePrice           0
dtype: int64


In [99]:
X = data.drop('SalePrice', axis=1)
y = data['SalePrice']

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [101]:
data.fillna(0, inplace=True)

In [102]:
data.isnull().sum()

LotArea          0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
FullBath         0
BedroomAbvGr     0
TotRmsAbvGrd     0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
SalePrice        0
dtype: int64

In [103]:
data = data.drop_duplicates()

In [104]:
# Handle missing values more effectively
data.fillna(data.mean(), inplace=True)

In [105]:
# Create a pipeline with an imputer and a linear regression model
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('model', LinearRegression())
])

In [106]:
# Fit the model
pipeline.fit(X_train, y_train)

In [107]:
y_pred = pipeline.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f'R-squared score: {r2:.2f}')

R-squared score: 0.78


In [108]:
import joblib

In [109]:
joblib.dump(pipeline, 'practice2.pkl')

['practice2.pkl']

In [110]:
test_data = pd.read_csv('test.csv')

In [111]:
test_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [112]:
# Select features and target
test_data = test_data[['LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','FullBath','BedroomAbvGr','TotRmsAbvGrd','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch']]

In [113]:
test_data

Unnamed: 0,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,FullBath,BedroomAbvGr,TotRmsAbvGrd,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch
0,11622,0.0,468.0,144.0,270.0,882.0,896,0,0,896,1,2,5,730.0,140,0,0,0,120
1,14267,108.0,923.0,0.0,406.0,1329.0,1329,0,0,1329,1,3,6,312.0,393,36,0,0,0
2,13830,0.0,791.0,0.0,137.0,928.0,928,701,0,1629,2,3,6,482.0,212,34,0,0,0
3,9978,20.0,602.0,0.0,324.0,926.0,926,678,0,1604,2,3,7,470.0,360,36,0,0,0
4,5005,0.0,263.0,0.0,1017.0,1280.0,1280,0,0,1280,2,2,5,506.0,0,82,0,0,144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1936,0.0,0.0,0.0,546.0,546.0,546,546,0,1092,1,3,5,0.0,0,0,0,0,0
1455,1894,0.0,252.0,0.0,294.0,546.0,546,546,0,1092,1,3,6,286.0,0,24,0,0,0
1456,20000,0.0,1224.0,0.0,0.0,1224.0,1224,0,0,1224,1,4,7,576.0,474,0,0,0,0
1457,10441,0.0,337.0,0.0,575.0,912.0,970,0,0,970,1,3,6,0.0,80,32,0,0,0


In [114]:
# Count 0 values in each column
zero_counts_per_column = test_data.apply(lambda col: (col == 0).sum())
print(zero_counts_per_column)

LotArea             0
MasVnrArea        877
BsmtFinSF1        462
BsmtFinSF2       1278
BsmtUnfSF         123
TotalBsmtSF        41
1stFlrSF            0
2ndFlrSF          839
LowQualFinSF     1445
GrLivArea           0
FullBath            3
BedroomAbvGr        2
TotRmsAbvGrd        0
GarageArea         76
WoodDeckSF        762
OpenPorchSF       642
EnclosedPorch    1208
3SsnPorch        1446
ScreenPorch      1319
dtype: int64


In [117]:
test_data.isnull().sum()

LotArea          0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
FullBath         0
BedroomAbvGr     0
TotRmsAbvGrd     0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
dtype: int64

In [118]:
# Handle missing values more effectively
test_data.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.fillna(0, inplace=True)


In [119]:
test_data.duplicated().sum()

0

In [120]:
# Load the saved model
best_model = joblib.load('practice2.pkl')

In [121]:
# Make predictions on the new test data
new_predictions = best_model.predict(test_data)

In [122]:
# Print the predictions
print(new_predictions)

[163495.78936819 169642.02237315 195063.33628145 ... 172585.38365167
  90588.12066335 247613.15504939]


In [123]:
new_predictions

array([163495.78936819, 169642.02237315, 195063.33628145, ...,
       172585.38365167,  90588.12066335, 247613.15504939])

In [124]:
# Create a DataFrame with the original features and the predicted prices
predictions_df = test_data[['LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','FullBath','BedroomAbvGr','TotRmsAbvGrd','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch']].copy()
predictions_df['predicted_price'] = new_predictions

In [126]:
# Save the DataFrame to a CSV file
predictions_df.to_csv('predicted_price_task_1.csv', index=False)