In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Importing Training Data - Handling Missing Values

In [2]:
df_train = pd.read_csv('train.csv')

In [3]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
df_train.shape

(1460, 81)

In [5]:
Total_train = df_train.isnull().sum().sort_values(ascending= False)
missing_train_data = pd.concat([Total_train], axis = 1, keys = ['Total'])
missing_train_data.head(20)

Unnamed: 0,Total
PoolQC,1453
MiscFeature,1406
Alley,1369
Fence,1179
FireplaceQu,690
LotFrontage,259
GarageCond,81
GarageType,81
GarageYrBlt,81
GarageFinish,81


In [6]:
df_train.drop((missing_train_data[missing_train_data['Total'] > 100]).index, axis = 1, inplace = True)
df_train.isnull().sum()

Id               0
MSSubClass       0
MSZoning         0
LotArea          0
Street           0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 75, dtype: int64

In [7]:
df_train.drop(['Id'], axis = 1, inplace = True)

In [8]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 74 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotArea        1460 non-null   int64  
 3   Street         1460 non-null   object 
 4   LotShape       1460 non-null   object 
 5   LandContour    1460 non-null   object 
 6   Utilities      1460 non-null   object 
 7   LotConfig      1460 non-null   object 
 8   LandSlope      1460 non-null   object 
 9   Neighborhood   1460 non-null   object 
 10  Condition1     1460 non-null   object 
 11  Condition2     1460 non-null   object 
 12  BldgType       1460 non-null   object 
 13  HouseStyle     1460 non-null   object 
 14  OverallQual    1460 non-null   int64  
 15  OverallCond    1460 non-null   int64  
 16  YearBuilt      1460 non-null   int64  
 17  YearRemodAdd   1460 non-null   int64  
 18  RoofStyl

In [9]:
numeric_missed_train = ['MasVnrArea',
                  'GarageYrBlt']
for feature in numeric_missed_train:
    df_train[feature] = df_train[feature].fillna(0)

In [10]:
categorical_missed_train = ['MasVnrType',
                      'BsmtQual',
                      'BsmtCond',
                      'BsmtExposure',
                      'BsmtFinType1',
                      'BsmtFinType2',
                      'Electrical',
                      'GarageType',
                      'GarageFinish',
                      'GarageQual',
                      'GarageCond']
for feature in categorical_missed_train:
    df_train[feature] = df_train[feature].fillna(df_train[feature].mode()[0])

In [11]:
df_train.shape

(1460, 74)

In [12]:
df_train.isnull().sum().max()

0

# Importing Testing Data - Handling Missing Values

In [13]:
df_test = pd.read_csv('test.csv')

In [14]:
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [15]:
df_test.shape

(1459, 80)

In [16]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [17]:
Total_test = df_test.isnull().sum().sort_values(ascending= False)
missing_data_test = pd.concat([Total_test], axis = 1, keys = ['Total'])
missing_data_test.head(30)

Unnamed: 0,Total
PoolQC,1456
MiscFeature,1408
Alley,1352
Fence,1169
FireplaceQu,730
LotFrontage,227
GarageCond,78
GarageQual,78
GarageYrBlt,78
GarageFinish,78


In [18]:
df_test.drop((missing_data_test[missing_data_test['Total'] > 100]).index, axis = 1, inplace = True)
df_test.isnull().sum().max()

78

In [19]:
df_test.drop(['Id'], axis = 1, inplace = True)

In [20]:
Total_test = df_test.isnull().sum().sort_values(ascending= False)
missing_data_test = pd.concat([Total_test], axis = 1, keys = ['Total'])
missing_data_test.head(30)

Unnamed: 0,Total
GarageQual,78
GarageCond,78
GarageYrBlt,78
GarageFinish,78
GarageType,76
BsmtCond,45
BsmtQual,44
BsmtExposure,44
BsmtFinType1,42
BsmtFinType2,42


In [21]:
numeric_missed_test = ['MasVnrArea',
                       'BsmtHalfBath',
                       'BsmtFullBath',
                       'GarageCars',
                       'GarageArea',
                       'BsmtFinSF2',
                       'BsmtFinSF1']

for feature in numeric_missed_test:
    df_test[feature] = df_test[feature].fillna(0)

In [22]:
numeric_mean_test = ['GarageYrBlt',
                    'TotalBsmtSF',
                    'BsmtUnfSF']

for feature in numeric_mean_test:
    df_test[feature] = df_test[feature].fillna(df_test[feature].mean())

In [23]:
categorical_missed_test = ['GarageFinish',
                           'GarageCond',
                           'GarageQual',
                           'GarageType',
                           'BsmtCond',
                           'BsmtQual',
                           'BsmtExposure',
                           'BsmtFinType1',
                           'BsmtFinType2',
                           'MasVnrType',
                           'MSZoning',
                           'Utilities',
                           'Functional',
                           'SaleType',
                           'KitchenQual',
                           'Exterior1st',
                           'Exterior2nd']
                           
    
for feature in categorical_missed_test:
    df_test[feature] = df_test[feature].fillna(df_test[feature].mode()[0])

In [24]:
Total_test = df_test.isnull().sum().sort_values(ascending= False)
missing_data_test = pd.concat([Total_test], axis = 1, keys = ['Total'])
missing_data_test.head(30)

Unnamed: 0,Total
SaleCondition,0
TotalBsmtSF,0
RoofMatl,0
Exterior1st,0
Exterior2nd,0
MasVnrType,0
MasVnrArea,0
ExterQual,0
ExterCond,0
Foundation,0


In [25]:
df_train.shape

(1460, 74)

In [26]:
df_test.shape

(1459, 73)

In [27]:
df_all = pd.concat([df_train, df_test], axis = 0)

In [28]:
df_all.shape

(2919, 74)

In [29]:
df_all

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,2,2008,WD,Normal,208500.0
1,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,0,0,0,0,0,5,2007,WD,Normal,181500.0
2,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,9,2008,WD,Normal,223500.0
3,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,272,0,0,0,0,2,2006,WD,Abnorml,140000.0
4,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,0,0,0,0,0,12,2008,WD,Normal,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,RM,1936,Pave,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,...,0,0,0,0,0,6,2006,WD,Normal,
1455,160,RM,1894,Pave,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,...,0,0,0,0,0,4,2006,WD,Abnorml,
1456,20,RL,20000,Pave,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,...,0,0,0,0,0,9,2006,WD,Abnorml,
1457,85,RL,10441,Pave,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,...,0,0,0,0,700,7,2006,WD,Normal,


In [30]:
df_all = pd.get_dummies(df_all)

In [31]:
df_all.shape

(2919, 270)

In [32]:
df_Train = df_all.iloc[:1460, :]
df_Test = df_all.iloc[1460:, :]

In [33]:
df_Train.shape

(1460, 270)

In [34]:
df_Test.shape

(1459, 270)

In [35]:
df_Test.drop(['SalePrice'], axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [36]:
df_Test.shape

(1459, 269)

In [37]:
X_train = df_Train.drop(['SalePrice'], axis = 1)
y_train = df_Train['SalePrice']

In [38]:
import xgboost
classifier = xgboost.XGBRegressor()
classifier.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [39]:
y_pred = classifier.predict(df_Test)

In [40]:
y_pred

array([118801.09 , 162736.73 , 174367.33 , ..., 155173.5  , 125623.734,
       234818.39 ], dtype=float32)

In [41]:
pred = pd.DataFrame(y_pred)
sub_df = pd.read_csv('sample_submission.csv')
datasets = pd.concat([sub_df['Id'], pred], axis = 1)
datasets.columns = ['Id', 'SalePrice']
datasets.to_csv('submission.csv', index = False)