# Ames Housing Data Analysis (Test)

# Problem Statement

The dataset contains information from the Ames Assessor’s Office used in computing assessed values for individual residential properties sold in Ames, IA, from 2006 to 2010.

Using this dataset, we build a regression model to predict the selling prices of houses. For each `Id` in the test set, we predict the value of the `SalePrice` variable.

In [1]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Set row display option
pd.set_option('display.max_rows' , 100)
# pd.set_option('display.max_columns' , 100)

# modeling imports
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV

# Adding the gridline in graph
sns.set_style('whitegrid')

# Load the Data

In [2]:
# Load the test-set CSV into a DataFrame.
test = pd.read_csv('../dataset/test.csv')

In [3]:
# Preview the first 10 rows of the raw test data
test.head(10)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD
5,333,923228370,160,RM,21.0,1890,Pave,,Reg,Lvl,...,0,0,0,,,,0,6,2010,WD
6,1327,902427150,20,RM,52.0,8516,Pave,,Reg,Lvl,...,0,0,0,,,,0,5,2008,WD
7,858,907202130,20,RL,,9286,Pave,,IR1,Lvl,...,0,0,0,,,,0,10,2009,WD
8,95,533208090,160,FV,39.0,3515,Pave,Pave,Reg,Lvl,...,0,0,0,,,,0,1,2010,WD
9,1568,914476010,20,RL,75.0,10125,Pave,,Reg,Lvl,...,0,0,0,,MnPrv,,0,2,2008,WD


In [4]:
# Summarise the column dtypes and non-null counts
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               879 non-null    int64  
 1   PID              879 non-null    int64  
 2   MS SubClass      879 non-null    int64  
 3   MS Zoning        879 non-null    object 
 4   Lot Frontage     719 non-null    float64
 5   Lot Area         879 non-null    int64  
 6   Street           879 non-null    object 
 7   Alley            58 non-null     object 
 8   Lot Shape        879 non-null    object 
 9   Land Contour     879 non-null    object 
 10  Utilities        879 non-null    object 
 11  Lot Config       879 non-null    object 
 12  Land Slope       879 non-null    object 
 13  Neighborhood     879 non-null    object 
 14  Condition 1      879 non-null    object 
 15  Condition 2      879 non-null    object 
 16  Bldg Type        879 non-null    object 
 17  House Style     

In [5]:
# Descriptive statistics (count, mean, std, quartiles) for the numeric columns
test.describe()

Unnamed: 0,Id,PID,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,...,Garage Area,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold
count,879.0,879.0,879.0,719.0,879.0,879.0,879.0,879.0,879.0,878.0,...,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0
mean,1445.588168,716505000.0,58.270762,69.630042,10340.920364,6.054608,5.565415,1970.533561,1984.444824,106.982916,...,470.832765,93.560865,47.478953,24.037543,2.594994,14.813424,1.882821,48.443686,6.207053,2007.824801
std,850.717105,188913500.0,42.211389,23.625372,10047.335167,1.374756,1.128422,30.403527,20.454546,188.356829,...,213.070155,121.174306,69.209179,73.212237,24.948416,52.975963,29.899698,549.858353,2.644097,1.327396
min,2.0,526302100.0,20.0,21.0,1477.0,2.0,1.0,1880.0,1950.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,692.5,528486100.0,20.0,59.0,7298.5,5.0,5.0,1954.0,1967.0,0.0,...,323.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0
50%,1435.0,535454200.0,50.0,68.0,9453.0,6.0,5.0,1972.0,1992.0,0.0,...,473.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,2197.0,907192100.0,70.0,80.0,11606.5,7.0,6.0,2000.0,2003.0,173.5,...,576.0,171.0,70.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,2928.0,1007100000.0,190.0,182.0,215245.0,10.0,9.0,2010.0,2010.0,1378.0,...,1488.0,690.0,742.0,1012.0,360.0,576.0,555.0,15500.0,12.0,2010.0


# Data Cleaning

Check for Null Values.

In [6]:
# Count the missing values in each column
test.isnull().sum()

Id                   0
PID                  0
MS SubClass          0
MS Zoning            0
Lot Frontage       160
Lot Area             0
Street               0
Alley              821
Lot Shape            0
Land Contour         0
Utilities            0
Lot Config           0
Land Slope           0
Neighborhood         0
Condition 1          0
Condition 2          0
Bldg Type            0
House Style          0
Overall Qual         0
Overall Cond         0
Year Built           0
Year Remod/Add       0
Roof Style           0
Roof Matl            0
Exterior 1st         0
Exterior 2nd         0
Mas Vnr Type         1
Mas Vnr Area         1
Exter Qual           0
Exter Cond           0
Foundation           0
Bsmt Qual           25
Bsmt Cond           25
Bsmt Exposure       25
BsmtFin Type 1      25
BsmtFin SF 1         0
BsmtFin Type 2      25
BsmtFin SF 2         0
Bsmt Unf SF          0
Total Bsmt SF        0
Heating              0
Heating QC           0
Central Air          0
Electrical 

We apply the same techniques used on the training dataset and drop the following variables: `Alley`, `Fireplace Qu`, `Pool QC`, `Fence` & `Misc Feature`.

In [7]:
# Shape (rows, columns) before any column is dropped
test.shape

(879, 80)

In [8]:
# Drop the 5 columns excluded from the analysis.
# The result is re-assigned (instead of inplace=True) and errors='ignore' is set so
# the cell can be re-run: a second run no longer raises KeyError for columns that
# are already gone. The redundant axis=1 is omitted because `columns=` already
# selects the column axis.
test = test.drop(
    columns=['Alley' , 'Fireplace Qu' , 'Pool QC' , 'Fence' , 'Misc Feature'],
    errors='ignore',
)

In [9]:
# Shape (rows, columns) after dropping the 5 columns
test.shape

(879, 75)

In [10]:
# Re-count the missing values per column after dropping the 5 columns
test.isnull().sum()

Id                   0
PID                  0
MS SubClass          0
MS Zoning            0
Lot Frontage       160
Lot Area             0
Street               0
Lot Shape            0
Land Contour         0
Utilities            0
Lot Config           0
Land Slope           0
Neighborhood         0
Condition 1          0
Condition 2          0
Bldg Type            0
House Style          0
Overall Qual         0
Overall Cond         0
Year Built           0
Year Remod/Add       0
Roof Style           0
Roof Matl            0
Exterior 1st         0
Exterior 2nd         0
Mas Vnr Type         1
Mas Vnr Area         1
Exter Qual           0
Exter Cond           0
Foundation           0
Bsmt Qual           25
Bsmt Cond           25
Bsmt Exposure       25
BsmtFin Type 1      25
BsmtFin SF 1         0
BsmtFin Type 2      25
BsmtFin SF 2         0
Bsmt Unf SF          0
Total Bsmt SF        0
Heating              0
Heating QC           0
Central Air          0
Electrical           1
1st Flr SF 

Fill in the remaining missing values: inspect each affected column first, then replace its missing values.

Handling of `Lot Frontage` null data.

In [11]:
# Frequency of each 'Lot Frontage' value, with NaN counted as its own entry
test['Lot Frontage'].value_counts(dropna=False)

NaN      160
60.0      97
80.0      43
75.0      37
70.0      37
        ... 
174.0      1
122.0      1
150.0      1
31.0       1
133.0      1
Name: Lot Frontage, Length: 105, dtype: int64

In [12]:
# Impute missing 'Lot Frontage' with the column's most frequent value (its mode),
# then re-display the counts.
# NOTE(review): the mode is computed from the test set itself - confirm it matches
# the value used when the training data was cleaned.
lot_frontage_mode = test['Lot Frontage'].mode()[0]
test['Lot Frontage'] = test['Lot Frontage'].fillna(lot_frontage_mode)
test['Lot Frontage'].value_counts(dropna=False)

60.0     257
80.0      43
70.0      37
75.0      37
50.0      27
        ... 
122.0      1
150.0      1
31.0       1
112.0      1
133.0      1
Name: Lot Frontage, Length: 104, dtype: int64

Handling of `Mas Vnr Type` null data.

In [13]:
# Frequency of each 'Mas Vnr Type' category, with NaN counted as its own entry
test['Mas Vnr Type'].value_counts(dropna=False)

None       534
BrkFace    250
Stone       81
BrkCmn      12
CBlock       1
NaN          1
Name: Mas Vnr Type, dtype: int64

In [14]:
# Label missing 'Mas Vnr Type' entries as 'None' (most likely the house has no
# masonry veneer), then re-display the counts.
mas_vnr_type_missing = test['Mas Vnr Type'].isna()
test.loc[mas_vnr_type_missing, 'Mas Vnr Type'] = 'None'
test['Mas Vnr Type'].value_counts(dropna=False)

None       535
BrkFace    250
Stone       81
BrkCmn      12
CBlock       1
Name: Mas Vnr Type, dtype: int64

Handling of `Mas Vnr Area` null data.

In [15]:
# Frequency of each 'Mas Vnr Area' value, with NaN counted as its own entry
test['Mas Vnr Area'].value_counts(dropna=False)

0.0      532
216.0      7
80.0       5
196.0      5
420.0      5
        ... 
464.0      1
NaN        1
312.0      1
101.0      1
264.0      1
Name: Mas Vnr Area, Length: 234, dtype: int64

In [16]:
# Replace the missing values with 0 (most likely the house has no masonry veneer).
# The literal 0 is used rather than the column mode so the fill always matches this
# stated intent, even on data where 0 is not the most frequent value.
test['Mas Vnr Area'] = test['Mas Vnr Area'].fillna(0)
test['Mas Vnr Area'].value_counts(dropna=False)

0.0      533
216.0      7
80.0       5
196.0      5
420.0      5
        ... 
464.0      1
312.0      1
101.0      1
238.0      1
264.0      1
Name: Mas Vnr Area, Length: 233, dtype: int64

Handling of `Bsmt Qual` null data

In [17]:
# Frequency of each 'Bsmt Qual' category, with NaN counted as its own entry
test['Bsmt Qual'].value_counts(dropna=False)

TA     396
Gd     355
Ex      74
Fa      28
NaN     25
Po       1
Name: Bsmt Qual, dtype: int64

In [18]:
# Fill missing 'Bsmt Qual' entries with 'TA' (the most frequent category), then
# re-display the counts.
# NOTE(review): the 'BsmtFin Type' columns fill their 25 missing rows with 'NA'
# (no basement) - confirm that 'TA' is the intended value here.
bsmt_qual_missing = test['Bsmt Qual'].isna()
test.loc[bsmt_qual_missing, 'Bsmt Qual'] = 'TA'
test['Bsmt Qual'].value_counts(dropna=False)

TA    421
Gd    355
Ex     74
Fa     28
Po      1
Name: Bsmt Qual, dtype: int64

Handling of `Bsmt Cond` null data.

In [19]:
# Frequency of each 'Bsmt Cond' category, with NaN counted as its own entry
test['Bsmt Cond'].value_counts(dropna=False)

TA     782
Fa      39
Gd      33
NaN     25
Name: Bsmt Cond, dtype: int64

In [20]:
# Fill missing 'Bsmt Cond' entries with 'TA' (the most frequent category), then
# re-display the counts.
bsmt_cond_missing = test['Bsmt Cond'].isna()
test.loc[bsmt_cond_missing, 'Bsmt Cond'] = 'TA'
test['Bsmt Cond'].value_counts(dropna=False)

TA    807
Fa     39
Gd     33
Name: Bsmt Cond, dtype: int64

Handling of `Bsmt Exposure` null data.

In [21]:
# Frequency of each 'Bsmt Exposure' category, with NaN counted as its own entry
test['Bsmt Exposure'].value_counts(dropna=False)

No     567
Av     130
Gd      81
Mn      76
NaN     25
Name: Bsmt Exposure, dtype: int64

In [22]:
# Impute missing 'Bsmt Exposure' with the most frequent category (the mode, 'No'
# in this data), then re-display the counts.
bsmt_exposure_mode = test['Bsmt Exposure'].mode()[0]
test['Bsmt Exposure'] = test['Bsmt Exposure'].fillna(bsmt_exposure_mode)
test['Bsmt Exposure'].value_counts(dropna=False)

No    592
Av    130
Gd     81
Mn     76
Name: Bsmt Exposure, dtype: int64

Handling of `BsmtFin Type 1` null data.

In [23]:
# Frequency of each 'BsmtFin Type 1' category, with NaN counted as its own entry
test['BsmtFin Type 1'].value_counts(dropna=False)

Unf    248
GLQ    244
ALQ    136
Rec    105
BLQ     69
LwQ     52
NaN     25
Name: BsmtFin Type 1, dtype: int64

In [24]:
# Label missing 'BsmtFin Type 1' entries as 'NA' (most likely there is no
# basement), then re-display the counts.
bsmtfin1_missing = test['BsmtFin Type 1'].isna()
test.loc[bsmtfin1_missing, 'BsmtFin Type 1'] = 'NA'
test['BsmtFin Type 1'].value_counts(dropna=False)

Unf    248
GLQ    244
ALQ    136
Rec    105
BLQ     69
LwQ     52
NA      25
Name: BsmtFin Type 1, dtype: int64

Handling of `BsmtFin Type 2` null data.

In [25]:
# Frequency of each 'BsmtFin Type 2' category, with NaN counted as its own entry
test['BsmtFin Type 2'].value_counts(dropna=False)

Unf    750
LwQ     29
Rec     26
NaN     25
BLQ     20
ALQ     18
GLQ     11
Name: BsmtFin Type 2, dtype: int64

In [26]:
# Label missing 'BsmtFin Type 2' entries as 'NA' (most likely there is no
# basement), then re-display the counts.
bsmtfin2_missing = test['BsmtFin Type 2'].isna()
test.loc[bsmtfin2_missing, 'BsmtFin Type 2'] = 'NA'
test['BsmtFin Type 2'].value_counts(dropna=False)

Unf    750
LwQ     29
Rec     26
NA      25
BLQ     20
ALQ     18
GLQ     11
Name: BsmtFin Type 2, dtype: int64

Handling of `Electrical` null data.

In [27]:
# Frequency of each 'Electrical' category, with NaN counted as its own entry
test['Electrical'].value_counts(dropna=False)

SBrkr    814
FuseA     48
FuseF     15
FuseP      1
NaN        1
Name: Electrical, dtype: int64

In [28]:
# Impute missing 'Electrical' with the most frequent category (the mode, 'SBrkr'
# in this data), then re-display the counts.
electrical_mode = test['Electrical'].mode()[0]
test['Electrical'] = test['Electrical'].fillna(electrical_mode)
test['Electrical'].value_counts(dropna=False)

SBrkr    815
FuseA     48
FuseF     15
FuseP      1
Name: Electrical, dtype: int64

Handling of `Garage Type` null data.

In [29]:
# Frequency of each 'Garage Type' category, with NaN counted as its own entry
test['Garage Type'].value_counts(dropna=False)

Attchd     518
Detchd     246
BuiltIn     54
NaN         44
Basment      9
CarPort      4
2Types       4
Name: Garage Type, dtype: int64

In [30]:
# Fill missing 'Garage Type' entries with 'Attchd' (the most frequent category),
# then re-display the counts.
# NOTE(review): a missing garage type may mean the house has no garage - confirm
# that imputing 'Attchd' matches the training-data treatment.
garage_type_missing = test['Garage Type'].isna()
test.loc[garage_type_missing, 'Garage Type'] = 'Attchd'
test['Garage Type'].value_counts(dropna=False)

Attchd     562
Detchd     246
BuiltIn     54
Basment      9
CarPort      4
2Types       4
Name: Garage Type, dtype: int64

Handling of `Garage Yr Blt` null data.

In [31]:
# Frequency of each 'Garage Yr Blt' value, with NaN counted as its own entry
test['Garage Yr Blt'].value_counts(dropna=False)

NaN       45
2005.0    37
2006.0    35
2007.0    31
2004.0    27
2003.0    26
1977.0    20
1950.0    19
1997.0    18
2008.0    17
1968.0    16
1974.0    16
1993.0    16
1960.0    16
1999.0    15
1976.0    15
1998.0    14
2000.0    14
1980.0    14
1994.0    14
1969.0    14
1957.0    14
2001.0    14
1959.0    13
2002.0    13
1920.0    12
1970.0    12
1963.0    12
2009.0    12
1954.0    12
1961.0    11
1978.0    11
1964.0    10
1962.0    10
1972.0    10
1958.0    10
1966.0    10
1967.0    10
1979.0    10
1956.0    10
1995.0     9
1973.0     9
1996.0     9
1955.0     9
1925.0     9
1984.0     8
1989.0     7
1991.0     7
1930.0     7
1965.0     7
1926.0     7
1952.0     6
1985.0     6
1971.0     6
1988.0     6
1910.0     5
1981.0     5
1990.0     5
1938.0     5
1940.0     5
1939.0     5
1992.0     5
1951.0     5
1948.0     5
1949.0     4
1953.0     4
1983.0     4
1946.0     4
1941.0     4
1924.0     4
1900.0     3
1986.0     3
1915.0     3
2010.0     3
1982.0     3
1975.0     3
1922.0     2

In [32]:
# Impute missing 'Garage Yr Blt' with the most frequent year (the mode, 2005 in
# this data), then re-display the counts.
garage_year_mode = test['Garage Yr Blt'].mode()[0]
test['Garage Yr Blt'] = test['Garage Yr Blt'].fillna(garage_year_mode)
test['Garage Yr Blt'].value_counts(dropna=False)

2005.0    82
2006.0    35
2007.0    31
2004.0    27
2003.0    26
1977.0    20
1950.0    19
1997.0    18
2008.0    17
1993.0    16
1974.0    16
1968.0    16
1960.0    16
1976.0    15
1999.0    15
1957.0    14
2001.0    14
1994.0    14
1969.0    14
1998.0    14
1980.0    14
2000.0    14
1959.0    13
2002.0    13
1963.0    12
2009.0    12
1970.0    12
1920.0    12
1954.0    12
1961.0    11
1978.0    11
1967.0    10
1962.0    10
1956.0    10
1979.0    10
1958.0    10
1964.0    10
1972.0    10
1966.0    10
1996.0     9
1995.0     9
1973.0     9
1955.0     9
1925.0     9
1984.0     8
1926.0     7
1965.0     7
1989.0     7
1991.0     7
1930.0     7
1952.0     6
1985.0     6
1971.0     6
1988.0     6
1910.0     5
1981.0     5
1990.0     5
1992.0     5
1938.0     5
1940.0     5
1939.0     5
1951.0     5
1948.0     5
1949.0     4
1953.0     4
1983.0     4
1946.0     4
1941.0     4
1924.0     4
2010.0     3
1975.0     3
1982.0     3
1915.0     3
1986.0     3
1900.0     3
1922.0     2
1947.0     2

Handling of `Garage Finish` null data.

In [33]:
# Frequency of each 'Garage Finish' category, with NaN counted as its own entry
test['Garage Finish'].value_counts(dropna=False)

Unf    382
RFn    233
Fin    219
NaN     45
Name: Garage Finish, dtype: int64

In [34]:
# Impute missing 'Garage Finish' with the most frequent category (the mode, 'Unf'
# in this data), then re-display the counts.
garage_finish_mode = test['Garage Finish'].mode()[0]
test['Garage Finish'] = test['Garage Finish'].fillna(garage_finish_mode)
test['Garage Finish'].value_counts(dropna=False)

Unf    427
RFn    233
Fin    219
Name: Garage Finish, dtype: int64

Handling of `Garage Qual` null data.

In [35]:
# Frequency of each 'Garage Qual' category, with NaN counted as its own entry
test['Garage Qual'].value_counts(dropna=False)

TA     783
NaN     45
Fa      42
Gd       6
Po       3
Name: Garage Qual, dtype: int64

In [36]:
# Impute missing 'Garage Qual' with the most frequent category (the mode, 'TA' in
# this data), then re-display the counts.
garage_qual_mode = test['Garage Qual'].mode()[0]
test['Garage Qual'] = test['Garage Qual'].fillna(garage_qual_mode)
test['Garage Qual'].value_counts(dropna=False)

TA    828
Fa     42
Gd      6
Po      3
Name: Garage Qual, dtype: int64

Handling of `Garage Cond` null data.

In [37]:
# Frequency of each 'Garage Cond' category, with NaN counted as its own entry
test['Garage Cond'].value_counts(dropna=False)

TA     797
NaN     45
Fa      27
Po       6
Gd       3
Ex       1
Name: Garage Cond, dtype: int64

In [38]:
# Impute missing 'Garage Cond' with the most frequent category (the mode, 'TA' in
# this data), then re-display the counts.
garage_cond_mode = test['Garage Cond'].mode()[0]
test['Garage Cond'] = test['Garage Cond'].fillna(garage_cond_mode)
test['Garage Cond'].value_counts(dropna=False)

TA    842
Fa     27
Po      6
Gd      3
Ex      1
Name: Garage Cond, dtype: int64

In [39]:
# Re-count the missing values per column to check whether any nulls remain
test.isnull().sum()

Id                 0
PID                0
MS SubClass        0
MS Zoning          0
Lot Frontage       0
Lot Area           0
Street             0
Lot Shape          0
Land Contour       0
Utilities          0
Lot Config         0
Land Slope         0
Neighborhood       0
Condition 1        0
Condition 2        0
Bldg Type          0
House Style        0
Overall Qual       0
Overall Cond       0
Year Built         0
Year Remod/Add     0
Roof Style         0
Roof Matl          0
Exterior 1st       0
Exterior 2nd       0
Mas Vnr Type       0
Mas Vnr Area       0
Exter Qual         0
Exter Cond         0
Foundation         0
Bsmt Qual          0
Bsmt Cond          0
Bsmt Exposure      0
BsmtFin Type 1     0
BsmtFin SF 1       0
BsmtFin Type 2     0
BsmtFin SF 2       0
Bsmt Unf SF        0
Total Bsmt SF      0
Heating            0
Heating QC         0
Central Air        0
Electrical         0
1st Flr SF         0
2nd Flr SF         0
Low Qual Fin SF    0
Gr Liv Area        0
Bsmt Full Bat

This confirms that all null values in the data have been cleaned and replaced.

# Look into non-numerical data

In [40]:
# Data type of each column
test.dtypes

Id                   int64
PID                  int64
MS SubClass          int64
MS Zoning           object
Lot Frontage       float64
Lot Area             int64
Street              object
Lot Shape           object
Land Contour        object
Utilities           object
Lot Config          object
Land Slope          object
Neighborhood        object
Condition 1         object
Condition 2         object
Bldg Type           object
House Style         object
Overall Qual         int64
Overall Cond         int64
Year Built           int64
Year Remod/Add       int64
Roof Style          object
Roof Matl           object
Exterior 1st        object
Exterior 2nd        object
Mas Vnr Type        object
Mas Vnr Area       float64
Exter Qual          object
Exter Cond          object
Foundation          object
Bsmt Qual           object
Bsmt Cond           object
Bsmt Exposure       object
BsmtFin Type 1      object
BsmtFin SF 1         int64
BsmtFin Type 2      object
BsmtFin SF 2         int64
B

In [41]:
# Show only the object-dtype (text) columns.
# `np.object` was deprecated in NumPy 1.20 and removed in 1.24, where it raises
# AttributeError; the 'object' dtype string selects the same columns on every
# NumPy version.
test.select_dtypes(include='object')

Unnamed: 0,MS Zoning,Street,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,...,Central Air,Electrical,Kitchen Qual,Functional,Garage Type,Garage Finish,Garage Qual,Garage Cond,Paved Drive,Sale Type
0,RM,Pave,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,...,N,FuseP,Fa,Typ,Detchd,Unf,Po,Po,Y,WD
1,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,Sawyer,Norm,Norm,...,Y,SBrkr,TA,Typ,Attchd,Fin,TA,TA,Y,WD
2,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,...,Y,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,New
3,RM,Pave,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,...,Y,SBrkr,TA,Typ,Detchd,Unf,Fa,TA,N,WD
4,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,...,Y,SBrkr,TA,Typ,Attchd,RFn,TA,TA,Y,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
874,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,PosN,Norm,...,Y,SBrkr,TA,Typ,Attchd,Unf,TA,TA,Y,WD
875,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,...,Y,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD
876,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Sawyer,Feedr,Norm,...,Y,SBrkr,TA,Typ,Attchd,Unf,TA,TA,Y,WD
877,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,NAmes,Norm,Norm,...,Y,SBrkr,TA,Typ,Detchd,Unf,TA,TA,Y,WD


# Get_Dummies for all the Nominal variables

In [42]:
# Preview the nominal (unordered categorical) columns that will be one-hot encoded
test[['MS Zoning' , 'Street' , 'Land Contour' , 'Lot Config' , 'Neighborhood' , 
      'Condition 1' , 'Condition 2' , 'Bldg Type' , 'House Style' , 'Roof Style' ,
      'Roof Matl' , 'Exterior 1st' , 'Exterior 2nd' , 'Mas Vnr Type' , 'Foundation' ,
      'Heating' , 'Central Air' , 'Garage Type' , 'Sale Type']]

Unnamed: 0,MS Zoning,Street,Land Contour,Lot Config,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Foundation,Heating,Central Air,Garage Type,Sale Type
0,RM,Pave,Lvl,Inside,OldTown,Norm,Norm,2fmCon,2Story,Gable,CompShg,AsbShng,AsbShng,,Stone,GasA,N,Detchd,WD
1,RL,Pave,Lvl,Inside,Sawyer,Norm,Norm,Duplex,1Story,Gable,CompShg,Plywood,Plywood,,CBlock,GasA,Y,Attchd,WD
2,RL,Pave,Lvl,Inside,Gilbert,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,,PConc,GasA,Y,Attchd,New
3,RM,Pave,Lvl,Inside,OldTown,Norm,Norm,1Fam,1Story,Gable,CompShg,Wd Sdng,Wd Sdng,,CBlock,GasA,Y,Detchd,WD
4,RL,Pave,Lvl,Inside,NAmes,Norm,Norm,1Fam,1Story,Gable,CompShg,Plywood,Plywood,BrkFace,CBlock,GasA,Y,Attchd,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
874,RL,Pave,Lvl,Inside,NWAmes,PosN,Norm,1Fam,2Story,Gable,CompShg,HdBoard,HdBoard,,CBlock,GasA,Y,Attchd,WD
875,RL,Pave,Lvl,Inside,NAmes,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,CBlock,GasA,Y,Attchd,WD
876,RL,Pave,Lvl,Inside,Sawyer,Feedr,Norm,1Fam,1Story,Hip,CompShg,HdBoard,HdBoard,,CBlock,GasA,Y,Attchd,WD
877,RL,Pave,Lvl,FR2,NAmes,Norm,Norm,1Fam,1Story,Gable,CompShg,HdBoard,HdBoard,,PConc,GasA,Y,Detchd,WD


In [43]:
# One-hot encode the nominal (unordered categorical) columns.
# dtype=int keeps the indicator columns as 0/1 integers: pandas >= 2.0 creates bool
# dummies by default, which would be written as True/False in the exported CSV.
# NOTE(review): the dummy columns come from the categories present in the test set
# only - confirm they line up with the columns produced for the training data.
nominal_cols = ['MS Zoning' , 'Street' , 'Land Contour' , 'Lot Config' , 'Neighborhood' ,
                'Condition 1' , 'Condition 2' , 'Bldg Type' , 'House Style' , 'Roof Style' ,
                'Roof Matl' , 'Exterior 1st' , 'Exterior 2nd' , 'Mas Vnr Type' , 'Foundation' ,
                'Heating' , 'Central Air' , 'Garage Type' , 'Sale Type']
test = pd.get_dummies(test, columns=nominal_cols, dtype=int)

In [44]:
# Display the frame after one-hot encoding
test

Unnamed: 0,Id,PID,MS SubClass,Lot Frontage,Lot Area,Lot Shape,Utilities,Land Slope,Overall Qual,Overall Cond,...,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD
0,2658,902301120,190,69.0,9142,Reg,AllPub,Gtl,6,8,...,0,0,0,0,0,0,0,0,0,1
1,2718,905108090,90,60.0,9662,IR1,AllPub,Gtl,5,4,...,0,0,0,0,0,0,0,0,0,1
2,2414,528218130,60,58.0,17104,IR1,AllPub,Gtl,7,5,...,0,0,0,0,0,0,1,0,0,0
3,1989,902207150,30,60.0,8520,Reg,AllPub,Gtl,5,6,...,0,0,0,0,0,0,0,0,0,1
4,625,535105100,20,60.0,9500,IR1,AllPub,Gtl,6,5,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
874,1662,527377110,60,80.0,8000,Reg,AllPub,Gtl,6,6,...,0,0,0,0,0,0,0,0,0,1
875,1234,535126140,60,90.0,14670,Reg,AllPub,Gtl,6,7,...,0,0,0,0,0,0,0,0,0,1
876,1373,904100040,20,55.0,8250,Reg,AllPub,Gtl,5,5,...,0,0,0,0,0,0,0,0,0,1
877,1672,527425140,20,60.0,9000,Reg,AllPub,Gtl,4,6,...,0,0,0,0,0,0,0,0,0,1


In [45]:
# List the column labels after one-hot encoding
test.columns

Index(['Id', 'PID', 'MS SubClass', 'Lot Frontage', 'Lot Area', 'Lot Shape',
       'Utilities', 'Land Slope', 'Overall Qual', 'Overall Cond',
       ...
       'Sale Type_COD', 'Sale Type_CWD', 'Sale Type_Con', 'Sale Type_ConLD',
       'Sale Type_ConLI', 'Sale Type_ConLw', 'Sale Type_New', 'Sale Type_Oth',
       'Sale Type_VWD', 'Sale Type_WD '],
      dtype='object', length=199)

# Do Mapping for all the Ordinal variables

In [46]:
# Preview the ordinal columns that will be mapped to integer codes
test[['Lot Shape' , 'Utilities' , 'Land Slope' , 'Exter Qual' , 'Exter Cond' ,
      'Bsmt Qual' , 'Bsmt Cond' , 'Bsmt Exposure' , 'BsmtFin Type 1' , 'BsmtFin Type 2' ,
      'Heating QC' , 'Electrical' , 'Kitchen Qual' , 'Functional' , 'Garage Finish' ,
      'Garage Qual' , 'Garage Cond' , 'Paved Drive']]

Unnamed: 0,Lot Shape,Utilities,Land Slope,Exter Qual,Exter Cond,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin Type 2,Heating QC,Electrical,Kitchen Qual,Functional,Garage Finish,Garage Qual,Garage Cond,Paved Drive
0,Reg,AllPub,Gtl,TA,Fa,Fa,TA,No,Unf,Unf,Gd,FuseP,Fa,Typ,Unf,Po,Po,Y
1,IR1,AllPub,Gtl,TA,TA,Gd,TA,No,Unf,Unf,TA,SBrkr,TA,Typ,Fin,TA,TA,Y
2,IR1,AllPub,Gtl,Gd,TA,Gd,Gd,Av,GLQ,Unf,Ex,SBrkr,Gd,Typ,RFn,TA,TA,Y
3,Reg,AllPub,Gtl,Gd,TA,TA,TA,No,Unf,Unf,TA,SBrkr,TA,Typ,Unf,Fa,TA,N
4,IR1,AllPub,Gtl,TA,TA,Gd,TA,No,BLQ,Unf,Gd,SBrkr,TA,Typ,RFn,TA,TA,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
874,Reg,AllPub,Gtl,TA,TA,TA,TA,No,ALQ,LwQ,TA,SBrkr,TA,Typ,Unf,TA,TA,Y
875,Reg,AllPub,Gtl,Gd,Gd,TA,TA,No,BLQ,Unf,Ex,SBrkr,Gd,Typ,RFn,TA,TA,Y
876,Reg,AllPub,Gtl,TA,TA,TA,TA,No,BLQ,LwQ,Ex,SBrkr,TA,Typ,Unf,TA,TA,Y
877,Reg,AllPub,Gtl,TA,TA,TA,TA,No,ALQ,Unf,TA,SBrkr,TA,Typ,Unf,TA,TA,Y


Mapping the `Lot Shape` data.

In [47]:
# Category counts for 'Lot Shape' before encoding (NaN included)
test['Lot Shape'].value_counts(dropna=False)

Reg    564
IR1    287
IR2     21
IR3      7
Name: Lot Shape, dtype: int64

In [48]:
# Integer codes for the 'Lot Shape' categories, numbered from 1 in the order listed.
Lot_dict = {label: code for code, label in enumerate(['Reg', 'IR1', 'IR2', 'IR3'], start=1)}

In [49]:
# Encode 'Lot Shape' with the integer codes in Lot_dict.
# Guarded so the cell is safe to re-run: mapping the already-encoded integers a
# second time would match no dictionary key and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(test['Lot Shape']):
    test['Lot Shape'] = test['Lot Shape'].map(Lot_dict)

In [50]:
# Counts after encoding; a NaN entry here would indicate an unmapped category
test['Lot Shape'].value_counts(dropna=False)

1    564
2    287
3     21
4      7
Name: Lot Shape, dtype: int64

Mapping the `Utilities` data.

In [51]:
# Category counts for 'Utilities' before encoding (NaN included)
test['Utilities'].value_counts(dropna=False)

AllPub    878
NoSewr      1
Name: Utilities, dtype: int64

In [52]:
# Integer codes for the two 'Utilities' categories seen in the test data.
Util_dict = dict(zip(['AllPub', 'NoSewr'], [1, 2]))

In [53]:
# Encode 'Utilities' with the integer codes in Util_dict.
# Guarded so the cell is safe to re-run: mapping the already-encoded integers a
# second time would match no dictionary key and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(test['Utilities']):
    test['Utilities'] = test['Utilities'].map(Util_dict)

In [54]:
# Counts after encoding; a NaN entry here would indicate an unmapped category
test['Utilities'].value_counts(dropna=False)

1    878
2      1
Name: Utilities, dtype: int64

Mapping the `Land Slope` data.

In [55]:
# Category counts for 'Land Slope' before encoding (NaN included)
test['Land Slope'].value_counts(dropna=False)

Gtl    836
Mod     37
Sev      6
Name: Land Slope, dtype: int64

In [56]:
# Integer codes for the 'Land Slope' categories, numbered from 1 in the order listed.
Land_dict = {label: code for code, label in enumerate(['Gtl', 'Mod', 'Sev'], start=1)}

In [57]:
# Encode 'Land Slope' with the integer codes in Land_dict.
# Guarded so the cell is safe to re-run: mapping the already-encoded integers a
# second time would match no dictionary key and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(test['Land Slope']):
    test['Land Slope'] = test['Land Slope'].map(Land_dict)

In [58]:
# Counts after encoding; a NaN entry here would indicate an unmapped category
test['Land Slope'].value_counts(dropna=False)

1    836
2     37
3      6
Name: Land Slope, dtype: int64

Mapping the `Exter Qual` data.

In [59]:
# Category counts for 'Exter Qual' before encoding (NaN included)
test['Exter Qual'].value_counts(dropna=False)

TA    552
Gd    292
Ex     26
Fa      9
Name: Exter Qual, dtype: int64

In [60]:
# Integer codes for the 'Exter Qual' quality scale (Ex=1 ... Po=5).
# 'Po' is included so the scale matches the five-level Ex/Gd/TA/Fa/Po dictionaries
# used for the other quality columns in this notebook; with the partial dictionary
# a 'Po' value would silently be mapped to NaN.
Exter_dict = {'Ex':1 , 'Gd':2 , 'TA':3 , 'Fa':4 , 'Po':5}

In [61]:
# Encode 'Exter Qual' with the integer codes in Exter_dict.
# Guarded so the cell is safe to re-run: mapping the already-encoded integers a
# second time would match no dictionary key and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(test['Exter Qual']):
    test['Exter Qual'] = test['Exter Qual'].map(Exter_dict)

In [62]:
# Counts after encoding; a NaN entry here would indicate an unmapped category
test['Exter Qual'].value_counts(dropna=False)

3    552
2    292
1     26
4      9
Name: Exter Qual, dtype: int64

Mapping the `Exter Cond` data.

In [63]:
# Category counts for 'Exter Cond' before encoding (NaN included)
test['Exter Cond'].value_counts(dropna=False)

TA    771
Gd     84
Fa     18
Ex      5
Po      1
Name: Exter Cond, dtype: int64

In [64]:
# Integer codes for the 'Exter Cond' quality scale, numbered from 1 in the order
# listed. Note: this rebinds Exter_dict, which an earlier cell defined for 'Exter Qual'.
Exter_dict = {label: code for code, label in enumerate(['Ex', 'Gd', 'TA', 'Fa', 'Po'], start=1)}

In [65]:
# Encode 'Exter Cond' with the integer codes in Exter_dict.
# Guarded so the cell is safe to re-run: mapping the already-encoded integers a
# second time would match no dictionary key and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(test['Exter Cond']):
    test['Exter Cond'] = test['Exter Cond'].map(Exter_dict)

In [66]:
# Counts after encoding; a NaN entry here would indicate an unmapped category
test['Exter Cond'].value_counts(dropna=False)

3    771
2     84
4     18
1      5
5      1
Name: Exter Cond, dtype: int64

Mapping the `Bsmt Qual` data.

In [67]:
# Category counts for 'Bsmt Qual' before encoding (NaN included)
test['Bsmt Qual'].value_counts(dropna=False)

TA    421
Gd    355
Ex     74
Fa     28
Po      1
Name: Bsmt Qual, dtype: int64

In [68]:
# Integer codes for the 'Bsmt Qual' quality scale, numbered from 1 in the order listed.
Bsmt_dict = dict(zip(['Ex', 'Gd', 'TA', 'Fa', 'Po'], range(1, 6)))

In [69]:
# Encode 'Bsmt Qual' with the integer codes in Bsmt_dict.
# Guarded so the cell is safe to re-run: mapping the already-encoded integers a
# second time would match no dictionary key and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(test['Bsmt Qual']):
    test['Bsmt Qual'] = test['Bsmt Qual'].map(Bsmt_dict)

In [70]:
# Counts after encoding; a NaN entry here would indicate an unmapped category
test['Bsmt Qual'].value_counts(dropna=False)

3    421
2    355
1     74
4     28
5      1
Name: Bsmt Qual, dtype: int64

Mapping the `Bsmt Cond` data.

In [71]:
# Category counts for 'Bsmt Cond' before encoding (NaN included)
test['Bsmt Cond'].value_counts(dropna=False)

TA    807
Fa     39
Gd     33
Name: Bsmt Cond, dtype: int64

In [72]:
# Integer codes for the 'Bsmt Cond' quality scale (Ex=1 ... Po=5).
# 'Ex' and 'Po' are included so the scale matches the five-level Ex/Gd/TA/Fa/Po
# dictionaries used for the other quality columns in this notebook; with the
# partial dictionary those levels would silently be mapped to NaN. The codes for
# 'Gd', 'TA' and 'Fa' are unchanged.
# Note: this rebinds Bsmt_dict, which an earlier cell defined for 'Bsmt Qual'.
Bsmt_dict = {'Ex':1 , 'Gd':2 , 'TA':3 , 'Fa':4 , 'Po':5}

In [73]:
# Encode 'Bsmt Cond' with the integer codes in Bsmt_dict.
# Guarded so the cell is safe to re-run: mapping the already-encoded integers a
# second time would match no dictionary key and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(test['Bsmt Cond']):
    test['Bsmt Cond'] = test['Bsmt Cond'].map(Bsmt_dict)

In [74]:
# Counts after encoding; a NaN entry here would indicate an unmapped category
test['Bsmt Cond'].value_counts(dropna=False)

3    807
4     39
2     33
Name: Bsmt Cond, dtype: int64

Mapping the `Bsmt Exposure` data.

In [75]:
# Category counts for 'Bsmt Exposure' before encoding (NaN included)
test['Bsmt Exposure'].value_counts(dropna=False)

No    592
Av    130
Gd     81
Mn     76
Name: Bsmt Exposure, dtype: int64

In [76]:
# Integer codes for the 'Bsmt Exposure' categories, numbered from 1 in the order
# listed. Note: this rebinds Bsmt_dict, which earlier cells used for other columns.
Bsmt_dict = {label: code for code, label in enumerate(['Gd', 'Av', 'Mn', 'No'], start=1)}

In [77]:
# Encode 'Bsmt Exposure' with the integer codes in Bsmt_dict.
# Guarded so the cell is safe to re-run: mapping the already-encoded integers a
# second time would match no dictionary key and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(test['Bsmt Exposure']):
    test['Bsmt Exposure'] = test['Bsmt Exposure'].map(Bsmt_dict)

In [78]:
# Counts after encoding; a NaN entry here would indicate an unmapped category
test['Bsmt Exposure'].value_counts(dropna=False)

4    592
2    130
1     81
3     76
Name: Bsmt Exposure, dtype: int64

Mapping the `BsmtFin Type 1` data.

In [79]:
# Category counts for 'BsmtFin Type 1' before encoding (NaN included)
test['BsmtFin Type 1'].value_counts(dropna=False)

Unf    248
GLQ    244
ALQ    136
Rec    105
BLQ     69
LwQ     52
NA      25
Name: BsmtFin Type 1, dtype: int64

In [80]:
# Integer codes for the 'BsmtFin Type 1' categories, numbered from 1 in the order
# listed; the 'NA' placeholder filled in during cleaning gets the last code.
BsmtFin_dict = {
    label: code
    for code, label in enumerate(['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA'], start=1)
}

In [81]:
# Encode 'BsmtFin Type 1' with the integer codes in BsmtFin_dict.
# Guarded so the cell is safe to re-run: mapping the already-encoded integers a
# second time would match no dictionary key and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(test['BsmtFin Type 1']):
    test['BsmtFin Type 1'] = test['BsmtFin Type 1'].map(BsmtFin_dict)

In [82]:
# Counts after encoding; a NaN entry here would indicate an unmapped category
test['BsmtFin Type 1'].value_counts(dropna=False)

6    248
1    244
2    136
4    105
3     69
5     52
7     25
Name: BsmtFin Type 1, dtype: int64

Mapping the `BsmtFin Type 2` data.

In [83]:
# Category counts for 'BsmtFin Type 2' before encoding (NaN included)
test['BsmtFin Type 2'].value_counts(dropna=False)

Unf    750
LwQ     29
Rec     26
NA      25
BLQ     20
ALQ     18
GLQ     11
Name: BsmtFin Type 2, dtype: int64

In [84]:
# Integer codes for the 'BsmtFin Type 2' categories (the same mapping as the one
# defined for 'BsmtFin Type 1'), numbered from 1 in the order listed.
BsmtFin_dict = dict(zip(['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA'], range(1, 8)))

In [85]:
# Encode 'BsmtFin Type 2' with the integer codes in BsmtFin_dict.
# Guarded so the cell is safe to re-run: mapping the already-encoded integers a
# second time would match no dictionary key and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(test['BsmtFin Type 2']):
    test['BsmtFin Type 2'] = test['BsmtFin Type 2'].map(BsmtFin_dict)

In [86]:
# Counts after encoding; a NaN entry here would indicate an unmapped category
test['BsmtFin Type 2'].value_counts(dropna=False)

6    750
5     29
4     26
7     25
3     20
2     18
1     11
Name: BsmtFin Type 2, dtype: int64

Mapping the `Heating QC` data.

In [87]:
# Category counts for 'Heating QC' before encoding (NaN included)
test['Heating QC'].value_counts(dropna=False)

Ex    430
TA    267
Gd    157
Fa     25
Name: Heating QC, dtype: int64

In [88]:
# Integer codes for the 'Heating QC' quality scale (Ex=1 ... Po=5).
# 'Po' is included so the scale matches the five-level Ex/Gd/TA/Fa/Po dictionaries
# used for the other quality columns in this notebook; with the partial dictionary
# a 'Po' value would silently be mapped to NaN.
Heating_dict = {'Ex':1 , 'Gd':2 , 'TA':3 , 'Fa':4 , 'Po':5}

In [89]:
# Encode 'Heating QC' with the integer codes in Heating_dict.
# Guarded so the cell is safe to re-run: mapping the already-encoded integers a
# second time would match no dictionary key and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(test['Heating QC']):
    test['Heating QC'] = test['Heating QC'].map(Heating_dict)

In [90]:
# Counts after encoding; a NaN entry here would indicate an unmapped category
test['Heating QC'].value_counts(dropna=False)

1    430
3    267
2    157
4     25
Name: Heating QC, dtype: int64

Mapping the `Electrical` data.

In [91]:
# Category counts for 'Electrical' before encoding (NaN included)
test['Electrical'].value_counts(dropna=False)

SBrkr    815
FuseA     48
FuseF     15
FuseP      1
Name: Electrical, dtype: int64

In [92]:
# Integer codes for the 'Electrical' categories, numbered from 1 in the order listed.
Elec_dict = {label: code for code, label in enumerate(['SBrkr', 'FuseA', 'FuseF', 'FuseP'], start=1)}

In [93]:
# Encode 'Electrical' with the integer codes in Elec_dict.
# Guarded so the cell is safe to re-run: mapping the already-encoded integers a
# second time would match no dictionary key and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(test['Electrical']):
    test['Electrical'] = test['Electrical'].map(Elec_dict)

In [94]:
# Counts after encoding; a NaN entry here would indicate an unmapped category
test['Electrical'].value_counts(dropna=False)

1    815
2     48
3     15
4      1
Name: Electrical, dtype: int64

Mapping the `Kitchen Qual` data.

In [95]:
# Category counts for 'Kitchen Qual' before encoding (NaN included)
test['Kitchen Qual'].value_counts(dropna=False)

TA    447
Gd    354
Ex     54
Fa     23
Po      1
Name: Kitchen Qual, dtype: int64

In [96]:
# Integer codes for the 'Kitchen Qual' quality scale, numbered from 1 in the order listed.
Kit_dict = dict(zip(['Ex', 'Gd', 'TA', 'Fa', 'Po'], range(1, 6)))

In [97]:
# Encode 'Kitchen Qual' with the integer codes in Kit_dict.
# Guarded so the cell is safe to re-run: mapping the already-encoded integers a
# second time would match no dictionary key and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(test['Kitchen Qual']):
    test['Kitchen Qual'] = test['Kitchen Qual'].map(Kit_dict)

In [98]:
# Counts after encoding; a NaN entry here would indicate an unmapped category
test['Kitchen Qual'].value_counts(dropna=False)

3    447
2    354
1     54
4     23
5      1
Name: Kitchen Qual, dtype: int64

Mapping the `Functional` data.

In [99]:
# Category counts for 'Functional' before encoding (NaN included)
test['Functional'].value_counts(dropna=False)

Typ     813
Min2     28
Min1     23
Maj1      7
Mod       6
Maj2      2
Name: Functional, dtype: int64

In [100]:
# Integer codes for the 'Functional' categories, numbered from 1 in the order listed.
Fun_dict = {
    label: code
    for code, label in enumerate(['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2'], start=1)
}

In [101]:
# Encode 'Functional' with the integer codes in Fun_dict.
# Guarded so the cell is safe to re-run: mapping the already-encoded integers a
# second time would match no dictionary key and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(test['Functional']):
    test['Functional'] = test['Functional'].map(Fun_dict)

In [102]:
# Counts after encoding; a NaN entry here would indicate an unmapped category
test['Functional'].value_counts(dropna=False)

1    813
3     28
2     23
5      7
4      6
6      2
Name: Functional, dtype: int64

Mapping the `Garage Finish` data.

In [103]:
# Category counts for 'Garage Finish' before encoding (NaN included)
test['Garage Finish'].value_counts(dropna=False)

Unf    427
RFn    233
Fin    219
Name: Garage Finish, dtype: int64

In [104]:
# Integer codes for the 'Garage Finish' categories, numbered from 1 in the order listed.
Gar_dict = {label: code for code, label in enumerate(['Fin', 'RFn', 'Unf'], start=1)}

In [105]:
# Encode 'Garage Finish' with the integer codes in Gar_dict.
# Guarded so the cell is safe to re-run: mapping the already-encoded integers a
# second time would match no dictionary key and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(test['Garage Finish']):
    test['Garage Finish'] = test['Garage Finish'].map(Gar_dict)

In [106]:
# Counts after encoding; a NaN entry here would indicate an unmapped category
test['Garage Finish'].value_counts(dropna=False)

3    427
2    233
1    219
Name: Garage Finish, dtype: int64

Mapping the `Garage Qual` data.

In [107]:
# Category counts for 'Garage Qual' before encoding (NaN included)
test['Garage Qual'].value_counts(dropna=False)

TA    828
Fa     42
Gd      6
Po      3
Name: Garage Qual, dtype: int64

In [108]:
# Integer codes for the 'Garage Qual' quality scale, numbered from 1 in the order
# listed. Note: this rebinds Gar_dict, which an earlier cell defined for 'Garage Finish'.
Gar_dict = dict(zip(['Ex', 'Gd', 'TA', 'Fa', 'Po'], range(1, 6)))

In [109]:
# Encode 'Garage Qual' with the integer codes in Gar_dict.
# Guarded so the cell is safe to re-run: mapping the already-encoded integers a
# second time would match no dictionary key and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(test['Garage Qual']):
    test['Garage Qual'] = test['Garage Qual'].map(Gar_dict)

In [110]:
# Counts after encoding; a NaN entry here would indicate an unmapped category
test['Garage Qual'].value_counts(dropna=False)

3    828
4     42
2      6
5      3
Name: Garage Qual, dtype: int64

Mapping the `Garage Cond` data.

In [111]:
# Category counts for 'Garage Cond' before encoding (NaN included)
test['Garage Cond'].value_counts(dropna=False)

TA    842
Fa     27
Po      6
Gd      3
Ex      1
Name: Garage Cond, dtype: int64

In [112]:
# Integer codes for the 'Garage Cond' quality scale (the same mapping as the one
# defined for 'Garage Qual'), numbered from 1 in the order listed.
Gar_dict = {label: code for code, label in enumerate(['Ex', 'Gd', 'TA', 'Fa', 'Po'], start=1)}

In [113]:
# Encode 'Garage Cond' with the integer codes in Gar_dict.
# Guarded so the cell is safe to re-run: mapping the already-encoded integers a
# second time would match no dictionary key and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(test['Garage Cond']):
    test['Garage Cond'] = test['Garage Cond'].map(Gar_dict)

In [114]:
# Counts after encoding; a NaN entry here would indicate an unmapped category
test['Garage Cond'].value_counts(dropna=False)

3    842
4     27
5      6
2      3
1      1
Name: Garage Cond, dtype: int64

Mapping the `Paved Drive` data.

In [115]:
# Category counts for 'Paved Drive' before encoding (NaN included)
test['Paved Drive'].value_counts(dropna=False)

Y    791
N     65
P     23
Name: Paved Drive, dtype: int64

In [116]:
# Integer codes for the 'Paved Drive' categories, numbered from 1 in the order listed.
Paved_dict = {label: code for code, label in enumerate(['Y', 'P', 'N'], start=1)}

In [117]:
# Encode 'Paved Drive' with the integer codes in Paved_dict.
# Guarded so the cell is safe to re-run: mapping the already-encoded integers a
# second time would match no dictionary key and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(test['Paved Drive']):
    test['Paved Drive'] = test['Paved Drive'].map(Paved_dict)

In [118]:
# Counts after encoding; a NaN entry here would indicate an unmapped category
test['Paved Drive'].value_counts(dropna=False)

1    791
3     65
2     23
Name: Paved Drive, dtype: int64

In [119]:
# Preview the first rows of the test data after the ordinal encoding
test.head()

Unnamed: 0,Id,PID,MS SubClass,Lot Frontage,Lot Area,Lot Shape,Utilities,Land Slope,Overall Qual,Overall Cond,...,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD
0,2658,902301120,190,69.0,9142,1,1,1,6,8,...,0,0,0,0,0,0,0,0,0,1
1,2718,905108090,90,60.0,9662,2,1,1,5,4,...,0,0,0,0,0,0,0,0,0,1
2,2414,528218130,60,58.0,17104,2,1,1,7,5,...,0,0,0,0,0,0,1,0,0,0
3,1989,902207150,30,60.0,8520,1,1,1,5,6,...,0,0,0,0,0,0,0,0,0,1
4,625,535105100,20,60.0,9500,2,1,1,6,5,...,0,0,0,0,0,0,0,0,0,1


In [120]:
# List the column labels of the encoded test data
test.columns

Index(['Id', 'PID', 'MS SubClass', 'Lot Frontage', 'Lot Area', 'Lot Shape',
       'Utilities', 'Land Slope', 'Overall Qual', 'Overall Cond',
       ...
       'Sale Type_COD', 'Sale Type_CWD', 'Sale Type_Con', 'Sale Type_ConLD',
       'Sale Type_ConLI', 'Sale Type_ConLw', 'Sale Type_New', 'Sale Type_Oth',
       'Sale Type_VWD', 'Sale Type_WD '],
      dtype='object', length=199)

# Insert my SalePrice column

In [121]:
# Add a 'SalePrice' column with every row set to 0.
test = test.assign(SalePrice=0)

In [122]:
# Preview the first rows, now including the 'SalePrice' column
test.head()

Unnamed: 0,Id,PID,MS SubClass,Lot Frontage,Lot Area,Lot Shape,Utilities,Land Slope,Overall Qual,Overall Cond,...,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD,SalePrice
0,2658,902301120,190,69.0,9142,1,1,1,6,8,...,0,0,0,0,0,0,0,0,1,0
1,2718,905108090,90,60.0,9662,2,1,1,5,4,...,0,0,0,0,0,0,0,0,1,0
2,2414,528218130,60,58.0,17104,2,1,1,7,5,...,0,0,0,0,0,1,0,0,0,0
3,1989,902207150,30,60.0,8520,1,1,1,5,6,...,0,0,0,0,0,0,0,0,1,0
4,625,535105100,20,60.0,9500,2,1,1,6,5,...,0,0,0,0,0,0,0,0,1,0


In [123]:
# List the column labels, now including 'SalePrice'
test.columns

Index(['Id', 'PID', 'MS SubClass', 'Lot Frontage', 'Lot Area', 'Lot Shape',
       'Utilities', 'Land Slope', 'Overall Qual', 'Overall Cond',
       ...
       'Sale Type_CWD', 'Sale Type_Con', 'Sale Type_ConLD', 'Sale Type_ConLI',
       'Sale Type_ConLw', 'Sale Type_New', 'Sale Type_Oth', 'Sale Type_VWD',
       'Sale Type_WD ', 'SalePrice'],
      dtype='object', length=200)

# Build the same model as in training

In [128]:
# Columns used as model inputs.
# NOTE(review): presumably the feature set chosen in the training notebook - confirm
# the names and their order match what the trained model expects.
feature_cols = [
    'Overall Qual', 'Gr Liv Area', 'Total Bsmt SF', 'Garage Area',
    'Year Built', 'Year Remod/Add', 'Mas Vnr Area', 'Fireplaces',
    'BsmtFin SF 1', 'Lot Frontage', 'Lot Area', 'Exter Qual',
    'Kitchen Qual', 'Bsmt Qual', 'Exterior 1st_BrkFace', 'Exterior 1st_CemntBd',
    'Neighborhood_Crawfor', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_StoneBr',
    'Mas Vnr Type_BrkCmn', 'Mas Vnr Type_BrkFace',
]
X = test[feature_cols]

In [129]:
# Target series: the 'SalePrice' column of `test` (set to 0 earlier in this notebook).
y = test.loc[:, 'SalePrice']

In [130]:
# Write the selected feature columns to CSV, omitting the row index.
X.to_csv('../dataset/attempt3.csv',index=False)