In [1]:
pip install seaborn matplotlib

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
#imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


In [3]:
#load csv
housing = pd.read_csv("Housing_Price_Data.csv")
print(housing.head())

      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  


In [4]:
#understand shape
housing.shape

(545, 13)

In [5]:
#understand data types
housing.dtypes

price                int64
area                 int64
bedrooms             int64
bathrooms            int64
stories              int64
mainroad            object
guestroom           object
basement            object
hotwaterheating     object
airconditioning     object
parking              int64
prefarea            object
furnishingstatus    object
dtype: object

In [6]:
#Summary Statistics
#Summary Stats for Numerical Columns
pd.set_option('display.float_format', '{:.2f}'.format)
housing.describe()


Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.25,5150.54,2.97,1.29,1.81,0.69
std,1870439.62,2170.14,0.74,0.5,0.87,0.86
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


In [7]:
#Summary Stats for Categorical Data
#Unique values in categorical data columns
for column in housing.columns:
    if housing[column].dtype == 'object':
        print(f"Unique values in column {column}: {housing[column].unique()}")
    

Unique values in column mainroad: ['yes' 'no']
Unique values in column guestroom: ['no' 'yes']
Unique values in column basement: ['no' 'yes']
Unique values in column hotwaterheating: ['no' 'yes']
Unique values in column airconditioning: ['yes' 'no']
Unique values in column prefarea: ['yes' 'no']
Unique values in column furnishingstatus: ['furnished' 'semi-furnished' 'unfurnished']


In [8]:
#Summary Statistics for Categorical Data
for column in housing.columns:
    if housing[column].dtype == 'object':
        print(f"Total count of values in column: {housing[column].value_counts()}")

Total count of values in column: mainroad
yes    468
no      77
Name: count, dtype: int64
Total count of values in column: guestroom
no     448
yes     97
Name: count, dtype: int64
Total count of values in column: basement
no     354
yes    191
Name: count, dtype: int64
Total count of values in column: hotwaterheating
no     520
yes     25
Name: count, dtype: int64
Total count of values in column: airconditioning
no     373
yes    172
Name: count, dtype: int64
Total count of values in column: prefarea
no     417
yes    128
Name: count, dtype: int64
Total count of values in column: furnishingstatus
semi-furnished    227
unfurnished       178
furnished         140
Name: count, dtype: int64


In [9]:
#Determine if there are any missing values:
housing.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [10]:
#Correlation Analysis
#I want to create a heat map, so I'll have to go ahead and create some dummies 
#loop through categorical columns to convert to dummies

#initialize dummy df
housing_dummies = pd.DataFrame()

#pull list of categorical data columns, to be used later....
categorical_cols = housing.select_dtypes(include=['object']).columns

for column in housing:
        #create dummy variables
        dummies = pd.get_dummies(housing[column], prefix=column, drop_first=True)
        #concatenate dummies to new dataframe
        housing_dummies = pd.concat([housing_dummies, dummies], axis=1)

print(housing_dummies.head())

   price_1767150  price_1820000  price_1855000  price_1890000  price_1960000  \
0          False          False          False          False          False   
1          False          False          False          False          False   
2          False          False          False          False          False   
3          False          False          False          False          False   
4          False          False          False          False          False   

   price_2100000  price_2135000  price_2233000  price_2240000  price_2275000  \
0          False          False          False          False          False   
1          False          False          False          False          False   
2          False          False          False          False          False   
3          False          False          False          False          False   
4          False          False          False          False          False   

   ...  guestroom_yes  basement_yes  h

In [11]:
#create new df
#set equal to housing to create a clone of original df
housing_final = housing
#drop categorical columns, concat dummies with final df
housing_final = pd.concat([housing_final.drop(categorical_cols, axis=1), housing_dummies], axis=1)

print(housing_final)

        price  area  bedrooms  bathrooms  stories  parking  price_1767150  \
0    13300000  7420         4          2        3        2          False   
1    12250000  8960         4          4        4        3          False   
2    12250000  9960         3          2        2        2          False   
3    12215000  7500         4          2        2        3          False   
4    11410000  7420         4          1        2        2          False   
..        ...   ...       ...        ...      ...      ...            ...   
540   1820000  3000         2          1        1        2          False   
541   1767150  2400         3          1        1        0           True   
542   1750000  3620         2          1        1        0          False   
543   1750000  2910         3          1        1        0          False   
544   1750000  3850         3          1        2        0          False   

     price_1820000  price_1855000  price_1890000  ...  guestroom_yes  \
0  

In [None]:
#Correlation Analysis
housing_corr_matrix = housing_final.corr()
sns.heatmap(housing_corr_matrix, annot=True, cmap='coolwarm')
plt.show()

In [None]:
#Since that doesn't want to work, we'll try creating scatter plots to look at correlation!
#lets confirm data types in the new table
print(housing_final)