In [39]:
import pandas as pd
import numpy as np
pd.set_option('display.float_format', lambda x: '%.3f' % x)
from geopy.geoencoders import Nominatim 

# load dataset
data = pd.read_csv('datasets/kc_house_data.csv')

In [8]:
# data dimensions
print('Number of rows: {}'.format(data.shape[0] ))
print('Number of columns {}'.format(data.shape[0] ))

Number of rows: 21613
Number of columns 21613


In [9]:
# data types
data.dtypes

id                 int64
date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

In [10]:
# convert object to date
data['date'] = pd.to_datetime(data ['date'])

In [11]:
data.dtypes

id                        int64
date             datetime64[ns]
price                   float64
bedrooms                  int64
bathrooms               float64
sqft_living               int64
sqft_lot                  int64
floors                  float64
waterfront                int64
view                      int64
condition                 int64
grade                     int64
sqft_above                int64
sqft_basement             int64
yr_built                  int64
yr_renovated              int64
zipcode                   int64
lat                     float64
long                    float64
sqft_living15             int64
sqft_lot15                int64
dtype: object

In [41]:
# descriptive statistics
num_attributes = data.select_dtypes(include=['int64', 'float64'])

# central tendency - media, mediana
media = pd.DataFrame (num_attributes.apply(np.mean, axis=0) )
mediana = pd.DataFrame (num_attributes.apply(np.median, axis=0) )

# dispersion - std, min, max
std = pd.DataFrame (num_attributes.apply (np.std, axis=0) )
min_ = pd.DataFrame (num_attributes.apply (np.min, axis=0) )
max_ = pd.DataFrame (num_attributes.apply (np.max, axis=0) )

df1 = pd.concat ( [max_, min_, media, mediana, std], axis=1).reset_index()
df1.columns = ['attributes', 'maximo', 'minimo', 'media', 'mediana','std']

In [40]:
df1

Unnamed: 0,attributes,maximo,minimo,media,mediana,std
0,id,9900000190.0,1000102.0,4580301520.865,3904930410.0,2876499023.428
1,price,7700000.0,75000.0,540088.142,450000.0,367118.703
2,bedrooms,33.0,0.0,3.371,3.0,0.93
3,bathrooms,8.0,0.0,2.115,2.25,0.77
4,sqft_living,13540.0,290.0,2079.9,1910.0,918.42
5,sqft_lot,1651359.0,520.0,15106.968,7618.0,41419.553
6,floors,3.5,1.0,1.494,1.5,0.54
7,waterfront,1.0,0.0,0.008,0.0,0.087
8,view,4.0,0.0,0.234,0.0,0.766
9,condition,5.0,1.0,3.409,3.0,0.651
