In [1]:
# Import California housing database
from sklearn.datasets import fetch_california_housing

# Assign data to dictionary
data = fetch_california_housing()

# Create a print divider
SYMBOL = "-"
NUMSYM = 125
divider = ""
divrange = range(NUMSYM)
for i in divrange:
    divider += SYMBOL
    
# Print various info to see what is available
print("\n%s\nDATA KEYS:\n%s\n" % (divider, divider))
print(data.keys())


-----------------------------------------------------------------------------------------------------------------------------
DATA KEYS:
-----------------------------------------------------------------------------------------------------------------------------

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])


In [2]:
print("\n%s\nDATA DESCRIPTION:\n%s\n" % (divider, divider))
print(data.DESCR)


-----------------------------------------------------------------------------------------------------------------------------
DATA DESCRIPTION:
-----------------------------------------------------------------------------------------------------------------------------

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
htt

In [3]:
print("\n%s\nDATA ATTRIBUTES:\n%s\n" % (divider, divider))
print(data.feature_names)


-----------------------------------------------------------------------------------------------------------------------------
DATA ATTRIBUTES:
-----------------------------------------------------------------------------------------------------------------------------

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [4]:
print("\n%s\nDATA STRUCTURE:\n%s\n" % (divider, divider))

print("Dataset type: %s\n" % type(data.data))
print(data.data)

print("\nRow type: %s\n" % type(data.data[0]))
print(data.data[0])

print("\n%s %15.6f %15s %25s" % ("Row[0]:", data.data[0][0], "Type:", type(data.data[0][0])))
print("\n%s %15.6f %15s %25s" % ("Row[1]:", data.data[0][1], "Type:", type(data.data[0][1])))
print("\n%s %15.6f %15s %25s" % ("Row[2]:", data.data[0][2], "Type:", type(data.data[0][2])))
print("\n%s %15.6f %15s %25s" % ("Row[3]:", data.data[0][3], "Type:", type(data.data[0][3])))
print("\n%s %15.6f %15s %25s" % ("Row[4]:", data.data[0][4], "Type:", type(data.data[0][4])))
print("\n%s %15.6f %15s %25s" % ("Row[5]:", data.data[0][5], "Type:", type(data.data[0][5])))
print("\n%s %15.6f %15s %25s" % ("Row[6]:", data.data[0][6], "Type:", type(data.data[0][6])))
print("\n%s %15.6f %15s %25s" % ("Row[7]:", data.data[0][7], "Type:", type(data.data[0][7])))


-----------------------------------------------------------------------------------------------------------------------------
DATA STRUCTURE:
-----------------------------------------------------------------------------------------------------------------------------

Dataset type: <class 'numpy.ndarray'>

[[   8.3252       41.            6.98412698 ...    2.55555556
    37.88       -122.23      ]
 [   8.3014       21.            6.23813708 ...    2.10984183
    37.86       -122.22      ]
 [   7.2574       52.            8.28813559 ...    2.80225989
    37.85       -122.24      ]
 ...
 [   1.7          17.            5.20554273 ...    2.3256351
    39.43       -121.22      ]
 [   1.8672       18.            5.32951289 ...    2.12320917
    39.43       -121.32      ]
 [   2.3886       16.            5.25471698 ...    2.61698113
    39.37       -121.24      ]]

Row type: <class 'numpy.ndarray'>

[   8.3252       41.            6.98412698    1.02380952  322.
    2.55555556   37.88       

In [5]:
print("\n%s\nMIN, MAX, MEAN, MEDIAN, MODE:\n%s\n" % (divider, divider))

def getMin(lst):
    lst.sort()
    return lst[0]

def getMax(lst):
    lst.sort(reverse = True)
    return lst[0]

def getMean(lst):
    return sum(lst) / len(lst)
    
def getMedian(lst):
    lst.sort()
    if len(lst) % 2 == 1:
        index = int(len(lst) / 2)
        return lst[index]
    else:
        midR = int(len(lst) / 2)
        midL = midR - 1
        return ((lst[midL] + lst[midR]) / 2)

def getMode(lst):
    mode = 0.0
    d = dict()
    for element in lst:
        if element not in d:
            d[element] = 1
        else:
            d[element] = d[element] + 1

    keyList = list(d.keys())
    valList = list(d.values())
    valList.sort(reverse = True)
    
    for k in keyList:
        if d[k] == valList[0]:
            mode = float(k)
            break
                
    return mode

sheet = range(len(data.data))
MedianIncome = []
MedianHouseAge = []
AvgNumRooms = []
AvgNumBedrooms = []
Population = []
AvgOccupancy = []

for row in sheet:
    MedianIncome.append(float(data.data[row][0]))
    MedianHouseAge.append(float(data.data[row][1]))
    AvgNumRooms.append(float(data.data[row][2]))
    AvgNumBedrooms.append(float(data.data[row][3]))
    Population.append(float(data.data[row][4]))
    AvgOccupancy.append(float(data.data[row][5]))
    
print("\n%20s %10s %15.6f %10s %15.6f %10s %15.6f" % ("MedianIncome -->", "Min:", getMin(MedianIncome), "Mean:", getMean(MedianIncome), "Mode:", getMode(MedianIncome)))
print("\n%20s %10s %15.6f %10s %15.6f" % (" ", "Max:", getMax(MedianIncome), "Median:", getMedian(MedianIncome)))

print("\n%20s %10s %15.6f %10s %15.6f %10s %15.6f" % ("MedianHouseAge -->", "Min:", getMin(MedianHouseAge), "Mean:", getMean(MedianHouseAge), "Mode:", getMode(MedianHouseAge)))
print("\n%20s %10s %15.6f %10s %15.6f" % (" ", "Max:", getMax(MedianHouseAge), "Median:", getMedian(MedianHouseAge)))

print("\n%20s %10s %15.6f %10s %15.6f %10s %15.6f" % ("AvgNumRooms -->", "Min:", getMin(AvgNumRooms), "Mean:", getMean(AvgNumRooms), "Mode:", getMode(AvgNumRooms)))
print("\n%20s %10s %15.6f %10s %15.6f" % (" ", "Max:", getMax(AvgNumRooms), "Median:", getMedian(AvgNumRooms)))

print("\n%20s %10s %15.6f %10s %15.6f %10s %15.6f" % ("AvgNumBedrooms -->", "Min:", getMin(AvgNumBedrooms), "Mean:", getMean(AvgNumBedrooms), "Mode:", getMode(AvgNumBedrooms)))
print("\n%20s %10s %15.6f %10s %15.6f" % (" ", "Max:", getMax(AvgNumBedrooms), "Median:", getMedian(AvgNumBedrooms)))

print("\n%20s %10s %15.6f %10s %15.6f %10s %15.6f" % ("Population -->", "Min:", getMin(Population), "Mean:", getMean(Population), "Mode:", getMode(Population)))
print("\n%20s %10s %15.6f %10s %15.6f" % (" ", "Max:", getMax(Population), "Median:", getMedian(Population)))

print("\n%20s %10s %15.6f %10s %15.6f %10s %15.6f" % ("AvgOccupancy -->", "Min:", getMin(AvgOccupancy), "Mean:", getMean(AvgOccupancy), "Mode:", getMode(AvgOccupancy)))
print("\n%20s %10s %15.6f %10s %15.6f" % (" ", "Max:", getMax(AvgOccupancy), "Median:", getMedian(AvgOccupancy)))


-----------------------------------------------------------------------------------------------------------------------------
MIN, MAX, MEAN, MEDIAN, MODE:
-----------------------------------------------------------------------------------------------------------------------------


    MedianIncome -->       Min:        0.499900      Mean:        3.870671      Mode:        3.125000

                           Max:       15.000100    Median:        3.534800

  MedianHouseAge -->       Min:        1.000000      Mean:       28.639486      Mode:       52.000000

                           Max:       52.000000    Median:       29.000000

     AvgNumRooms -->       Min:        0.846154      Mean:        5.429000      Mode:        5.000000

                           Max:      141.909091    Median:        5.229129

  AvgNumBedrooms -->       Min:        0.333333      Mean:        1.096675      Mode:        1.000000

                           Max:       34.066667    Median:        1.048780


In [6]:
# Display Median House Age in Block

def histogram(lst):
    d = dict()
    for element in lst:
        if element not in d:
            d[element] = 1
        else:
            d[element] = d[element] + 1
    return d

def printHistDesc(histogram):
    keyList = list(histogram.keys())
    valList = list(histogram.values())
    valList.sort(reverse = True)
    
    print("\nMEDIAN HOUSE AGE IN BLOCK (LISTED FROM MOST TO LEAST COMMON):\n")
    print("%10s %10s\n" % ("Years", "Frequency"))
    
    for v in valList:
        for k in keyList:
            if histogram[k] == v:
                print("%10d %10d" % (int(k), int(v)))
        
hist = histogram(MedianHouseAge)
printHistDesc(hist)


MEDIAN HOUSE AGE IN BLOCK (LISTED FROM MOST TO LEAST COMMON):

     Years  Frequency

        52       1273
        36        862
        35        824
        16        771
        17        698
        34        689
        26        619
        33        615
        18        570
        25        566
        32        565
        37        537
        15        512
        19        502
        27        488
        24        478
        30        476
        28        471
        20        465
        29        461
        31        458
        23        448
        21        446
        14        412
        22        399
        38        394
        39        369
        42        368
        44        356
        43        353
        40        304
        13        302
        41        296
        45        294
        10        264
        11        254
        46        245
         5        244
        12        238
         8        206
         9        205
        47 

In [7]:
# Exporting data to Excel
PATH = "C:\california_housing.txt"

# Open file
file = open(PATH, "w")
print("\nFile opened: %s" % PATH)

# Prepare headers as Comma Separated Values (CSV)
headers = ""
for header in data.feature_names:
    headers += header
    headers += ","
    
headers += "\n"

# Write headers
file.write(headers)
print("\nHeaders: %s" % headers)

# Prepare data as CSV and write rows
rowCount = 0

for row in data.data:
    line = ""
    for element in row:
        line += str(element)
        line += ","
    
    line += "\n"
    file.write(line)
    rowCount += 1
    
print("Rows written: %d" % rowCount)

# Close file
file.close()
print("\nFile closed: %s" % PATH)


File opened: C:\california_housing.txt

Headers: MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,

Rows written: 20640

File closed: C:\california_housing.txt
