# #1: Import California housing data
## https://github.com/ageron/handson-ml/blob/master/datasets/housing/housing.csv
### Setup from book

In [491]:
import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [492]:
fetch_housing_data()

In [493]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

# #2: Details about data

In [494]:
df = load_housing_data()
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [495]:
# Details about dataframe
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [496]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [497]:
# Find missing values count in rows for each column
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [498]:
# Check replaced missing value type for row #1097 with missing value
df.iloc[1097]

longitude            -121.77
latitude               39.66
housing_median_age        20
total_rooms             3759
total_bedrooms           NaN
population              1705
households               600
median_income          4.712
median_house_value    158600
ocean_proximity       INLAND
Name: 1097, dtype: object

# #3: Replace mean for missing value

In [499]:
# Find mean and assign variable
mean = df['total_bedrooms'].mean()
mean

537.8705525375618

In [500]:
# Fill missing values (NaN) with mean
df['total_bedrooms'].fillna(mean, inplace=True)

In [501]:
# Check to make sure mean has been replaced in missing value row #1097
df.iloc[1097]

longitude             -121.77
latitude                39.66
housing_median_age         20
total_rooms              3759
total_bedrooms        537.871
population               1705
households                600
median_income           4.712
median_house_value     158600
ocean_proximity        INLAND
Name: 1097, dtype: object

# #4: Replace spaces in string values 
## Column: ocean_proximity

In [502]:
df["ocean_proximity"].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [503]:
# Replace space with underscore for <1H OCEAN
df["ocean_proximity"].replace("<1H OCEAN", "1H_OCEAN",inplace=True)

In [504]:
# Replace space with underscore for NEAR OCEAN
df["ocean_proximity"].replace("NEAR OCEAN", "NEAR_OCEAN", inplace=True)

In [505]:
# Replace space with underscore for NEAR BAY
df["ocean_proximity"].replace("NEAR BAY", "NEAR_BAY",inplace=True)

In [506]:
# Check for replacements
df["ocean_proximity"].value_counts()

1H_OCEAN      9136
INLAND        6551
NEAR_OCEAN    2658
NEAR_BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

# #5: Export dataframe to arff format

In [507]:
# Create a clean data csv without header and index
df.to_csv("clean_housingdata.csv", header=False, index=False)

In [508]:
# New clean df without header
df_clean = pd.read_csv("clean_housingdata.csv")

In [511]:
# Check data type names
df.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

In [512]:
# Write .arff file
f = open("mahmud_californiahousingdata-cleaned.arff","w")

add2arff = []

# Append Title as comment (%) and @relation
add2arff.append("% TITLE: California Housing Data (Cleaned)" 
                +"\n\n"
                +"@relation californiahousingdata"
                +"\n\n")

# FOR loop through column names
for i in range(df.shape[1]):
    
    # IF clause for OBJECT ocean_proximity data type
    if df.dtypes[i]=='O': 
        
        # Get all value names with for loop
        names = [str(_i) for _i in np.unique(df.iloc[:,i])] 
        
        # Format to join, replacee with commas, add to string with curly braces 
        names = ",".join(names)
        names = names.replace("[","")
        names = names.replace("]","")
        strings = " {" + names +"}"
        
        # Append to add2arff list
        add2arff.append("@attribute " + df.columns[i] + strings + "\n")
    
    # ELSE clause for all other FLOAT64 data types
    else:
        
        # Append to add2arff list with formatting
        add2arff.append("@attribute " + df.columns[i] + " NUMERIC\n")
        
# Append data header @data   
add2arff.append("\n@data\n")

# FOR loop for each instance
for i in range(df_clean.shape[0]):
    
    datastrings = ""
    
    # FOR loop for each feature
    for j in range(df.shape[1]):
        
            # IF clause to convert objects as strings
            if df_clean.dtypes[j]=='O':
                datastrings += "\"" + str(df_clean.iloc[i,j]) + "\""
            
            # ELSE clause for numerics
            else:
                datastrings += str(df_clean.iloc[i,j])
            
            # Add comma IF not last feature
            if j!=df_clean.shape[1]-1:
                datastrings += ","
    
    # ADD and APPEND            
    datastrings +="\n"
    add2arff.append(datastrings)
    
# Write the master list into the file and close
f.writelines(add2arff)
f.close()

# 6: WEKA accuracy results

## Classifier: Random Forest

### === Run information ===

Scheme:       weka.classifiers.trees.RandomForest -P 100 -I 100 -num-slots 1 -K 0 -M 1.0 -V 0.001 -S 1
Relation:     californiahousingdata
Instances:    20639
Attributes:   10
              longitude
              latitude
              housing_median_age
              total_rooms
              total_bedrooms
              population
              households
              median_income
              median_house_value
              ocean_proximity
Test mode:    10-fold cross-validation

### === Classifier model (full training set) ===

RandomForest

Bagging with 100 iterations and base learner

weka.classifiers.trees.RandomTree -K 0 -M 1.0 -V 0.001 -S 1 -do-not-check-capabilities

Time taken to build model: 5.53 seconds

### === Stratified cross-validation ===
### === Summary ===

Correctly Classified Instances       20258               98.154  %
Incorrectly Classified Instances       381                1.846  %
Kappa statistic                          0.9726
Mean absolute error                      0.0226
Root mean squared error                  0.0833
Relative absolute error                  8.3742 %
Root relative squared error             22.6904 %
Total Number of Instances            20639     

### === Detailed Accuracy By Class ===

                 TP Rate  FP Rate  Precision  Recall   F-Measure  MCC      ROC Area  PRC Area  Class
                 0.984    0.016    0.979      0.984    0.982      0.967    0.998     0.998     1H_OCEAN
                 0.992    0.004    0.991      0.992    0.991      0.987    1.000     1.000     INLAND
                 1.000    0.000    1.000      1.000    1.000      1.000    1.000     1.000     ISLAND
                 0.990    0.002    0.984      0.990    0.987      0.985    1.000     0.999     NEAR_BAY
                 0.942    0.005    0.963      0.942    0.952      0.945    0.997     0.989     NEAR_OCEAN
Weighted Avg.    0.982    0.010    0.981      0.982    0.981      0.973    0.999     0.997     

### === Confusion Matrix ===

    a    b    c    d    e   <-- classified as
 8989   56    0   12   79 |    a = 1H_OCEAN
   54 6496    0    1    0 |    b = INLAND
    0    0    5    0    0 |    c = ISLAND
    4    2    0 2265   18 |    d = NEAR_BAY
  131    1    0   23 2503 |    e = NEAR_OCEAN

## Regressor: Linear Regression

### === Run information ===

Scheme:       weka.classifiers.functions.LinearRegression -S 0 -R 1.0E-8 -num-decimal-places 4
Relation:     californiahousingdata
Instances:    20639
Attributes:   10
              longitude
              latitude
              housing_median_age
              total_rooms
              total_bedrooms
              population
              households
              median_income
              median_house_value
              ocean_proximity
Test mode:    10-fold cross-validation

### === Classifier model (full training set) ===


Linear Regression Model

housing_median_age =

     -1.5438 * longitude +
     -1.5796 * latitude +
     -0.0008 * total_rooms +
     -0.0071 * total_bedrooms +
      0.0006 * population +
     -2.047  * median_income +
      0      * median_house_value +
      1.9638 * ocean_proximity=1H_OCEAN,NEAR_OCEAN,NEAR_BAY,ISLAND +
     -1.1064 * ocean_proximity=NEAR_OCEAN,NEAR_BAY,ISLAND +
      8.5964 * ocean_proximity=NEAR_BAY,ISLAND +
    -93.8906

Time taken to build model: 0.24 seconds

### === Cross-validation ===
### === Summary ===

Correlation coefficient                  0.5124
Mean absolute error                      8.7859
Root mean squared error                 10.8073
Relative absolute error                 83.2631 %
Root relative squared error             85.8672 %
Total Number of Instances            20639 

# 7: RapidMiner Results

### Logistic Regression	

#### accuracy: 80.06% +/- 0.24% (micro average: 80.06%)

ConfusionMatrix:
True:	NEAR BAY	<1H OCEAN	INLAND	NEAR OCEAN	ISLAND
NEAR BAY:	523	145	7	137	0
<1H OCEAN:	114	2373	100	519	1
INLAND:	19	41	1758	37	0
NEAR OCEAN:	0	55	1	67	0
ISLAND:	0	0	0	0	0

#### classification_error: 19.94% +/- 0.24% (micro average: 19.94%)

ConfusionMatrix:
True:	NEAR BAY	<1H OCEAN	INLAND	NEAR OCEAN	ISLAND
NEAR BAY:	523	145	7	137	0
<1H OCEAN:	114	2373	100	519	1
INLAND:	19	41	1758	37	0
NEAR OCEAN:	0	55	1	67	0
ISLAND:	0	0	0	0	0

### Random Forest	

#### accuracy: 78.40% +/- 0.31% (micro average: 78.40%)
ConfusionMatrix:
True:	NEAR BAY	<1H OCEAN	INLAND	NEAR OCEAN	ISLAND
NEAR BAY:	652	28	10	139	0
<1H OCEAN:	0	2554	665	394	0
INLAND:	0	21	1197	5	0
NEAR OCEAN:	0	12	0	220	0
ISLAND:	0	0	0	0	0

#### classification_error: 21.60% +/- 0.31% (micro average: 21.60%)
ConfusionMatrix:
True:	NEAR BAY	<1H OCEAN	INLAND	NEAR OCEAN	ISLAND
NEAR BAY:	652	28	10	139	0
<1H OCEAN:	0	2554	665	394	0
INLAND:	0	21	1197	5	0
NEAR OCEAN:	0	12	0	220	0
ISLAND:	0	0	0	0	0
