# Final ML Project
## Daniel Bernal, Raymond Vuong, Rohit Punjani, and Neal Davar 


In [8]:
# import statements

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import sklearn.metrics as metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor


In [17]:
# Constants and other global variables:

# Number of IsolationForest fits performed by run_isolation_forest(); that
# function also uses it to turn its 10% outlier-frequency cutoff into a count.
IL_F_ITERATIONS = 50

In [9]:
# Clean the data by dropping the Id column, imputing missing values
# depending on the feature, dropping duplicates, and one-hot encoding.

# Columns treated as categorical and one-hot encoded by clean_data().
CATEGORICAL_FEATURES = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
    'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
    'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
    'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
    'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
    'MiscFeature', 'SaleType', 'SaleCondition',
]


def clean_data(training):
    """Return a cleaned, one-hot encoded copy of ``training``.

    Steps, in order:
      1. print the frame's shape and the per-column count of missing values;
      2. drop the ``Id`` column;
      3. fill missing ``MasVnrArea`` / ``LotFrontage`` with 0 and missing
         ``GarageYrBlt`` with the column mean;
      4. fill every remaining missing value with the string ``'None'``;
      5. drop duplicated rows;
      6. cast ``MSSubClass`` to ``str`` and one-hot encode the columns in
         ``CATEGORICAL_FEATURES`` (``drop_first=True``).

    Parameters
    ----------
    training : pandas.DataFrame
        Must contain ``Id``, ``MasVnrArea``, ``LotFrontage``, ``GarageYrBlt``
        and every column listed in ``CATEGORICAL_FEATURES``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the argument is left untouched so the cell can be
        re-run on the same raw data.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    print(training.shape)
    null_counts = training.isnull().sum()
    missing_features = null_counts[null_counts > 0]
    print(missing_features)

    # drop() returns a new frame, so nothing below touches the caller's data.
    cleaned = training.drop('Id', axis=1)

    # Assign the filled column back instead of `df[col].fillna(inplace=True)`:
    # the chained in-place form is deprecated and does nothing under pandas
    # copy-on-write.
    cleaned['MasVnrArea'] = cleaned['MasVnrArea'].fillna(0)
    cleaned['LotFrontage'] = cleaned['LotFrontage'].fillna(0)
    cleaned['GarageYrBlt'] = cleaned['GarageYrBlt'].fillna(
        cleaned['GarageYrBlt'].mean())
    # Whatever is still missing is filled with the literal string 'None'.
    cleaned = cleaned.fillna('None')

    # NOTE(review): keep=False removes *every* copy of a duplicated row, not
    # all-but-one. Behaviour preserved from the original; confirm intended.
    cleaned = cleaned.drop_duplicates(keep=False)

    # MSSubClass is a numeric code for a category, so store it as a string
    # before encoding.
    cleaned = cleaned.assign(MSSubClass=cleaned['MSSubClass'].astype(str))

    return pd.get_dummies(cleaned, columns=CATEGORICAL_FEATURES,
                          drop_first=True)

In [10]:
# Uses the isolation forest technique to find outliers across 50 iterations
# and removes records that are recorded as outliers > 10% of the time
def run_isolation_forest(training, random_state=None):
    """Drop rows that IsolationForest repeatedly flags as outliers, in place.

    Fits a fresh ``IsolationForest`` on ``training`` ``IL_F_ITERATIONS`` times
    and counts, per row, how often it is labelled ``-1`` (outlier). Rows
    flagged in more than 10% of the runs are removed from ``training``.

    Parameters
    ----------
    training : pandas.DataFrame
        Fully numeric feature frame. It is modified in place (rows dropped);
        no helper column is added to it.
    random_state : int or None, optional
        Seed for the sequence of forests. ``None`` (the default) keeps the
        original non-deterministic behaviour.

    Returns
    -------
    None
    """
    # One generator shared by all fits: each forest draws different trees,
    # and the whole sequence is reproducible when a seed is given.
    rng = np.random.RandomState(random_state)

    # Count per row position. The labels are kept out of `training` so that
    # a previous run's verdict is never fed back in as a feature and no
    # 'anomalies' column leaks into downstream steps.
    outlier_counts = np.zeros(len(training), dtype=int)
    for _ in range(IL_F_ITERATIONS):
        # fit_predict's second positional argument is `y`, which is ignored,
        # so the original's stray 0.5 is not passed.
        labels = IsolationForest(random_state=rng).fit_predict(training)
        outlier_counts += (labels == -1)

    # drop outliers that are detected as anomalies more than 10% of the time
    flagged = outlier_counts > (0.10 * IL_F_ITERATIONS)
    final_outlier_indices = training.index[flagged]
    print('Total # of Outliers: ')
    print(len(final_outlier_indices))
    print('Number of data points before outlier removal: ')
    print(len(training))
    training.drop(index=final_outlier_indices, inplace=True)
    print('Number of data points after outlier removal: ')
    print(len(training))


In [11]:
# function that normalizes the training and testing data:
def _bools_to_float(frame):
    """Return ``frame`` with boolean columns cast to float64 (others untouched)."""
    bool_cols = frame.select_dtypes(include='bool').columns
    if len(bool_cols) == 0:
        return frame
    return frame.astype({col: 'float64' for col in bool_cols})


def normalize_data(train, test):
    """Min-max scale ``train`` and ``test`` using the *training* statistics.

    Each column is mapped with ``(x - train_min) / (train_max - train_min)``,
    so training values land in [0, 1]. The test frame is scaled with the same
    per-column minimum and range, which keeps both frames on one scale; test
    values may therefore fall outside [0, 1].

    Parameters
    ----------
    train, test : pandas.DataFrame
        Numeric (or boolean) frames. Neither argument is modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_norm, test_norm)``. ``test_norm`` has exactly the columns of
        ``test``; a test column that does not exist in ``train`` comes back
        as all-NaN because there are no training statistics for it.
    """
    # One-hot columns may be boolean, and bool - bool is not defined.
    train_num = _bools_to_float(train)
    test_num = _bools_to_float(test)

    col_min = train_num.min()
    # A constant column has range 0; divide by 1 instead so it maps to 0
    # rather than 0/0 = NaN.
    col_range = (train_num.max() - col_min).replace(0, 1)

    train_norm = (train_num - col_min) / col_range
    # Align the training statistics to the test frame's own columns so no
    # extra columns are introduced by index alignment.
    test_norm = (test_num - col_min.reindex(test_num.columns)) \
        / col_range.reindex(test_num.columns)
    return train_norm, test_norm

In [12]:
# performs feature engineering by consolidating features,
# dropping unnecessary features, and more
def _consolidate_features(frame):
    """Add TotalBathrooms / TotalPorchSF to ``frame`` and drop their parts, in place."""
    bath_cols = ['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath']
    porch_cols = ['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']

    # Half baths count as 0.5 of a bathroom.
    frame['TotalBathrooms'] = \
        frame['FullBath'] + (0.5 * frame['HalfBath']) + \
        frame['BsmtFullBath'] + (0.5 * frame['BsmtHalfBath'])
    frame['TotalPorchSF'] = frame[porch_cols].sum(axis=1, skipna=False)

    # The parts are redundant once the totals exist. The drop has to be
    # in place: this function returns nothing, so a discarded `drop(...)`
    # result would leave the columns where they are.
    frame.drop(columns=bath_cols + porch_cols, inplace=True)


def feature_engineering(training, testing):
    """Consolidate bathroom and porch features on both frames, in place.

    For each frame this adds

    * ``TotalBathrooms`` = FullBath + 0.5*HalfBath + BsmtFullBath
      + 0.5*BsmtHalfBath
    * ``TotalPorchSF``   = OpenPorchSF + EnclosedPorch + 3SsnPorch
      + ScreenPorch

    and removes the eight source columns.

    Parameters
    ----------
    training : pandas.DataFrame
        Modified in place.
    testing : pandas.DataFrame or None
        Given the same treatment so both frames keep identical feature
        columns; pass ``None`` to skip it.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a frame is missing one of the eight source columns.
    """
    _consolidate_features(training)
    if testing is not None:
        _consolidate_features(testing)

 

In [13]:
# Run Decision Tree Regression on our Data: 
def do_decision_tree_regression(training, random_state=2):
    """Fit and evaluate decision-tree regressors for ``SalePrice``.

    Prints, in order:
      1. R2 and MSE of an untuned tree on a single 80/20 hold-out split;
      2. the best hyper-parameters found by a 10-fold grid search over
         ``max_depth``, ``min_samples_leaf`` and ``max_features``;
      3. the mean R2 of a nested cross-validation (5 outer folds, each
         re-running the 10-fold grid search on its training part).

    Parameters
    ----------
    training : pandas.DataFrame
        Must contain a ``SalePrice`` column; every other column is used as
        a feature.
    random_state : int, optional
        Seed for the hold-out split and for every tree, so repeated runs
        print the same numbers. Defaults to 2, the seed the baseline tree
        already used.

    Returns
    -------
    None
    """
    labels = training['SalePrice']
    features = training.drop('SalePrice', axis=1)

    # Baseline: one untuned tree on a single hold-out split.
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.20, random_state=random_state)
    dt = DecisionTreeRegressor(random_state=random_state)
    dt.fit(x_train, y_train)
    prediction = dt.predict(x_test)
    print("R2 Score No Crossval")
    print(metrics.r2_score(y_test, prediction))
    print("MSE")
    print(metrics.mean_squared_error(y_test, prediction))

    # Inner loop: hyper-parameter search. The tree is seeded because
    # max_features < n_features makes split selection random.
    parameters = {
        'max_depth': [5, 10, 15, 20, 30, 40],
        'min_samples_leaf': [5, 10, 15, 20, 30, 40],
        'max_features': [5, 10, 15, 30, 40],
    }
    grid = GridSearchCV(DecisionTreeRegressor(random_state=random_state),
                        param_grid=parameters, cv=10, scoring='r2')
    grid.fit(features, labels)
    print("Best parameters")
    print(grid.best_params_)

    # Outer loop: nested cross-validation of the whole tuning procedure.
    nested_scores = cross_val_score(grid, features, labels, cv=5)
    print("R2 with cross val")
    print(nested_scores.mean())

In [14]:
# run a linear regression model 
def runLinReg(train_norm):
    """Cross-validate an ordinary least-squares model for ``SalePrice``.

    Prints the shapes and first rows of the feature matrix and the label
    vector, then the mean 10-fold cross-validation score multiplied by 100.
    No ``scoring`` argument is passed to ``cross_val_score``, so the score is
    the estimator's own ``score`` method; for ``LinearRegression`` that is
    documented as the coefficient of determination (R2), not an accuracy.

    Parameters
    ----------
    train_norm : pandas.DataFrame
        Must contain a ``SalePrice`` column; every other column is used as
        a feature.

    Returns
    -------
    None
    """
    labels = train_norm['SalePrice']
    features = train_norm.drop('SalePrice', axis=1)
    print(features.shape)
    print(labels.shape)
    print(features.head())
    print(labels.head())

    linReg = LinearRegression()
    scores = cross_val_score(linReg, features, labels, cv=10)
    # Labelled as R2: this is a regression score, so "Accuracy" was wrong.
    print("Mean R2 (x100):", scores.mean()*100)


In [18]:

# Load the raw training and testing tables from CSV files in the working directory.
training = pd.read_csv("train.csv")
testing = pd.read_csv("housing_testing.csv")

# 1. Data Cleaning: impute, de-duplicate and one-hot encode the training set.
training = clean_data(training)
print(training.head())


# 2. Data Cleaning Pt. 2: remove outliers. The call returns nothing;
#    run_isolation_forest() drops the rows from `training` in place.
run_isolation_forest(training)


# 3. Normalization
# NOTE(review): `testing` is passed straight from read_csv -- only `training`
# went through clean_data() above -- so the two frames do not share the same
# columns here. Confirm whether the test set should be cleaned first.
train_norm, test_norm = normalize_data(training, testing)

# 4. Data Exploration:






(1460, 81)
LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64
   LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  YearRemodAdd  \
0         65.0     8450            7            5       2003          2003   
1         80.0     9600            6            8       1976          1976   
2         68.0    11250            7            5       2001          2002   
3         60.0     9550            7            5       1915          1970   
4         84.0    14260            8            5       2000          2000   

   MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  ...  SaleType_ConLI  \
0       196.0         706          