### import libraries

In [None]:
import matplotlib 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.linear_model as linear_model

### datapath define

In [None]:
import os
# Folder containing the Kaggle competition CSVs. The trailing slash is kept
# because later cells build file paths by plain string concatenation.
datapath = "../input/house-prices-advanced-regression-techniques/"


### import data

In [None]:
# oecd_bli = pd.read_csv(datapath + "oecd_bli_2015.csv", thousands=",");
# gdp_data =  pd.read_csv(datapath + "gdp_per_capita.csv", thousands=",", delimiter='\t', encoding='latin1', na_values="n/a");
# Load the competition files in one pass: the labelled training data, the
# unlabelled test data and the sample submission.
housing, testdf, sampleSubmissiondf = (
    pd.read_csv(datapath + file_name)
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)

### load the data

In [None]:
# Show every column when printing wide frames instead of eliding the middle.
pd.set_option('display.max_columns', None)

print(housing.head())
print(housing.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line.
housing.info()

### Build a quick Baseline

In [None]:
from sklearn.ensemble import RandomForestRegressor


# Baseline: crude integer encoding of every column and an untuned forest.

# Target vector.
y = housing.SalePrice

# Feature matrix: every column except the target, with missing values
# replaced by a sentinel (drop() already returns a new frame).
X = housing.drop('SalePrice', axis=1).fillna(-999)

# Replace each object-dtype column by its integer codes.
object_columns = housing.columns[housing.dtypes == 'object']
for col in object_columns:
    codes, _ = X[col].factorize()
    X[col] = codes

rf = RandomForestRegressor()
rf.fit(X, y)

In [None]:
# Importance scores of the fitted baseline forest; plotted against the columns of X in the next cell.
rf.feature_importances_

In [None]:
# Line plot of the baseline forest's feature importances, one tick per column.
feature_labels = list(X.columns)
tick_positions = np.arange(len(feature_labels))

plt.figure(figsize=(20, 10))
plt.grid(True)
plt.plot(rf.feature_importances_)
plt.xticks(tick_positions, feature_labels, rotation=90)

### plot data

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# One 50-bin histogram per column of the encoded training features.
X.hist(figsize=(20, 15), bins=50)
plt.show()

### Utility to print variance and mean

In [None]:
def print_var(column_names_low, housing1):
    """Print summary statistics for selected columns of a DataFrame.

    For every requested column this prints its value counts and the number
    of non-null entries. Numeric columns additionally get their mean and
    variance; any other column gets its dtype instead.

    Args:
        column_names_low: Iterable of column labels to report on.
        housing1: DataFrame that contains those columns.
    """
    for column in column_names_low:
        series = housing1[column]
        print(series.value_counts())
        print(series.count())
        # Test for "numeric" rather than "not object": categorical, string
        # and datetime columns are not object dtype either, yet they have
        # no variance and would raise here.
        if pd.api.types.is_numeric_dtype(series.dtype):
            print(series.mean())
            print(series.var())
        else:
            print("Type:", series.dtype)
  

### explore some features


In [None]:
# Columns that are left out of the feature lists when the preprocessing
# pipeline is built later in this notebook.
column_names_low = [
    "Functional",
    "Alley",
    "Fence",
    "ExterCond",
    "PoolQC",
    "Utilities",
    "MiscFeature",
    "Id",
    "SalePrice",
]

# Columns whose statistics are printed for both the training and test data.
column_names_high = [
    "OverallQual",
    "GrLivArea",
    "2ndFlrSF",
    "1stFlrSF",
    "TotalBsmtSF",
    "BsmtFinSF1",
    "MasVnrArea",
    "YearBuilt",
    "YearRemodAdd",
    "Neighborhood",
    "LotArea",
    "LotFrontage",
]

for frame in (housing, testdf):
    print_var(column_names_high, frame)


In [None]:
# Explore the test data with the same crude encoding as the baseline:
# missing values become a sentinel, object-dtype columns become integer codes.
test = testdf.fillna(-999)

text_columns = testdf.columns[testdf.dtypes == 'object']
for col in text_columns:
    codes, _ = test[col].factorize()
    test[col] = codes

%matplotlib inline
import matplotlib.pyplot as plt
# One 50-bin histogram per column of the encoded test data.
test.hist(figsize=(20, 15), bins=50)
plt.show()

### explore whether any columns have (almost) no variance

In [None]:
# Report the numeric test columns whose standard deviation is below 0.2.
cols = test.select_dtypes([np.number]).columns
std = test[cols].std()
is_low_spread = std.abs() < 0.2
cols_to_drop = std[is_low_spread].index
print_var(cols_to_drop, test)

#### Test random forest

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the forest bound to `rf`, scored with the
# "neg_mean_squared_error" metric.
scores_rf = cross_val_score(
    rf,
    X,
    y,
    cv=10,
    scoring="neg_mean_squared_error",
)
print(scores_rf)

### split the data into training and test

In [None]:
import numpy as np

# Seed NumPy's global generator so the shuffle below is repeatable.
np.random.seed(42)


def split_train_test(data, test_ratio):
    """Randomly partition ``data`` into a training part and a test part.

    Args:
        data: DataFrame to split.
        test_ratio: Fraction of the rows assigned to the test part; the
            resulting row count is truncated to an integer.

    Returns:
        A ``(train, test)`` tuple of DataFrames selected by position.
    """
    n_rows = len(data)
    shuffled_indices = np.random.permutation(n_rows)
    n_test = int(n_rows * test_ratio)
    # The first n_test shuffled positions form the test part, the rest train.
    test_indices, train_indices = np.split(shuffled_indices, [n_test])
    # Diagnostic output: the full permutation followed by both index sets.
    for index_array in (shuffled_indices, test_indices, train_indices):
        print(index_array)
    train_part = data.iloc[train_indices]
    test_part = data.iloc[test_indices]
    return train_part, test_part

### call data splitter

In [None]:
# Hold out 20% of the rows with the hand-written splitter and report sizes.
train_set, test_set = split_train_test(housing, test_ratio=0.2)
print(len(train_set), "train + ", len(test_set), "test")

#### scikit train test split

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split via scikit-learn with a fixed seed; this rebinds train_set/test_set, replacing the manual split.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

### Looking for correlations

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots for a handful of columns from the training file.
attributes = ["OverallQual", "GrLivArea", "2ndFlrSF", "1stFlrSF"]
selected_columns = housing[attributes]

scatter_matrix(selected_columns, figsize=(12, 8))

In [None]:
# Correlate numeric columns only: the frame also holds text columns, and
# recent pandas versions raise on them instead of silently skipping them.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()

### convert to clean training set

In [None]:

# Separate the predictors from the target for both splits. The labels are
# copied so they are independent of the source frames.
train_labels = train_set["SalePrice"].copy()
train_features = train_set.drop(columns="SalePrice")

test_labels = test_set["SalePrice"].copy()
test_features = test_set.drop(columns="SalePrice")

In [None]:
# Preview the first rows of the training split.
train_set.head()

## Data Cleaning

### Transformation Pipelines

### DataFrameSelector class

In [None]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer 

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that extracts chosen DataFrame columns as an array.

    Args:
        attribute_names: Column label(s) used to index the incoming frame.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[attribute_names]`` through its ``.values`` attribute."""
        selected = X[self.attribute_names]
        return selected.values

### Custom OneHotEncoder

In [None]:
class CustomEncoder(BaseEstimator, TransformerMixin):
    """One-hot encoder built on ``pandas.get_dummies``.

    Args:
        n_values: Stored on the instance but not read by any method here.
    """

    def __init__(self, n_values='auto'):
        self.n_values = n_values

    def fit(self, X, y=None):
        """Nothing to learn; return the encoder itself."""
        return self

    def transform(self, X):
        """Return indicator columns for ``list(X)``, plus one for missing values.

        NOTE(review): ``list(X)`` iterates ``X``; for a DataFrame that yields
        its column labels rather than its values - confirm the intended input.
        """
        values = list(X)
        return pd.get_dummies(values, dummy_na=True)

### Preprocessing pipelines (numeric and categorical features)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor


# Feature lists: every training column except those in column_names_low.
# They are built in DataFrame column order; the earlier set arithmetic gave
# an arbitrary order that could differ between interpreter runs and thereby
# change the column layout the model is trained on.
excluded_columns = set(column_names_low)
cat_features = [
    col
    for col in train_features.columns[train_features.dtypes == 'object']
    if col not in excluded_columns
]
# Public equivalent of the private DataFrame._get_numeric_data() helper.
num_features = [
    col
    for col in train_features.select_dtypes(include=[np.number, 'bool']).columns
    if col not in excluded_columns
]

# Hand-picked alternatives kept for experimentation:
# cat_features = ["Neighborhood"]
# num_features = ['OverallQual', 'GrLivArea', '2ndFlrSF', '1stFlrSF',
#                 'TotalBsmtSF', 'BsmtFinSF1', 'MasVnrArea', 'YearBuilt',
#                 'YearRemodAdd', 'LotArea', 'LotFrontage']

# Numeric columns: fill gaps with the median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Dense output is requested on the ColumnTransformer below rather than
    # here: the encoder's own keyword for it was renamed between
    # scikit-learn releases (sparse -> sparse_output).
    ('labelBinarizer', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features),
    ],
    # 0 means the combined result is never stacked as a sparse matrix.
    sparse_threshold=0,
)

fullpipeline = Pipeline([
    ("preprocessor", preprocessor),
])

   ### Random Hyperparameter Grid

In [None]:

from pprint import pprint

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Look at parameters used by our current forest
rf = RandomForestRegressor(random_state=42)
print('Parameters currently in use:\n')
pprint(rf.get_params())

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split. 1.0 (all features) is what
# 'auto' meant for a regressor; the 'auto' alias itself is rejected by
# recent scikit-learn releases.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree (None = unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}
pprint(random_grid)
# Expected output:
# {'bootstrap': [True, False],
#  'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
#  'max_features': [1.0, 'sqrt'],
#  'min_samples_leaf': [1, 2, 4],
#  'min_samples_split': [2, 5, 10],
#  'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

### Random Search Training


In [None]:
# Fit the preprocessing pipeline on the training features and transform them.
train_features_transformed = fullpipeline.fit_transform(train_features)

# Base model to tune. It is seeded like the other forests in this notebook so
# that the search results can be reproduced from run to run.
rf = RandomForestRegressor(random_state=42)
# Random search over the grid: 100 sampled parameter combinations, 3-fold
# cross-validation, all available cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model
rf_random.fit(train_features_transformed, train_labels)

# Select and Train the model

### Best Model

In [None]:
# Transform the training features again with a freshly fitted pipeline.
train_features_transformed = fullpipeline.fit_transform(train_features)
# A bare expression is only echoed when it is the last statement of a cell,
# so print the winning estimator explicitly.
print(rf_random.best_estimator_)


In [None]:
def evaluate(model, test_features, test_labels):
    """Report the mean absolute error and a MAPE-based accuracy of a model.

    Args:
        model: Fitted estimator exposing ``predict``.
        test_features: Feature matrix passed to ``model.predict``.
        test_labels: True target values; must be non-zero because each error
            is divided by its label.

    Returns:
        ``100 - MAPE``, where MAPE is the mean absolute percentage error.
    """
    predictions = model.predict(test_features)
    errors = np.abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    # The error is in the units of the target; the former "degrees" suffix
    # did not describe anything this function computes.
    print('Average Error: {:0.4f}.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy
# Reference point: a small, untuned forest trained on the same features.
base_model = RandomForestRegressor(n_estimators=10, random_state=42)
base_model.fit(train_features_transformed, train_labels)

# Apply the already-fitted preprocessing to the held-out rows.
test_features_transformed = fullpipeline.transform(test_features)
base_accuracy = evaluate(base_model, test_features_transformed, test_labels)

# Score the best model found by the random search on the same rows.
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, test_features_transformed, test_labels)

# Relative change of the tuned model over the reference, in percent.
relative_gain = 100 * (random_accuracy - base_accuracy) / base_accuracy
print('Improvement of {:0.2f}%.'.format(relative_gain))

In [None]:
# rmse_rf = np.sqrt(-scores_rf)
# display_scores(rmse_rf)

# generate submission like sample submission

In [None]:
# Reload the sample submission and print its first rows.
sample_submission_path = datapath + "sample_submission.csv"
sampleSubmissiondf = pd.read_csv(sample_submission_path)
print(sampleSubmissiondf.head())

# Reload the raw test data and preview it.
test_path = datapath + "test.csv"
testdf = pd.read_csv(test_path)
testdf.head()


In [None]:
# Ids of the rows to predict, in file order.
IDArr = testdf['Id'].values

# RandomizedSearchCV already refits the best estimator on the training data
# (refit=True is its default), so it is used as-is. Fitting it a second time
# was redundant work and, for an unseeded forest, produced a different model
# from the one scored in the evaluation step.
clf = best_random
final_test_predictions = clf.predict(fullpipeline.transform(testdf))


#### generate CSV

In [None]:
# Assemble the two-column submission table and write it without the index.
submission_columns = {'Id': IDArr, 'SalePrice': final_test_predictions}
df = pd.DataFrame(submission_columns)
df.to_csv('results.csv', index=False)

In [None]:
# Summary statistics of the submission table's columns.
df.describe()