In [46]:
# importing libraries
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt
import seaborn as sns

In [40]:
# load the house-pricing data; the 'Id' column becomes the row index
train = pd.read_csv(filepath_or_buffer='Data/train.csv', index_col='Id')

# held-out test set, used at the end to evaluate model efficiency
test = pd.read_csv(filepath_or_buffer='Data/test.csv', index_col='Id')

In [45]:
# examining data: display the (rows, columns) shape of the training frame
# disabled sanity check (is the target column present in each frame?):
# print('SalePrice' in train.columns, 'SalePrice' in test.columns, sep='\n')
train.shape

(1460, 80)

# Setting target and features
### Splitting dataset for training and validation purposes

In [43]:
# the 'SalePrice' column is the target for predictions
y = train['SalePrice']

# feature preparation: every column except the target
features = train.drop(columns=['SalePrice'])

# keep only the non-object (numerical) predictors
X = features.select_dtypes(exclude='object')

# hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [44]:
# numerical feature columns that hold at least one missing value in the training split
cols_with_missing = X_train_full.columns[X_train_full.isna().any()].tolist()
cols_with_missing

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [None]:
# define function for column imputation (missing values)
def imputation_dataset(X_train, X_valid, X_test):
    """Fill missing values in three feature sets using statistics learned from X_train.

    The imputer is created with scikit-learn's default settings and fitted on
    ``X_train`` only; the fitted imputer is then applied to ``X_valid`` and
    ``X_test`` so that no information from those sets influences the fill values.

    Parameters
    ----------
    X_train, X_valid, X_test : pandas.DataFrame
        Feature frames. ``X_valid`` and ``X_test`` must contain the same
        columns, in the same order, as ``X_train``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(imputed_X_train, imputed_X_valid, imputed_X_test)``, each carrying
        the column labels and row index of its corresponding input.
    """
    imputer = SimpleImputer()

    def _as_frame(values, like):
        # the imputer returns a bare array; put the original labels back so
        # rows still line up with the target and columns keep their names
        return pd.DataFrame(values, columns=like.columns, index=like.index)

    # fit on the training data only, then reuse the fitted imputer elsewhere
    imputed_X_train = _as_frame(imputer.fit_transform(X_train), X_train)
    imputed_X_valid = _as_frame(imputer.transform(X_valid), X_valid)
    imputed_X_test = _as_frame(imputer.transform(X_test), X_test)

    return imputed_X_train, imputed_X_valid, imputed_X_test

In [None]:
# imputing columns with missing values

In [13]:
# drop columns with missing values from the training and validation datasets
# (re-binding the names instead of using inplace=True avoids mutating the frames
# returned by the split, and errors='ignore' keeps the cell safe to re-run once
# the columns are already gone)
X_train_full = X_train_full.drop(columns=cols_with_missing, errors='ignore')
X_valid_full = X_valid_full.drop(columns=cols_with_missing, errors='ignore')

In [22]:
# preview the first rows of the training features
# NOTE(review): the saved output under this cell lists columns such as Rooms / Distance /
# Postcode, which do not match the frame loaded in this notebook — it looks stale;
# re-run with Restart Kernel -> Run All to confirm.
X_train_full.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


### Evaluating cardinality in categorical values

In [17]:
# retrieving categorical (object-dtype) columns with fewer than 10 distinct values
# NOTE(review): X is built with select_dtypes(exclude=['object']) earlier in this notebook,
# so X_train_full holds no object columns and this list comes out empty — confirm
# whether categorical features were meant to be kept.
low_cardinality_cols = [
    name
    for name in X_train_full.columns
    if X_train_full[name].dtype == "object" and X_train_full[name].nunique() < 10
]
low_cardinality_cols

[]

In [18]:
# collecting the numerical columns (int64 / float64 dtypes only)
num_cols = [
    name
    for name in X_train_full.columns
    if X_train_full[name].dtype in ('int64', 'float64')
]
num_cols

9

In [19]:
# final column selection: low-cardinality categoricals first, then the numerical columns
correct_cols = [*low_cardinality_cols, *num_cols]
correct_cols

['Rooms',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Landsize',
 'Lattitude',
 'Longtitude',
 'Propertycount']