# Bulldozers price prediction

### Imports

In [1]:
# Turn off Jedi-based tab completion in the IPython completer
%config Completer.use_jedi = False
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [29]:
import pandas as pd
import numpy as np
from fastai.tabular.all import add_datepart, cont_cat_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score 

#### Function to display more rows and columns

In [3]:
def display_all(df, max_rows=1000, max_columns=1000):
    """Display ``df`` with pandas' row/column display limits temporarily raised.

    Parameters
    ----------
    df
        Object to render (DataFrames in this notebook).
    max_rows : int, default 1000
        Value used for the ``display.max_rows`` option while rendering.
    max_columns : int, default 1000
        Value used for the ``display.max_columns`` option while rendering.

    Notes
    -----
    ``display`` is not imported in this file; it is expected to be provided
    by the IPython/Jupyter session the notebook runs in.
    """
    # The options are only changed inside the ``with`` block, so the global
    # pandas display settings are left as they were afterwards.
    with pd.option_context(
        "display.max_rows", max_rows,
        "display.max_columns", max_columns,
    ):
        display(df)

### Loading datasets

In [4]:
# Training data ships as a zipped CSV; the sale date is parsed while loading
train = pd.read_csv(
    '../input/bluebook-for-bulldozers/Train.zip',
    compression='zip',
    parse_dates=['saledate'],
    low_memory=False,
)
# Validation features, loaded with the same date parsing
valid = pd.read_csv(
    '../input/bluebook-for-bulldozers/Valid.csv',
    parse_dates=['saledate'],
    low_memory=False,
)
# Validation targets: only the SalePrice column of the solution file is read
# NOTE(review): rows are assumed to be in the same order as Valid.csv - confirm
y_valid = pd.read_csv(
    '../input/bluebook-for-bulldozers/ValidSolution.csv',
    usecols=['SalePrice'],
)

In [7]:
# Show the first rows transposed, so every column appears as a row
display_all(train.head().T)

In [6]:
# (rows, columns) of the raw training set
train.shape

### Storing the SalePrice column, that is the one we want to predict, in a separate variable and removing it from the train dataset

In [8]:
# pop() hands back the SalePrice column and removes it from train in one step
y_train = train.pop('SalePrice')

In [11]:
# Function to avoid repeating code
def per_col(df, funct, apply=False):
    """Call ``funct`` on every column of ``df``.

    With ``apply=False`` (default) ``funct`` is called only for its side
    effects and ``None`` is returned. With ``apply=True`` each column is
    replaced in place by ``funct``'s result and ``df`` itself is returned.
    """
    if not apply:
        for name in df.columns:
            funct(df[name])
        return None

    for name in df.columns:
        df[name] = funct(df[name])
    return df

Quick overview of the values in each column

In [12]:
# Print the frequency table of every training column, counting NaN as a value
per_col(train, lambda x: print(x.value_counts(dropna=False)))

## Replacing 'None or Unspecified' values with NaN

In [13]:
# Columns of the training set in which the placeholder string occurs at least once
none_unsp_string = [
    col
    for col in train.columns
    if 'None or Unspecified' in train[col].value_counts().index
]

# Overwrite the placeholder with NaN in both frames. The boolean mask only
# covers the selected columns; cells outside of it are left untouched.
for frame in (train, valid):
    placeholder_mask = frame[none_unsp_string] == 'None or Unspecified'
    frame[placeholder_mask] = np.nan

In [14]:
# Re-print the value counts (NaN included) of the columns that were cleaned
for col_name in none_unsp_string:
    counts = train[col_name].value_counts(dropna=False)
    print(counts)

### Dropping columns with more than 80% of null values in the training set

In [15]:
# Remember the column set before dropping so the removed columns can be derived later
full_cols = train.columns
# Minimum number of non-null values a column needs to be kept (20% of the rows)
n_rows = len(train) * 0.2
train.dropna(axis='columns', thresh=n_rows, inplace=True)

In [16]:
# Dropping the same columns in the validation set
# (columns that were present before the dropna above but are gone from train now)
dropped_cols = [col for col in full_cols if col not in train.columns]
valid.drop(columns=dropped_cols, inplace=True)

Remaining columns

In [17]:
# From 53 to 24 columns
# Remaining training columns, followed by how many there are
print(train.columns)
print(len(train.columns))

In [18]:
# Remaining validation columns, followed by how many there are
print(valid.columns)
print(len(valid.columns))

In [19]:
# First rows of the reduced training set, transposed (one row per column)
train.head().T

In [20]:
# Checking the fraction of null values per column
# (null count divided by the number of training rows, i.e. a value between 0 and 1)
per_col(train, lambda x: print(x.isnull().sum()/train.shape[0]))

## Cleaning Categorical Columns
* Date Column: \
Extracting information from the saledate column with the add_datepart function, which creates columns such as saleYear, saleMonth, etc. After this process the saledate column is dropped, so we no longer have any column with a date data type.

In [21]:
# Expand 'saledate' into date-part feature columns for both frames.
# The return values are not assigned: the frames are relied on to be modified in place
# (train and valid are used directly in the following cells).
add_datepart(train, 'saledate')
add_datepart(valid, 'saledate')

In [22]:
def null_columns(df):
    """Add a boolean ``<column>_isnull`` indicator for every column of ``df``.

    The indicators record which cells were missing before any imputing or
    encoding takes place. An indicator is created for every column, whether
    or not it contains nulls, so two frames with the same columns end up
    with the same set of indicator columns.

    The frame is modified in place and also returned.

    Calling the function again on an already processed frame is safe:
    columns whose name already ends in ``_isnull`` are treated as indicators
    and do not get an indicator of their own; the existing indicators are
    simply recomputed from their source columns.
    """
    suffix = '_isnull'
    # Snapshot the source columns first: the loop below adds columns to df.
    # Skipping names that already carry the suffix keeps re-runs of the
    # notebook cell from producing '<col>_isnull_isnull' columns.
    source_cols = [col for col in df.columns if not str(col).endswith(suffix)]
    for col in source_cols:
        # f-string instead of `col + suffix` so non-string labels work too
        df[f'{col}{suffix}'] = df[col].isnull()
    return df

In [23]:
# Add the *_isnull indicator columns to the training set, then list the resulting columns
train = null_columns(train)
train.columns

In [24]:
# Same indicator columns for the validation set, so both frames keep matching columns
valid = null_columns(valid)

### Creating Imputers

In [25]:
# Imputer for continuous values: missing entries are replaced with the column mean
cont_imputer = SimpleImputer(strategy='mean')

# Imputer for categorical values: missing entries are replaced with the most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')

### Defining an encoder to the categorical values

In [26]:
# Encode categories as ordinal codes; categories not seen during fit are encoded as -1
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

### Creating a pipeline with all the preprocessing of categorical columns

In [27]:
# Categorical preprocessing: impute missing values first, then ordinal-encode the result
cat_preprocessor = Pipeline(steps=[
    ('imputer', cat_imputer),
    ('encoder', ordinal_encoder)
])

### Getting categorical and continuous columns

In [30]:
# Split the training columns into continuous and categorical name lists
# (the *_isnull indicator columns added earlier are part of train at this point)
cont_cols, cat_cols = cont_cat_split(train)
# NOTE(review): the printed label below contains a typo ('Continuosu'); left as-is here
print('Continuosu columns: ', cont_cols, end='\n\n')
print('Categorical columns: ', cat_cols)
train.shape

### Applying Imputers to DataFrame

In [31]:
# Combining the imputers into a column transformer and specifying
# the columns that each of them needs to target:
#   - continuous columns go through the mean imputer
#   - categorical columns go through the impute + ordinal-encode pipeline
# n_jobs=-1 will use all the CPU cores available
preprocessing = ColumnTransformer(n_jobs=-1,
    transformers=[
    ('cont_imp', cont_imputer, cont_cols),
    ('cat_imp', cat_preprocessor, cat_cols)
])

### Defining normalizer to scale the data between 0 and 1
This may prevent the model from treating one column as more important than another purely because of its scale, although this matters little for a Random Forest model, which is the one I am going to use.

In [32]:
# Default range of normalization is 0-1 (no feature_range is passed)
normalizer = MinMaxScaler()

### RandomForestRegressor as machine learning model

In [33]:
# Random forest with default hyperparameters, trained using all available CPU cores.
# NOTE(review): no random_state is passed, so scores may differ between runs - confirm whether a fixed seed is wanted
model = RandomForestRegressor(n_jobs=-1)

### Defining the main pipeline with the transformation, normalization and model

In [34]:
# End-to-end pipeline; the steps run in the order listed:
#   1. column_transformer - imputing / encoding per column group
#   2. normalizer         - min-max scaling of the transformed features
#   3. model              - the random forest regressor
main_pipeline = Pipeline(steps=[
    ('column_transformer', preprocessing),
    ('normalizer', normalizer),
    ('model', model)
])
# Fit every step on the training features and the SalePrice target
main_pipeline.fit(train, y_train)

### Making the predictions to the validation set

In [35]:
# Run the validation set through the fitted preprocessing steps and the model
predictions = main_pipeline.predict(valid)

### Evaluating the model (R^2)

In [36]:
# Score of the fitted pipeline on the validation set (R^2, per the section heading).
# y_valid is the single-column SalePrice frame read from ValidSolution.csv.
main_pipeline.score(valid, y_valid)

### Conclusion: The evaluation of the model gives a coefficient of determination of 0.87, which is a good result and indicates a good fit of the regression model