In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [None]:
# Initial load of the combined training/validation data.
# No `parse_dates` is given here, so `saledate` is not parsed as a datetime by this call.
df=pd.read_csv("data/bluebook-for-bulldozers/TrainAndValid.csv",low_memory=False)

In [None]:
df.info()

In [None]:
# Sale price against sale date for the first 1,000 rows.
# `saledate` was loaded without `parse_dates`, so convert the slice to
# datetimes for plotting; raw strings would be drawn as categories in order
# of appearance rather than on a real time axis.
fig, ax = plt.subplots()
ax.scatter(pd.to_datetime(df["saledate"][:1000]), df["SalePrice"][:1000])
ax.set(title="SalePrice vs. saledate (first 1,000 rows)",
       xlabel="saledate",
       ylabel="SalePrice");

In [None]:
df.SalePrice.plot.hist();

In [None]:
# Reload the same file, this time asking pandas to parse `saledate` as a
# datetime column so the `.dt` accessor and chronological sorting work on it.
df=pd.read_csv("data/bluebook-for-bulldozers/TrainAndValid.csv",
              low_memory=False,
              parse_dates=["saledate"])

In [None]:
df.saledate.dtype

In [None]:
df.saledate[:1000]

In [None]:
df.head()

In [None]:
df.head().T

In [None]:
# Order the rows chronologically (oldest sale first)
df = df.sort_values(by="saledate")

In [None]:
df.saledate.head(20)

## Make a copy of the original DataFrame
Since we're going to be manipulating the data, we'll make a copy of the original DataFrame and perform our changes there.

This will keep the original DataFrame intact in case we need it again.

In [None]:
# Work on an independent (deep) copy so `df` itself is never modified
df_tmp = df.copy(deep=True)

### Add datetime parameters for the saledate column

In [None]:
# Split the sale timestamp into separate calendar features, then discard the
# original `saledate` column now that its parts have been extracted.
df_tmp = (
    df_tmp
    .assign(
        saleYear=lambda frame: frame["saledate"].dt.year,
        saleMonth=lambda frame: frame["saledate"].dt.month,
        saleDay=lambda frame: frame["saledate"].dt.day,
        saleDayofweek=lambda frame: frame["saledate"].dt.dayofweek,
        saleDayofyear=lambda frame: frame["saledate"].dt.dayofyear,
    )
    .drop(columns="saledate")
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# First modelling attempt on the frame as it stands. The later cells in this
# notebook show that `df_tmp` still holds text columns and missing values, so
# this fit is expected to be rejected. Catch and display the error instead of
# letting it abort a "Restart & Run All".
model = RandomForestRegressor(n_jobs=-1)
try:
    model.fit(df_tmp.drop("SalePrice", axis=1), df_tmp.SalePrice)
except ValueError as err:
    print(f"Model could not be fitted on the unprocessed data: {err}")

In [None]:
df_tmp.info()

In [None]:
df_tmp.isna().sum()

### Convert all of the data into numbers

In [None]:
pd.api.types.is_string_dtype(df_tmp["UsageBand"])

In [None]:
df_tmp.head().T

In [None]:
# List the text columns.
# Object-dtype columns are matched explicitly: since pandas 2.0,
# `is_string_dtype` only reports an object column as "string" when every
# element is a str, which silently skips text columns that contain NaN.
# The `is_string_dtype` fallback additionally covers pandas' dedicated string dtype.
for label, content in df_tmp.items():
    if pd.api.types.is_object_dtype(content) or pd.api.types.is_string_dtype(content):
        print(label)

In [None]:
# Convert every text column to the `category` dtype so the `.cat` accessor
# (and its integer codes) becomes available.
# Besides plain `object` columns this also picks up columns stored in pandas'
# dedicated string dtype, which a bare `dtype == 'object'` comparison misses.
for label in df_tmp.columns:
    column = df_tmp[label]
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        df_tmp[label] = column.astype("category")


In [None]:
df_tmp.info()

In [None]:
df_tmp.state.cat.codes

In [None]:
df_tmp.isnull().sum()/len(df_tmp)

### Save Processed Data

In [None]:
# Persist the intermediate frame; `index=False` keeps the row index out of the file.
df_tmp.to_csv("data/bluebook-for-bulldozers/train_tmp.csv",
              index=False)

In [None]:
# Reload the intermediate file written by the previous cell.
# NOTE(review): CSV carries no dtype information, so the `category` dtypes
# assigned earlier are presumably re-inferred as plain text on load. The
# non-numeric handling further down deals with those columns again.
df_tmp = pd.read_csv("data/bluebook-for-bulldozers/train_tmp.csv",
                     low_memory=False)
df_tmp.head()

## Fill missing values

In [None]:
# Names of the columns whose dtype pandas classifies as numeric
for name, dtype in df_tmp.dtypes.items():
    if pd.api.types.is_numeric_dtype(dtype):
        print(name)

In [None]:
# Numeric columns that contain at least one missing value
for label in df_tmp.columns:
    series = df_tmp[label]
    if pd.api.types.is_numeric_dtype(series) and series.isna().any():
        print(label)

In [None]:
# Impute numeric gaps.
# NOTE(review): the median is taken over every row of `df_tmp`, including the
# rows that are later assigned to the validation split.
numeric_with_gaps = [
    name for name in df_tmp.columns
    if pd.api.types.is_numeric_dtype(df_tmp[name]) and df_tmp[name].isna().any()
]
for name in numeric_with_gaps:
    values = df_tmp[name]
    # Remember which rows were originally empty before they get filled
    df_tmp[name + "_is_missing"] = values.isna()
    # The median is used because it is less sensitive to outliers than the mean
    df_tmp[name] = values.fillna(values.median())

In [None]:
# Re-check after imputation: any name printed here is a numeric column that
# still has missing values (expected to print nothing).
for label, content in df_tmp.items():
    is_numeric = pd.api.types.is_numeric_dtype(content)
    if is_numeric and content.isnull().sum() > 0:
        print(label)

In [None]:
df_tmp.auctioneerID_is_missing.value_counts()

In [None]:
# Names of the columns that are *not* numeric
for label in df_tmp.columns:
    if pd.api.types.is_numeric_dtype(df_tmp[label]):
        continue
    print(label)

In [None]:
# Replace every non-numeric column with integer category codes
non_numeric = [
    name for name in df_tmp.columns
    if not pd.api.types.is_numeric_dtype(df_tmp[name])
]
for name in non_numeric:
    values = df_tmp[name]
    # Keep a flag for rows where the original value was absent
    df_tmp[name + "_is_missing"] = values.isna()
    # Categorical codes use -1 for missing entries; adding 1 shifts them to 0
    df_tmp[name] = pd.Categorical(values).codes + 1

In [None]:


df_tmp.info()



In [None]:


df_tmp.isna().sum()



In [None]:


df_tmp.head().T



In [None]:
%%time

# Baseline forest fitted on every row of the processed frame; the fixed seed
# keeps repeated runs identical.
model = RandomForestRegressor(random_state=42, n_jobs=-1)

model.fit(X=df_tmp.drop(columns="SalePrice"), y=df_tmp["SalePrice"])

In [None]:
# Score on the very same rows the model was fitted on, so this number says
# nothing about performance on unseen data.
model.score(X=df_tmp.drop(columns="SalePrice"), y=df_tmp["SalePrice"])

## Splitting data into train/valid sets

In [None]:


df_tmp.head()



In [None]:
df_tmp.saleYear.value_counts()

In [None]:
# Hold out every sale from 2012 for validation; all other years are used for training
df_val = df_tmp.loc[df_tmp["saleYear"] == 2012]
df_train = df_tmp.loc[df_tmp["saleYear"] != 2012]

# Separate the features from the `SalePrice` target in each split
X_train = df_train.drop(columns="SalePrice")
y_train = df_train["SalePrice"]
X_valid = df_val.drop(columns="SalePrice")
y_valid = df_val["SalePrice"]

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

## Building an evaluation function

In [None]:
from sklearn.metrics import mean_squared_log_error, mean_absolute_error

def rmsle(y_test, y_preds):
    """Return the root mean squared log error between `y_test` and `y_preds`.

    Computed as the square root of scikit-learn's `mean_squared_log_error`.
    """
    msle = mean_squared_log_error(y_test, y_preds)
    return np.sqrt(msle)


def show_scores(model):
    """Evaluate a fitted `model` on the notebook's training and validation splits.

    Reads the notebook-level `X_train`, `y_train`, `X_valid` and `y_valid`.

    Returns
    -------
    dict
        MAE, RMSLE and R^2 (from `model.score`) for each of the two splits.
    """
    fitted_train = model.predict(X_train)
    fitted_valid = model.predict(X_valid)

    report = {}
    report["Training MAE"] = mean_absolute_error(y_train, fitted_train)
    report["Valid MAE"] = mean_absolute_error(y_valid, fitted_valid)
    report["Training RMSLE"] = rmsle(y_train, fitted_train)
    report["Valid RMSLE"] = rmsle(y_valid, fitted_valid)
    report["Training R^2"] = model.score(X_train, y_train)
    report["Valid R^2"] = model.score(X_valid, y_valid)
    return report

In [None]:
# This takes too long...

# %%time
# # Retrain a model on training data
# model.fit(X_train, y_train)
# show_scores(model)

In [None]:
# Faster-to-train forest: `max_samples` limits how many rows are drawn for
# each tree. The seed is fixed (as for the baseline model above) so the
# reported scores can be reproduced on a re-run.
model = RandomForestRegressor(n_jobs=-1,
                              max_samples=10000,
                              random_state=42)

In [None]:
%%time

model.fit(X_train, y_train)

In [None]:
show_scores(model)

In [None]:
%%time
from sklearn.model_selection import RandomizedSearchCV

# Candidate RandomForestRegressor hyperparameters to sample from
rf_grid = {"n_estimators": np.arange(10, 100, 10),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2),
           "max_features": [0.5, 1.0, "sqrt"], # Note: "max_features='auto'" is equivalent to "max_features=1.0", as of Scikit-Learn version 1.1
           "max_samples": [10000]}

# Seed both the sampler of candidate settings and the forests themselves so
# the search result is reproducible; `n_jobs=-1` matches the other forests in
# this notebook and lets each fit use all cores.
rs_model = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1,
                                                    random_state=42),
                              param_distributions=rf_grid,
                              n_iter=20,
                              cv=5,
                              random_state=42,
                              verbose=True)

rs_model.fit(X_train, y_train)

In [None]:
rs_model.best_params_

In [None]:
show_scores(rs_model)

##  Train a model with the best parameters

In [None]:
%%time
# Final model with hand-entered hyperparameters.
# NOTE(review): these values are hard-coded rather than read from
# `rs_model.best_params_` — confirm they still match the search result.
# `max_samples=None` trains every tree on the full training set; the seed is
# fixed so the reported scores can be reproduced.
ideal_model = RandomForestRegressor(n_estimators=90,
                                    min_samples_leaf=1,
                                    min_samples_split=14,
                                    max_features=0.5,
                                    n_jobs=-1,
                                    max_samples=None,
                                    random_state=42)
ideal_model.fit(X_train, y_train)

In [None]:
show_scores(ideal_model)

## Make predictions on test data

In [None]:
# Load the test set the same way as the training data: `low_memory=False`
# makes pandas infer each column's dtype from the whole file instead of chunk
# by chunk, and `saledate` is parsed as a datetime.
df_test = pd.read_csv("data/bluebook-for-bulldozers/Test.csv",
                      low_memory=False,
                      parse_dates=["saledate"])
df_test.head()

In [None]:
def preprocess_data(df):
    """Return a numeric-only copy of `df`.

    Steps applied, in order:

    1. Derive `saleYear`, `saleMonth`, `saleDay`, `saleDayofweek` and
       `saleDayofyear` from `saledate`, then drop `saledate`.
    2. For each numeric column containing missing values, add a boolean
       `<column>_is_missing` flag and fill the gaps with the column median.
    3. For each non-numeric column, add a boolean `<column>_is_missing` flag
       and replace the values with integer category codes (0 = missing).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed `saledate` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unmodified.

    Notes
    -----
    Medians and category codes are computed from `df` itself, so two
    different frames (e.g. training and test data) are not guaranteed to
    receive the same fill values, the same code for a given category, or the
    same set/order of `_is_missing` columns.
    """
    # Work on a copy so the caller's frame is not altered as a side effect
    df = df.copy()

    # Add datetime parameters for saledate
    sale_dt = df["saledate"].dt
    df["saleYear"] = sale_dt.year
    df["saleMonth"] = sale_dt.month
    df["saleDay"] = sale_dt.day
    df["saleDayofweek"] = sale_dt.dayofweek
    df["saleDayofyear"] = sale_dt.dayofyear

    # Drop original saledate
    df = df.drop(columns="saledate")

    # Iterate over a snapshot of the column names: the loop appends indicator
    # columns, and those must not be visited themselves.
    for label in list(df.columns):
        content = df[label]
        if pd.api.types.is_numeric_dtype(content):
            # Fill numeric gaps with the median and flag the affected rows
            if content.isna().any():
                df[label + "_is_missing"] = content.isna()
                df[label] = content.fillna(content.median())
        else:
            # Turn categorical variables into numbers
            df[label + "_is_missing"] = content.isna()
            # We add the +1 because pandas encodes missing categories as -1
            df[label] = pd.Categorical(content).codes + 1

    return df

In [None]:
df_test = preprocess_data(df_test)
df_test.head()

In [None]:
set(X_train.columns) - set(df_test.columns)

In [None]:
# Add the indicator column that the training features have but the processed
# test frame lacks (presumably the only name reported by the set difference
# in the previous cell); every test row is marked as "not missing".
df_test = df_test.assign(auctioneerID_is_missing=False)
df_test.head()

In [None]:
# Hand the features to the model in exactly the column order it was fitted
# with. `auctioneerID_is_missing` was appended to `df_test` last, so its
# position differs from `X_train`, and scikit-learn checks feature names and
# their order at predict time.
test_preds = ideal_model.predict(df_test[X_train.columns])