# Fast.ai Machine Learning - Course 1, Lesson 1

## 0 - Imports

In [None]:
import pandas as pd
import numpy as np
import os
from pandas.api.types import is_string_dtype, is_object_dtype

## 1 - VSCode Import

This section only needs to be run once.

Downloading data from Kaggle requires us to:
1. Install Kaggle
2. Create a Kaggle folder in our home directory (it'll be hidden)
3. Get our API credentials from the Kaggle 'Settings' page
4. Place the credentials (.json file) in the Kaggle folder from step 2

In [None]:
# Change the working directory to the project root so that the relative
# 'data/...' paths and the `src` package imports used in this notebook resolve.
# Guarded on the presence of `src` so re-running the cell does not climb
# another directory level each time.
if not os.path.isdir('src'):
    os.chdir('..')
print(f'cwd: {os.getcwd()}')

In [None]:
!pip install -q kaggle

In [None]:
# create the (hidden) kaggle config directory in the home folder;
# named `kaggle_dir` so the builtin `dir()` is not shadowed for the rest of the session
kaggle_dir = os.path.expanduser('~/.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)

In [None]:
# copy credentials to kaggle folder
creds = '/Users/chelseatucker/credentials/kaggle.json'
!cp $creds ~/.kaggle

# change permissions so only I have read & write access to the credentials file
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# create a bulldozers directory (exist_ok: no error if it is already there)
os.makedirs('data/bbfb', exist_ok=True)

# download the bluebook-for-bulldozers competition archive into 'data/bbfb' (-p sets the target folder)
!kaggle competitions download -c bluebook-for-bulldozers -p 'data/bbfb'

In [None]:
# unzip the competition archive into 'data/bbfb' (-q: quiet, -d: destination folder)
!unzip -q data/bbfb/bluebook-for-bulldozers.zip -d 'data/bbfb'

# unzip the train data (Train.zip is expected in 'data/bbfb' after the step above)
!unzip -q data/bbfb/Train.zip -d 'data/bbfb'

## 2 - Colab Import

This section needs to be run every time this notebook is run on Colab.

In [None]:
!pip install -q kaggle

In [None]:
# upload the 'kaggle.json' API token through Colab's file picker
from google.colab import files

# files.upload() returns the uploaded file contents; the trailing ';' stops that
# return value (the API key) from being echoed into the saved cell output
files.upload();

In [None]:
# make a kaggle directory and move the json file there
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle

In [None]:
# restrict the API file to owner read/write (mode 600) so it isn't readable by other users
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# download the competition archive from Kaggle
# (no -p target is given; the next cell moves the zip from the working directory)
!kaggle competitions download -c 'bluebook-for-bulldozers'

In [None]:
# move dataset
!mkdir data
!mv bluebook-for-bulldozers.zip /content/data

In [None]:
# unzip the competition archive into data/bbfb (-d: destination folder)
!unzip data/bluebook-for-bulldozers.zip -d data/bbfb

In [None]:
# unzip train data (Train.zip is expected in data/bbfb after the previous unzip)
!unzip data/bbfb/Train.zip -d data/bbfb

## 2 - Exploring the Data

In [None]:
# load the training set, parsing 'saledate' into datetimes while reading
train_csv = 'data/bbfb/Train.csv'
df_raw = pd.read_csv(train_csv, parse_dates=['saledate'], low_memory=False)

In [None]:
# lift pandas' display limits so no columns or rows are truncated
for display_opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_opt, None)

# peek at the first rows to confirm the import worked
df_raw.head(3)

In [None]:
from src.eda import df_look  # project-local helper from the `src` package

# high level overview of the data
df_look(df_raw)

## 3 - Feature Engineering

### 3.1 - Data Edits

The competition is scored on RMSLE (root mean squared log error) between actual and predicted prices, so we'll take the log of the dependent variable; a plain RMSE on the logged values then corresponds to the competition metric.

In [None]:
# replace the sale price with its natural log
# NOTE: this overwrites the column, so run the cell only once per load of df_raw
df_raw['SalePrice'] = np.log(df_raw['SalePrice'])

Individual edits:

In [None]:
# looking at individual levels
df_raw.Undercarriage_Pad_Width.value_counts(dropna=False).sort_index()

In [None]:
## edits
def _clean_measure(col, replacements, dtype,
                   unspecified='None or Unspecified',
                   unspecified_code='-1', missing_code='-2'):
    """Turn a text measurement column into a numeric one.

    Parameters
    ----------
    col : pd.Series
        String-valued column to clean.
    replacements : list of (str, str)
        Literal (non-regex) substring substitutions, applied in order, that
        strip unit markers so the remaining text parses as a number.
    dtype : type
        Final numeric dtype (``int`` or ``float``).
    unspecified, unspecified_code : str
        The "not specified" label and the numeric text that replaces it.
    missing_code : str
        Numeric text used for missing (NaN) entries.

    Returns
    -------
    pd.Series
        A new Series of ``dtype``; ``col`` itself is not modified.
    """
    cleaned = col
    for old, new in [*replacements, (unspecified, unspecified_code)]:
        # regex=False: every pattern is plain text, and the default of `regex`
        # differs between pandas versions
        cleaned = cleaned.str.replace(old, new, regex=False)
    # The filled Series is returned and assigned back by the caller. Calling
    # `df_raw.<col>.fillna(..., inplace=True)` instead acts on a temporary Series
    # and leaves df_raw unchanged when pandas Copy-on-Write is active.
    return cleaned.fillna(missing_code).astype(dtype)


# Blade Width: drop the foot mark; "<12" is coded as 11
df_raw['Blade_Width'] = _clean_measure(
    df_raw['Blade_Width'], [("'", ''), ('<12', '11')], int)

# Tyre Size: the cleaned values are stored under the new name and the original column is dropped
df_raw['Tyre_Size'] = _clean_measure(
    df_raw['Tire_Size'], [('"', ''), (' inch', '')], float)
df_raw.drop(['Tire_Size'], axis=1, inplace=True)

# Undercarriage Pad Width: strip the unit, then round to the nearest whole number
pad_width = _clean_measure(df_raw['Undercarriage_Pad_Width'], [(' inch', '')], float)
df_raw['Undercarriage_Pad_Width'] = pad_width.round(0).astype(int)

# Stick_Length: the "' " separator becomes a decimal point and the trailing '"' is removed
# NOTE(review): this is a textual rewrite, not a feet/inches conversion (a 10 inch part
# would become .10 and sort below a 2 inch part) - confirm whether feet + inches / 12 was intended
df_raw['Stick_Length'] = _clean_measure(
    df_raw['Stick_Length'], [("' ", '.'), ('"', '')], float)

### 3.2 - Feature Creation

Extracting more information from date/time columns:

In [None]:
from src.preprocessing import add_dateattr  # project-local helper from the `src` package

# extracting more information from the date field
# NOTE(review): the result is not assigned, so this presumably modifies df_raw in place - confirm in src.preprocessing
add_dateattr(df_raw, 'saledate')

### 3.3 - Categorical Features

Converting string/object features to categorical features and then storing the categories' respective codes in `<column>_codes` columns.

In [None]:
# names of every column whose dtype is string or generic object
cats = [
    col for col in df_raw.columns
    if is_string_dtype(df_raw[col]) or is_object_dtype(df_raw[col])
]

cats

In [None]:
from src.preprocessing import conv_to_cat  # project-local helper from the `src` package

# converting all string/object columns to categories
# (the cell below reads `.cat.categories` on these columns, so they must be categorical after this call)
conv_to_cat(df_raw)

In [None]:
# show each categorical column's categories in their current order,
# with a blank line between columns
for name in cats:
    print(name, ':', df_raw[name].cat.categories, end='\n\n')

In [None]:
# explicit low-to-high order for the features treated as ordinal
category_orders = {
    'UsageBand': ['Low', 'Medium', 'High'],
    'ProductSize': ['Mini', 'Small', 'Compact', 'Medium', 'Large / Medium', 'Large'],
    'Drive_System': ['No', 'Two Wheel Drive', 'Four Wheel Drive', 'All Wheel Drive'],
    'Grouser_Type': ['Single', 'Double', 'Triple'],
}
for col_name, order in category_orders.items():
    df_raw[col_name] = df_raw[col_name].cat.reorder_categories(order, ordered=True)

# confirm the new order on one of the columns
df_raw['UsageBand'].cat.categories

In [None]:
from src.preprocessing import numericalise  # project-local helper from the `src` package

# converting all categorical columns to their code equivalents:
# one call per column in `cats`, passing '<column>_codes' as the target name
# NOTE(review): the meaning of max_n_cat=100 is defined in src.preprocessing - confirm there
for c in cats:
    numericalise(df_raw, df_raw[c], f'{c}_codes', max_n_cat=100)

In [None]:
# compare the level counts of the original column with those of its coded counterpart
for col_name in ('UsageBand', 'UsageBand_codes'):
    print(df_raw[col_name].value_counts(dropna=False).sort_index())

In [None]:
# checking non-numericalised categorical columns

num = [col for col in df_raw.columns if '_codes' in col]