### Data Cleaning | Exploration | Visualization

In [None]:
import pandas as pd
import numpy as np

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import matplotlib.ticker as ticker
%matplotlib inline

In [None]:
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Read the auto dataset from the local data folder, then show its (rows, columns) size
X = pd.read_csv(filepath_or_buffer='data/auto.csv')
X.shape

In [None]:
X.head()

In [None]:
# Quick missing-value check: count the gaps per column, then total them.
# If there are only a handful, simply dropping them is an option.
missing_per_column = X.isna().sum()
print(f'total missing values are: {missing_per_column.sum()}')

In [None]:
# Remove, in place, every row that has at least one missing value.
# Interpolating or otherwise filling the gaps is the alternative to dropping.
X.dropna(axis='index', how='any', inplace=True)

### Visual Exploration

In [None]:
# Price histogram per make (4 facets per row), coloured by horsepower bin.
# Five evenly spaced edges between the cheapest and dearest car give four shared bins.
price_low, price_high = X['price'].min(), X['price'].max()
bins = np.linspace(price_low, price_high, 5)
g = sns.FacetGrid(data=X, col="make", col_wrap=4, hue="horsepower-binned", palette="Set2")
g.map(plt.hist, 'price', bins=bins, ec="k")

# draw the legend on the last facet only
last_facet = g.axes[-1]
last_facet.legend()
plt.show()

In [None]:
# Price histogram per make (4 facets per row), coloured by number of doors.
# Five evenly spaced edges between the cheapest and dearest car give four shared bins.
price_low, price_high = X['price'].min(), X['price'].max()
bins = np.linspace(price_low, price_high, 5)
g = sns.FacetGrid(data=X, col="make", col_wrap=4, hue="num-of-doors", palette="Set2")
g.map(plt.hist, 'price', bins=bins, ec="k")

# draw the legend on the last facet only
last_facet = g.axes[-1]
last_facet.legend()
plt.show()

In [None]:
# Price histogram per make (4 facets per row), coloured by body style.
# Five evenly spaced edges between the cheapest and dearest car give four shared bins.
price_low, price_high = X['price'].min(), X['price'].max()
bins = np.linspace(price_low, price_high, 5)
g = sns.FacetGrid(data=X, col="make", col_wrap=4, hue="body-style", palette="Set2")
g.map(plt.hist, 'price', bins=bins, ec="k")

# draw the legend on the last facet only
last_facet = g.axes[-1]
last_facet.legend()
plt.show()

In [None]:
# Create the figure with go.Figure and give it a single go.Scatter trace:
# one red marker per car, make on the x-axis and price on the y-axis.
fig = go.Figure(data=go.Scatter(x=X['make'], y=X['price'], mode='markers', marker=dict(color='red')))
# update_layout sets the plot title and both axis titles.
# The x-axis holds the 'make' column, so the title names the make (not the model).
fig.update_layout(title='Price vs Make', xaxis_title='make', yaxis_title='price')
# Display the figure
fig.show()

In [None]:
# Summed price per make, flattened back into a two-column frame (make, price)
price_by_make = X.groupby('make')['price'].sum()
bub_data = price_by_make.reset_index()
# Bubble chart: the summed price drives both the y position and the bubble size
fig = px.scatter(
    bub_data,
    x="make",
    y="price",
    size="price",
    size_max=60,
    hover_name="make",
    title='Make and Price',
)
fig.show()

In [None]:
# Histogram of the price column over the whole frame
fig = px.histogram(X, x="price")
fig.show()

In [65]:
from ipywidgets import interact
%matplotlib notebook

def select_car(make_of_car):
    """Box-plot price per horsepower bin for one make and print that make's rows.

    make_of_car -- value compared against the 'make' column of the global frame X
    """
    is_selected = X['make'] == make_of_car
    df_filtered = X.loc[is_selected]
    plot_columns = df_filtered[["horsepower-binned", "price"]]
    ax = plot_columns.boxplot(by="horsepower-binned", return_type='axes')
    # the returned object is indexed by plotted column name; 'price' is the only one here
    price_axes = ax["price"]
    price_axes.set_title("make " + make_of_car)
    print(df_filtered)

# every distinct make becomes one dropdown choice
makes = X['make'].unique().tolist()

interact(select_car, make_of_car=makes)

interactive(children=(Dropdown(description='make_of_car', options=('alfa-romero', 'audi', 'bmw', 'chevrolet', …

<function __main__.select_car(make_of_car)>

### Dealing with dates

In [None]:
#df['due_date'] = pd.to_datetime(df['due_date'])
#df['effective_date'] = pd.to_datetime(df['effective_date'])
#df.head()

### Finding Categorical and Numerical Columns

Looking at the data set - should categorical columns be encoded first or only after splitting?

The general consensus is that it should be encoded after splitting, but for a divergent opinion:
https://jamesmccaffrey.wordpress.com/2020/05/27/should-you-normalize-and-encode-data-before-train-test-splitting-or-after-splitting/

In [None]:
# Start by looking at what each column contains.
# Collect the object-typed (categorical) columns with fewer than 3 distinct values;
# the cut-off is convenient but arbitrary.
low_cardinality_cols = []
for cname in X.columns:
    column = X[cname]
    if column.dtype == "object" and column.nunique() < 3:
        low_cardinality_cols.append(cname)
low_cardinality_cols

In [None]:
# Numerical columns are the ones stored as 64-bit integers or 64-bit floats
numeric_dtypes = ['int64', 'float64']
numerical_cols = [cname for cname in X.columns if X[cname].dtype in numeric_dtypes]

In [None]:
# Categorical variables: boolean mask over the columns (True where the dtype is object),
# then keep the labels where the mask is True
s = X.dtypes.eq('object')
object_cols = s.index[s].tolist()

In [None]:
# Show both column groups, each list on the line after its heading
print("Categorical variables are in columns:", object_cols, sep="\n")
print('Numerical variables are in columns:', numerical_cols, sep="\n")

In [None]:
# Show the distinct values found in each categorical column.
# NB - if there is a test set, compare its values with the training set's values and
# make sure the two sets match completely; otherwise the encoding steps will throw an error
for o in object_cols:
    distinct_values = X[o].unique()
    # heading, values, then one blank line between columns
    print(f'column header "{o}" contains these unique values...', distinct_values, '', sep='\n')

Do the values in the categorical columns have an inherent ranking?

### ordinal encoding

In [None]:
# Manual remap with replace: one word -> number dictionary per column.
# The remapped frame is only displayed; X is left as it is (no assignment, no inplace).
door_numbers = {'two': 2, 'four': 4}
cylinder_numbers = {'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'eight': 8, 'twelve': 12}
X.replace({"num-of-doors": door_numbers, "num-of-cylinders": cylinder_numbers})

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html

The `categories` parameter is a blind spot in that documentation; for a clear explanation of how to use it, see:
https://datascience.stackexchange.com/questions/72343/encoding-with-ordinalencoder-how-to-give-levels-as-user-input

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Single-column variant using categories, kept for reference:
#ordinal_encoder = OrdinalEncoder(categories=[['two','four']])
#X_copy = ordinal_encoder.fit_transform(X.loc[:,["num-of-doors"]])

# Several columns at once: one list per column, with the labels written in rank order
door_cats = ['two', 'four']
cylinder_cats = ['two','three','four', 'five', 'six','eight','twelve']
horse_cats = ['Low', 'Medium', 'High']
asperation_cats = ['std','turbo']

# The category lists are given in the same order as the columns named below
ordinal_cols = ['num-of-doors', 'num-of-cylinders', 'horsepower-binned', 'aspiration']
category_lists = [door_cats, cylinder_cats, horse_cats, asperation_cats]

# Fit the encoder on those columns and write the encoded values back over them
ordinal_encoder = OrdinalEncoder(categories=category_lists)
X[ordinal_cols] = ordinal_encoder.fit_transform(X[ordinal_cols])

# or to let the ordinal encoder label things automatically...
#label_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])


In [None]:
X[['num-of-doors', 'num-of-cylinders','horsepower-binned','aspiration']]

In [None]:
# to retrieve the original values: pass the encoded columns back through the fitted encoder
# (the result is only displayed here - it is not assigned back to X)
ordinal_encoder.inverse_transform(X[['num-of-doors', 'num-of-cylinders','horsepower-binned','aspiration']])

### separate X and y variables

In [None]:
# price is the target: pop() removes the column from X in place and hands it back as y
y = X.pop('price')
# to leave the source frame untouched, assign the result of drop to another variable instead:
# X = X_train.drop(['price'], axis=1)

In [None]:
# double check to make sure nothing is missing
# y is a single column, so one sum gives its total; X needs a sum per column and then overall
y_miss = y.isna().sum()
x_miss = X.isna().sum().sum()
# pair each label with its own count: X with x_miss, Y with y_miss
print(f'X missing values: {x_miss} Y missing values: {y_miss}')

In [None]:
X.head()

In [None]:
# numerical_cols was collected while price was still a column of X. Skip any name
# that X no longer holds, otherwise X[n] raises a KeyError.
for n in numerical_cols:
    if n not in X.columns:
        continue
    print('column header', n, 'is numerical and has these stats:')
    print('mean', X[n].mean())
    print('median', X[n].median())
    print('std deviation', X[n].std())
    print()

In [None]:
# for a deeper summary use .describe()
X.describe()

In [None]:
# filter out all categorical variables from the dataset
#drop_X = X.select_dtypes(exclude=['object'])

### Quantifying Missing values

In [None]:
# X.shape is (number of rows, number of columns)
num_rows, num_columns = X.shape
print('df is: ',num_rows, 'by', num_columns)

In [None]:
# Missing entries counted per column
missing_count = X.isnull().sum()

# How many columns have at least one missing entry?
num_cols_with_missing = (missing_count > 0).sum()

# How many entries are missing in the whole frame?
tot_missing = missing_count.sum()

# Report the frame size, both totals, and the per-column breakdown
print(f'num_rows: {num_rows} num_columns: {num_columns}')
print(f'number of columns with missing values: {num_cols_with_missing}')
print(f'total number of missing values: {tot_missing}')
print()
print('columns with missing values + count of missing')
print(missing_count)

In [None]:
# Names of the columns that contain at least one missing value
has_missing = X.isnull().any()
cols_with_missing = X.columns[has_missing].tolist()
cols_with_missing

### Removing Missing Values

https://pandas.pydata.org/docs/user_guide/missing_data.html#missing-data

In [None]:
# New frame without the columns that contain missing values; X itself is not modified
reduced_X = X.drop(columns=cols_with_missing)
reduced_X

# alternate
# reduced_X_train = X_train.dropna(how='any') #'all' = only drop if all of a row or column is na

In [None]:
# drop rows with missing values from X ...
X.dropna(axis=0, inplace=True)
# ... and keep only the target rows whose index labels are still in X, so that
# X and y stay aligned row for row
y = y.loc[X.index]

### Move to other sheet

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with 100 trees; random_state is fixed at 0
model = RandomForestRegressor(n_estimators=100, random_state=0)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Hold out a validation set; the seed is fixed so the split is repeatable.
# NOTE(review): the 80/20 ratio is a placeholder choice - adjust as needed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0)

# Preprocessing: object-typed columns are imputed and one-hot encoded, every other
# column is imputed only.
# NOTE(review): the imputation strategies are placeholder choices - adjust as needed.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numeric_features = X.select_dtypes(exclude=['object']).columns.tolist()
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # ignore categories that show up in validation data but not in training data
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(transformers=[
    ('num', SimpleImputer(strategy='median'), numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

### Cross Validation

We obtain the cross-validation scores with the cross_val_score() function from scikit-learn. We set the number of folds with the cv parameter.

The scoring parameter chooses a measure of model quality to report: in this case, we chose negative mean absolute error (MAE). The docs for scikit-learn show a list of options.

It is a little surprising that we specify negative MAE. Scikit-learn has a convention where all metrics are defined so a high number is better. Using negatives here allows them to be consistent with that convention, though negative MAE is almost unheard of elsewhere.

We typically want a single measure of model quality to compare alternative models. So we take the average across experiments.

In [None]:
from sklearn.model_selection import cross_val_score

# sklearn calculates *negative* MAE, so multiply by -1 to get ordinary MAE values
neg_mae_per_fold = cross_val_score(my_pipeline, X, y,
                                   scoring='neg_mean_absolute_error',
                                   cv=5)
scores = -1 * neg_mae_per_fold

print("MAE scores:\n", scores)

In [None]:
# One number to compare models by: the mean of the per-fold MAE values
print("Average MAE score (across experiments):", scores.mean(), sep="\n")

In [None]:
def get_score(n_estimators):
    """Return the average MAE over 3 CV folds of random forest model.

    Keyword argument:
    n_estimators -- the number of trees in the forest

    Reads the module-level X (features) and y (target). Only the non-object
    columns of X are used, because the imputer's default (mean) strategy
    cannot be applied to text columns.
    """
    # imported locally: no earlier cell brings these two names into scope
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline

    numeric_X = X.select_dtypes(exclude=['object'])
    a_pipeline = Pipeline(steps=[
        ('preprocessor', SimpleImputer()),
        ('model', RandomForestRegressor(n_estimators=n_estimators, random_state=0))])
    # sklearn reports negative MAE, so flip the sign before averaging
    a_score = -1 * cross_val_score(a_pipeline, numeric_X, y,
                                   cv=3,
                                   scoring='neg_mean_absolute_error')
    return a_score.mean()

In [None]:
# score models with different numbers of esitmators, then plot the scores and look for the elbow
results = {}
for n in range(50, 450, 50):
    results[n] = get_score(n)

import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(list(results.keys()), list(results.values()))
plt.show()

In [None]:
from xgboost import XGBRegressor
