In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# splitting data:
from sklearn.model_selection import train_test_split

# web-based requests
import requests

# Importing the os library specifically for reading the csv once I've created the file in my working directory.
import os

import acquire
import prepare

credentials loaded successfully
End of file.


In [2]:
# Acquire data:

url = 'https://gist.githubusercontent.com/ryanorsinger/19bc7eccd6279661bd13307026628ace/raw/e4b5d6787015a4782f96cad6d1d62a8bdbac54c7/lemonade.csv'
    
df = pd.read_csv(url)    

In [3]:
df.head()

Unnamed: 0,Date,Day,Temperature,Rainfall,Flyers,Price,Sales
0,1/1/17,Sunday,27.0,2.0,15,0.5,10
1,1/2/17,Monday,28.9,1.33,15,0.5,13
2,1/3/17,Tuesday,34.5,1.33,27,0.5,15
3,1/4/17,Wednesday,44.1,1.05,28,0.5,17
4,1/5/17,Thursday,42.4,1.0,33,0.5,18


In [4]:
# putting all column headers in lowercase:

df.columns = df.columns.str.lower()

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         365 non-null    object 
 1   day          365 non-null    object 
 2   temperature  365 non-null    float64
 3   rainfall     365 non-null    float64
 4   flyers       365 non-null    int64  
 5   price        365 non-null    float64
 6   sales        365 non-null    int64  
dtypes: float64(3), int64(2), object(2)
memory usage: 20.1+ KB


In [10]:
# NOTE: split_data is defined further down in this notebook (cell In [9]);
# on a fresh kernel that cell has to be run before this one.
train, validate, test = split_data(df)
# Row/column counts of the three resulting frames.
train.shape, validate.shape, test.shape

((204, 7), (88, 7), (73, 7))

In [None]:
sns.relplot(x='temperature', y='sales', data=df)

In [None]:
df.groupby('day').describe()

In [None]:
for col in df.columns:
    plt.figure(figsize = (4, 2))
    plt.hist(df[col])
    plt.title(col)
    plt.show()

In [None]:
# Draw one small histogram per column, but only for int64 / float64 columns.
for col in df.columns:
    # Skip anything that is not one of the two numeric dtypes.
    if df[col].dtype not in ('float64', 'int64'):
        continue
    plt.figure(figsize = (4, 2))
    plt.hist(df[col])
    plt.title(col)
    plt.show()

In [None]:
# Or use John's way of getting a list of columns that are the "correct" dtype:
# (index the frame with the column label itself; wrapping it in an f-string
# is redundant and would turn non-string labels into strings)

lst = [column for column in df.columns if df[column].dtype in ('int64','float64')]

In [None]:
for col in df.columns:
    if df[col].dtype in ('int64','float64'):
        plt.figure(figsize = (4, 2))
        plt.hist(df[col])
        plt.title(col)
        plt.show()

#### Takeaway:

- Looks like temp and rainfall are somewhat normally distributed, but with a skew (due to outliers).
- I should be good using the outlier detection on those columns since they do have a normal(ish) distribution.

# Exercises

### 1. Using lemonade.csv dataset and focusing on continuous variables:

- Use the IQR Range Rule and the upper and lower bounds to identify the lower outliers of each column of lemonade.csv, using the multiplier of 1.5. Do these lower outliers make sense? Which outliers should be kept?
- Use the IQR Range Rule and the upper and lower bounds to identify the upper outliers of each column of lemonade.csv, using the multiplier of 1.5. Do these upper outliers make sense? Which outliers should be kept?
- Using the multiplier of 3, IQR Range Rule, and the lower and upper bounds, identify the outliers below the lower bound in each column of lemonade.csv. Do these lower outliers make sense? Which outliers should be kept?
- Using the multiplier of 3, IQR Range Rule, and the lower and upper bounds, identify the outliers above the upper bound in each column of lemonade.csv. Do these upper outliers make sense? Which outliers should be kept?

In [9]:
# Splitting the data:

def split_data(df):
    '''
    Split a dataframe into 3 dataframes: train, validate and test.

    The split happens in two passes, both with random_state = 123:
    first 20% of `df` is held out as test (test_size = .2), then 30% of the
    remaining rows is held out as validate (test_size = .3). Whatever is left
    is train.

    Returns the tuple (train, validate, test).
    '''
    train_validate, test = train_test_split(df, test_size = .2, random_state = 123)
    train, validate = train_test_split(train_validate, test_size = .3, random_state = 123)
    return train, validate, test

In [None]:
train, validate, test = prepare.split_data(df)
train.shape, validate.shape, test.shape

In [None]:
df

In [None]:
df1 = df.copy()
df1.describe()

In [None]:
# for example purposes, calculating temp manually just eyeballing the above table:

q1, q3 = df1.temperature.quantile([.25, .75])
iqr = q3 - q1
iqr

In [None]:
multiplier = 1.5

In [None]:
lower_fence = q1 - multiplier * iqr
lower_fence

In [None]:
upper_fence = q3 + multiplier * iqr
upper_fence

### Doing Temperature step by step
- So I know what I'm doing for when I build my function.

In [None]:
df1.head()

In [None]:
df1.temperature[df1.temperature >= upper_fence]

In [None]:
df1.temperature[df1.temperature <= lower_fence].shape

In [None]:
df1.temperature >= upper_fence

In [None]:
df1.temperature[-5:]

In [None]:
max(10-9, 0)

In [None]:
min(10-11, 0)

In [None]:
def get_upper_outliers(s, k):
    '''
    Given a series and an IQR multiplier, k, returns the upper outliers for
    the series.

    The upper fence is q3 + k * iqr, where q1 and q3 are the 25th and 75th
    percentiles of the series and iqr = q3 - q1.

    The values returned will be either 0 (if the point is not an outlier), or a
    number that indicates how far above the upper fence the observation is.
    The result is a series with the same index as `s`.
    '''

    q1, q3 = s.quantile([.25, .75])
    iqr = q3 - q1
    upper_fence = q3 + k * iqr
    # Vectorised: distance above the fence, with anything at or below the
    # fence floored to 0.
    return (s - upper_fence).clip(lower=0)

In [None]:
def get_lower_outliers(s, k):
    '''
    Given a series and an IQR multiplier, k, returns the lower outliers for
    the series.

    The lower fence is q1 - k * iqr, where q1 and q3 are the 25th and 75th
    percentiles of the series and iqr = q3 - q1.

    The values returned will be either 0 (if the point is not an outlier), or a
    number that indicates how far below the lower fence the observation is.
    The result is a series with the same index as `s`.
    '''

    q1, q3 = s.quantile([.25, .75])
    iqr = q3 - q1
    lower_fence = q1 - k * iqr
    # Vectorised: distance below the fence, with anything at or above the
    # fence floored to 0.
    return (lower_fence - s).clip(lower=0)

#### Adding columns for outliers

In [None]:
def add_upper_outlier_columns(df, k):
    '''
    Add a column with the suffix _up_outliers for each numeric column in the
    given dataframe.

    Each new column holds get_upper_outliers(df[col], k). Columns whose name
    already ends in _outliers are skipped, so outlier columns produced by an
    earlier call are not processed again.

    The dataframe is modified in place and is also returned.
    '''
    # The list of source columns is fixed before any new column is added.
    source_cols = [col for col in df.select_dtypes('number')
                   if not col.endswith('_outliers')]

    for col in source_cols:
        df[col + '_up_outliers'] = get_upper_outliers(df[col], k)

    return df

In [None]:
def add_lower_outlier_columns(df, k):
    '''
    Add a column with the suffix _low_outliers for each numeric column in the
    given dataframe.

    Each new column holds get_lower_outliers(df[col], k). Columns whose name
    already ends in _outliers are skipped, so outlier columns produced by an
    earlier call are not processed again.

    The dataframe is modified in place and is also returned.
    '''
    # The list of source columns is fixed before any new column is added.
    source_cols = [col for col in df.select_dtypes('number')
                   if not col.endswith('_outliers')]

    for col in source_cols:
        # Must use the *lower* helper here; the upper one would just duplicate
        # the _up_outliers values under a _low_outliers name.
        df[col + '_low_outliers'] = get_lower_outliers(df[col], k)

    return df

In [None]:
add_upper_outlier_columns(df1, 1.5)

df1.head()

In [None]:
df1.shape

In [None]:
add_lower_outlier_columns(df1, 1.5)
df1.sample(10)

In [None]:
df1.shape

In [None]:
outlier_cols = [col for col in df1 if col.endswith('_outliers')]
for col in outlier_cols:
    print('~~~\n' + col)
    data = df1[col][df1[col] > 0]
    print(data.describe())

#### Takeaways:

- The sales, flyers, and rainfall outliers are all most likely due to those columns having a fairly significant skew in their distributions. Looking at the charts above, the skew is apparent, so using the IQR Range Rule to isolate outliers will produce a larger number of outliers than it would for a truly normal distribution.

#### Isolating the outliers

- Do these outliers make sense to get rid of or keep?

In [None]:
df1[df1.temperature_up_outliers != 0]

In [None]:
df1[df1.rainfall_up_outliers != 0]

In [None]:
df1[df1.flyers_up_outliers != 0]

In [None]:
df1[df1.price_up_outliers != 0]

In [None]:
df1[df1.sales_up_outliers != 0]

#### Low Outliers:

In [None]:
df1[df1.temperature_low_outliers != 0]

In [None]:
df1[df1.rainfall_low_outliers != 0]

In [None]:
df1[df1.flyers_low_outliers != 0]

In [None]:
df1[df1.price_low_outliers != 0]

In [None]:
df1[df1.sales_low_outliers != 0]

#### Takeaways:

- Rainfall has a lot of outliers, but that is most likely because rainfall is not normally distributed or has a skew in its distribution.

In [None]:
# Printing the first outlier values of every upper-outlier column.
# The column list is built here so that this cell does not depend on a
# variable that is only created in a later cell.
up_outlier = [col for col in df1.columns if col.endswith('_up_outliers')]

for col in up_outlier:
    print('~~~\n' + col)
    # Keep only the rows where the distance above the fence is positive.
    data = df1[col][df1[col] > 0]
    print(data.head(10))

In [None]:
up_outlier = [col for col in df1.columns if col.endswith('_up_outliers', -12)]
up_outlier

### Using 3 as the multiplier

In [None]:
df2 = df.copy()

In [None]:
df2.describe()

In [None]:
add_upper_outlier_columns(df2, 3)

df2.head()

In [None]:
add_lower_outlier_columns(df2, 3)
# Check the shape of the frame that was just modified (df2, not df1).
df2.shape

In [None]:
outlier_cols = [col for col in df2 if col.endswith('_outliers')]
for col in outlier_cols:
    print('~~~\n' + col)
    data = df2[col][df2[col] > 0]
    print(data.describe())

In [None]:
df2.describe()

#### Takeaways:

- Same results as above, but the distance of each outlier from its fence is smaller, as one would expect with a larger multiplier in the IQR fence calculation.

- The `rainfall` 

In [None]:
df_cols = [col for col in df2.columns if col.endswith('_outliers', -9)]

In [None]:
type(df_cols[0])

In [None]:
# The outlier columns were added to df2 (a copy of df), so select them there.
df2[df_cols]

In [None]:
# Given some data (seeded generator so the cell gives the same result every run;
# numpy and pandas are already imported in the first cell)
rng = np.random.default_rng(123)
x = rng.normal(50, 40, size=1000)

# Calculate the z-score
zscores = pd.Series((x - x.mean()) / x.std())

# Finds all of the observations two standard deviations or more.
outliers_2_sigma = x[(zscores.abs() >= 2).to_numpy()]

# Finds all of the observations three standard deviations or more
outliers_3_sigma = x[(zscores.abs() >= 3).to_numpy()]

# A tuple as the last expression shows both results instead of only the last one.
outliers_2_sigma, outliers_3_sigma

### 2. Identify if any columns in lemonade.csv are normally distributed. For normally distributed columns:

- Use a 2 sigma decision rule to isolate the outliers.
- Do these make sense?
- Should certain outliers be kept or removed?

In [None]:
# Showing charts again to isolate which distributions are normal:

for col in df.columns:
    if df[col].dtype in ('int64','float64'):
        plt.figure(figsize = (4, 2))
        plt.hist(df[col])
        plt.title(col)
        plt.show()

##### Result: `temperature`, `rainfall`, and `flyers` are fairly normally distributed (with a skew)

- z-score relies on the distributions being normal.

In [None]:
norm_dist = ['temperature', 'rainfall', 'flyers']

In [None]:
x = df.temperature
zscores = (x - x.mean()) / x.std()

zdf = pd.DataFrame()
zdf["x"] = x
zdf["zscore"] = abs(zscores)

In [None]:
# Looking first at all values that are 1.5 or more std. dev from the mean.
# Filter on the absolute z-score stored in zdf so both tails are included.
zdf[zdf.zscore >= 1.5]

In [None]:
# Now looking at all values that are 2 or more std. dev from the mean.
# Filter on the absolute z-score stored in zdf so both tails are included.
zdf[zdf.zscore >= 2]

In [None]:
# Values 3 or more std. dev from the mean, using the absolute z-score so both
# tails are included.
zdf[zdf.zscore >= 3]