# Mall Customers Exercises

## notebook

1. Acquire data from mall_customers.customers in mysql database.
2. Summarize data (include distributions and descriptive statistics).
3. Detect outliers using IQR.
4. Split data (train, validate, and test split).
5. Encode categorical columns using a one hot encoder (pd.get_dummies).
6. Handles missing values.
7. Scaling

## wrangle_mall.py

1. Acquire data from mall_customers.customers in mysql database.
2. Split the data into train, validate, and split
3. One-hot-encoding (pd.get_dummies)
4. Missing values
5. Scaling

In [12]:
#import libraries
import pandas as pd
import numpy as np
import os
from pydataset import data
import scipy.stats as stats
import wrangle
import matplotlib.pyplot as plt

# acquire
from env import host, user, password
import acquire

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

#### #1 Acquire data from mall_customers.customers in mysql database.

In [2]:
#function in acquire.py
def get_connection(db_name):
    '''
    This function uses my info from my env file to
    create a connection url to access the Codeup db.
    '''
    from env import host, user, password
    return f'mysql+pymysql://{user}:{password}@{host}/{db_name}'

In [3]:
#function in acquire.py
def get_mall_customers():
    '''
    This function reads in the mall_customers data from the Codeup db
    returns: a pandas DataFrame 
    '''
    
    mall_query = '''
    SELECT *
    FROM customers
    '''
    return pd.read_sql(mall_query, get_connection('mall_customers'))

In [4]:
df = acquire.get_mall_customers()

In [5]:
#look at data
df.head()

Unnamed: 0_level_0,gender,age,annual_income,spending_score
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Male,19,15,39
2,Male,21,15,81
3,Female,20,16,6
4,Female,23,16,77
5,Female,31,17,40


#### #2 Summarize data (include distributions and descriptive statistics).

In [6]:
def summarize(df):
    '''
    this function will take in a single argument (a pandas df) 
    output to console various statistics on said dataframe, including:
    #.head()
    #.info()
    #.describe()
    #.value_counts()
    #observation of nulls in the dataframe
    '''
    #print head
    print('=================================================')
    print('Dataframe head: ')
    print(df.head(3))
    
    #print info
    print('=================================================')
    print('Dataframe info: ')
    print(df.info())
    
    #print descriptive stats
    print('=================================================')
    print('DataFrame Description')
    print(df.describe())
    num_cols = df.select_dtypes(exclude='O').columns.to_list()
    cat_cols = df.select_dtypes(include='O').columns.to_list()
    
    #print value counts
    print('=================================================')
    print('Dataframe value counts: ')
    for col in df. columns:
        if col in cat_cols:
            print(df[col].value_counts())
        else:
            print(df[col].value_counts(bins=10, sort = False))
    
    #print nulls by column
    print('=================================================')
    print('nulls in dataframe by column: ')
    print(nulls_by_col(df))
    
    #print nulls by column
    print('=================================================')
    print('nulls in dataframe by row: ')
    print(nulls_by_row(df))
    print('=================================================')

In [7]:
df = wrangle.summarize(df)

Dataframe head: 
             gender  age  annual_income  spending_score
customer_id                                            
1              Male   19             15              39
2              Male   21             15              81
3            Female   20             16               6
Dataframe info: 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 1 to 200
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   gender          200 non-null    object
 1   age             200 non-null    int64 
 2   annual_income   200 non-null    int64 
 3   spending_score  200 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 7.8+ KB
None
DataFrame Description
              age  annual_income  spending_score
count  200.000000     200.000000      200.000000
mean    38.850000      60.560000       50.200000
std     13.969007      26.264721       25.823522
min     18.000000      15.000000        1.000

#### #3 Detect outliers using IQR.

In [8]:
def detect_outliers(df, k, col_list):
    ''' 
    get upper and lower bound for list of columns in a dataframe 
    if desired return that dataframe with the outliers removed
    '''
    
    odf = pd.DataFrame()
    
    for col in col_list:
        
        # get quartiles
        q1, q2, q3 = df[f'{col}'].quantile([.25, .5, .75])  
        
        # calculate interquartile range
        iqr = q3 - q1   
        
        # get upper bound
        upper_bound = q3 + k * iqr
        # get lower bound
        lower_bound = q1 - k * iqr   
        
        # print each col and upper and lower bound for each column
        print(f"{col}: Median = {q2} lower_bound = {lower_bound} upper_bound = {upper_bound}")

        # return dataframe of outliers
        odf = odf.append(df[(df[f'{col}'] < lower_bound) | (df[f'{col}'] > upper_bound)])
            
    return odf

In [9]:
odf = detect_outliers(df, 1.5,['age', 'annual_income', 'spending_score'])

TypeError: 'NoneType' object is not subscriptable

#### #4 Split data (train, validate, and test split).

In [15]:
#function from wrangle.py
def zillow_split(df, target):
    '''
    This function take in get_zillow  from aquire.py and performs a train, validate, test split
    Returns train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test
    and prints out the shape of train, validate, test
    '''
    #create train_validate and test datasets
    train, test = train_test_split(df, train_size = 0.8, random_state = 123)
    #create train and validate datasets
    train, validate = train_test_split(train, train_size = 0.7, random_state = 123)

    #Split into X and y
    X_train = train.drop(columns=[target])
    y_train = train[target]

    X_validate = validate.drop(columns=[target])
    y_validate = validate[target]

    X_test = test.drop(columns=[target])
    y_test = test[target]

    # Have function print datasets shape
    print(f'train -> {train.shape}')
    print(f'validate -> {validate.shape}')
    print(f'test -> {test.shape}')
   
    return train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test

In [16]:
train, validate, test = wrangle.zillow_split(df, 'logerror')

TypeError: Expected sequence or array-like, got <class 'NoneType'>

#### #5 Encode categorical columns using a one hot encoder (pd.get_dummies).

#### #6 Handles missing values

In [None]:
#there are no missing values

#### #7 Scaling

In [None]:
#use min_max_scaler
#age, income, spending score