In [12]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from itertools import combinations, product

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import pearsonr, spearmanr

# Custom Imports
import wrangle

## Acquire and Prepare Zillow

In [13]:
# Acquire and prepare the Zillow data through the project-local wrangle module,
# then preview the first rows.
df = wrangle.wrangle_zillow()
df.head(5)

Unnamed: 0,bedroomcnt,bathroomcnt,square_feet,taxvaluedollarcnt,yearbuilt,taxamount,fips_name
0,4,2.0,3633,296425,2005,6941.39,Los Angeles
1,3,4.0,1620,847770,2011,10244.94,Los Angeles
2,3,2.0,2077,646760,1926,7924.68,Los Angeles
6,3,1.0,1244,169471,1950,2532.88,Los Angeles
7,3,2.0,1300,233266,1950,3110.99,Los Angeles


In [14]:
# Split the prepared frame into train / validate / test (the split logic lives in wrangle.py).
train, validate, test = wrangle.train_validate_test_split(df)

In [15]:
# Check row count, dtypes, and non-null counts of the training split.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1169459 entries, 1483210 to 1225832
Data columns (total 7 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   bedroomcnt         1169459 non-null  int64  
 1   bathroomcnt        1169459 non-null  float64
 2   square_feet        1169459 non-null  int64  
 3   taxvaluedollarcnt  1169459 non-null  int64  
 4   yearbuilt          1169459 non-null  int64  
 5   taxamount          1169459 non-null  float64
 6   fips_name          1169459 non-null  object 
dtypes: float64(2), int64(4), object(1)
memory usage: 71.4+ MB


In [16]:
# Scale the train / validate / test splits with the project's scale_data helper.
train_scaled, validate_scaled, test_scaled = wrangle.scale_data(train, validate, test)

In [17]:
# Preview the scaled training split.
# NOTE(review): in the recorded output yearbuilt and taxamount still hold raw
# values — confirm whether scale_data is meant to leave those columns unscaled.
train_scaled.head(5)

Unnamed: 0,bedroomcnt,bathroomcnt,square_feet,taxvaluedollarcnt,yearbuilt,taxamount,fips_name
1483210,0.2,0.272727,0.072759,0.044189,1949,1633.33,Los Angeles
635933,0.4,0.272727,0.074562,0.054043,1964,1711.51,Los Angeles
647772,0.6,0.363636,0.123669,0.378193,1979,8660.46,Orange
1251279,0.2,0.090909,0.011526,0.005963,1920,758.49,Los Angeles
772816,0.4,0.272727,0.040804,0.096488,1972,2528.32,Orange


In [18]:
def wrangle_split_scale():
    """Acquire, split, and scale the Zillow data in a single call.

    Chains three helpers from the project's ``wrangle`` module, in order:
    ``wrangle_zillow`` -> ``train_validate_test_split`` -> ``scale_data``.

    Returns
    -------
    tuple
        ``(train_scaled, validate_scaled, test_scaled)`` as produced by
        ``wrangle.scale_data``.
    """
    # This notebook only does `import wrangle`, so the helpers have to be
    # reached through the module; the bare names are not defined here.
    df = wrangle.wrangle_zillow()
    train, validate, test = wrangle.train_validate_test_split(df)

    # The middle result must be bound to `validate_scaled`: binding it to a
    # differently spelled name makes the return statement pick up a stale
    # global (or raise NameError on a fresh kernel).
    train_scaled, validate_scaled, test_scaled = wrangle.scale_data(train, validate, test)

    return train_scaled, validate_scaled, test_scaled

### Everything above shows how I built my main function: I combined my wrangle, split, and scale functions into a single function called `wrangle_split_scale`.

In [19]:
train_scaled.head(5)      ## We will be using the scaled data for all of exploration, modeling, and testing.

Unnamed: 0,bedroomcnt,bathroomcnt,square_feet,taxvaluedollarcnt,yearbuilt,taxamount,fips_name
1483210,0.2,0.272727,0.072759,0.044189,1949,1633.33,Los Angeles
635933,0.4,0.272727,0.074562,0.054043,1964,1711.51,Los Angeles
647772,0.6,0.363636,0.123669,0.378193,1979,8660.46,Orange
1251279,0.2,0.090909,0.011526,0.005963,1920,758.49,Los Angeles
772816,0.4,0.272727,0.040804,0.096488,1972,2528.32,Orange


In [20]:
train.head()   ## The original, unscaled split is meant for visualizations only, but these exercises use it as well.

Unnamed: 0,bedroomcnt,bathroomcnt,square_feet,taxvaluedollarcnt,yearbuilt,taxamount,fips_name
1483210,2,2.0,1733,98022,1949,1633.33,Los Angeles
635933,3,2.0,1766,117631,1964,1711.51,Los Angeles
647772,4,2.5,2665,762659,1979,8660.46,Orange
1251279,2,1.0,612,21957,1920,758.49,Los Angeles
772816,3,2.0,1148,202092,1972,2528.32,Orange


In [21]:
# NOTE(review): the recorded run of this cell raised
# "NameError: name 'train' is not defined" — presumably explore.py references
# `train` at module level when imported; confirm, and keep only definitions at
# import time. This import would also belong in the imports cell at the top.
import explore

NameError: name 'train' is not defined

#### Write a function named plot_variable_pairs that accepts a dataframe as input and plots all of the pairwise relationships along with the regression line for each pair.

In [None]:
def plot_variable_pairs(df,
                        columns_x = ('bedroomcnt','bathroomcnt','square_feet','yearbuilt','taxvaluedollarcnt','taxamount'),
                        columns_y = ('bedroomcnt','bathroomcnt','square_feet','yearbuilt','taxvaluedollarcnt','taxamount'),
                        sampling = 1000,
                        random_state = None):
    """Plot every (x, y) column pair as a scatter plot with a regression line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all columns named in ``columns_x`` and ``columns_y``.
    columns_x, columns_y : iterable of str
        Column names for the x and y axes. Every combination of one x column
        and one y column is plotted, except a column against itself.
    sampling : int
        Maximum number of rows to draw from ``df``. Capped at ``len(df)`` so a
        frame smaller than ``sampling`` no longer makes ``DataFrame.sample``
        raise.
    random_state : int or None
        Passed to ``DataFrame.sample``; set it to make the plots reproducible.

    Returns
    -------
    None
        One figure per pair is shown via ``plt.show()``.
    """
    # Draw the sample once so every plot shows the same rows and the frame is
    # not re-sampled for each pair.
    sample = df.sample(n=min(sampling, len(df)), random_state=random_state)

    for col_x, col_y in product(columns_x, columns_y):
        # A column plotted against itself is a perfect line and carries no information.
        if col_x == col_y:
            continue
        sns.lmplot(x=col_x, y=col_y, data=sample, line_kws={'color': 'blue'})
        plt.show()

# Plot the pairwise relationships on the unscaled training split (default sampling of 1000 rows).
plot_variable_pairs(train)

#### Write a function named plot_categorical_and_continuous_vars that accepts your dataframe and the name of the columns that hold the continuous and categorical features and outputs 3 different plots for visualizing a categorical variable and a continuous variable.

In [None]:
def plot_categorical_and_continuous_vars(df,
                                         columns_cat=('fips_name',),
                                         columns_cont=('square_feet', 'yearbuilt', 'bedroomcnt', 'bathroomcnt', 'taxvaluedollarcnt', 'taxamount'),
                                         sampling = 1000,
                                         random_state = None):
    """Show a strip plot, box plot, and bar plot for each categorical/continuous pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all columns named in ``columns_cat`` and ``columns_cont``.
    columns_cat : iterable of str
        Categorical columns, used on the x axis.
    columns_cont : iterable of str
        Continuous columns, used on the y axis.
    sampling : int
        Maximum number of rows to draw from ``df``. Capped at ``len(df)`` so a
        frame smaller than ``sampling`` does not make ``DataFrame.sample`` raise.
    random_state : int or None
        Passed to ``DataFrame.sample``; set it to make the plots reproducible.

    Returns
    -------
    None
        One three-panel figure per pair is shown via ``plt.show()``.
    """
    # The figure size is a global seaborn/matplotlib setting; set it once
    # rather than on every loop iteration.
    sns.set(rc={"figure.figsize":(15, 6)})

    # One shared sample, so the three panels of a figure describe the same rows.
    sample = df.sample(n=min(sampling, len(df)), random_state=random_state)

    for cat_col, cont_col in product(columns_cat, columns_cont):
        _, axes = plt.subplots(1, 3)

        sns.stripplot(x=cat_col, y=cont_col, data=sample, ax=axes[0])
        sns.boxplot(x=cat_col, y=cont_col, data=sample, ax=axes[1])
        sns.barplot(x=cat_col, y=cont_col, data=sample, ax=axes[2])

        # Must be called: a bare `plt.show` only references the function.
        plt.show()

# Use a larger sample (10,000 rows) of the unscaled training split for the categorical-vs-continuous plots.
plot_categorical_and_continuous_vars(train, sampling = 10_000)

 #### Save the functions you have written to create visualizations in your explore.py file. Rewrite your notebook code so that you are using the functions imported from this file.

In [None]:
## The plotting functions have been moved into explore.py; from here on the notebook only imports and calls them.

#### Use the functions you created above to explore your Zillow train dataset in your explore.ipynb notebook.