In [98]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler

In [99]:
from wrangle import wrangle_telco

In [100]:
# Load the telco data through the project's wrangle_telco() helper,
# then preview the first five rows as the cell output.
df = wrangle_telco()
df.head(n=5)

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,0013-SMEOE,109.7,71,7904.25
1,0014-BMAQU,84.65,63,5377.8
2,0016-QLJIS,90.45,65,5957.9
3,0017-DINOC,45.2,54,2460.55
4,0017-IUDMW,116.8,72,8456.75


In [101]:
# Predictor and target columns are named once so the selection is easy to tune.
feature_columns = ['monthly_charges', 'tenure']
target_columns = ['total_charges']

# Both selections use a list of labels, so X and y are DataFrames (not Series).
X = df[feature_columns]
y = df[target_columns]

# Fraction of rows assigned to the training split.
train_pct = 0.8

In [102]:
def split_my_data(X, y, train_pct, random_state=1):
    """Split features and target into train and test subsets.

    Thin wrapper around sklearn's ``train_test_split``.

    Parameters
    ----------
    X : features to split.
    y : target(s) to split alongside ``X``.
    train_pct : forwarded as ``train_size`` (e.g. ``0.8`` for an 80/20 split).
    random_state : seed forwarded to ``train_test_split``; defaults to ``1``,
        the value this notebook previously hard-coded, so existing calls
        produce the same split as before.

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_pct, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

# def split_my_data(X, y, train_pct):
#     train, test = train_test_split(df, train_size = train_pct, random_state = 47)
#     return train, test

In [103]:
# Perform the split in this cell so that X_train (and X_test / y_train / y_test,
# which the scaling cells below rely on) exist after Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = split_my_data(X, y, train_pct)
X_train

Unnamed: 0,monthly_charges,tenure
1374,80.60,54
484,110.05,69
1394,54.85,16
211,25.45,61
1049,86.40,71
...,...,...
722,99.35,69
913,20.15,47
1105,20.00,63
236,19.75,57


In [110]:

def standard_scaler(X_train, X_test):
    """Standard-scale X_train and X_test with a scaler fitted on X_train only.

    Takes in X_train and X_test DataFrames.
    Returns the fitted scaler, X_train_scaled and X_test_scaled; the scaled
    frames keep the column labels and index of their inputs.
    """
    scaler = StandardScaler()
    scaler.fit(X_train)  # the test split never contributes to the fit

    def _to_frame(features):
        # transform() hands back a bare array, so re-attach the labels
        return pd.DataFrame(
            scaler.transform(features),
            index=features.index,
            columns=features.columns,
        )

    return scaler, _to_frame(X_train), _to_frame(X_test)


In [111]:
# Fit the standard scaler on the training features and scale both splits.
scaler, X_train_scaled, X_test_scaled = standard_scaler(
    X_train=X_train,
    X_test=X_test,
)

In [116]:
def scale_inverse(scaler, X_train_scaled, X_test_scaled):
    """Undo a scaler's transformation on the scaled train and test frames.

    Takes in a scaler exposing inverse_transform() plus the scaled
    X_train and X_test DataFrames.
    Returns X_train_unscaled and X_test_unscaled, each carrying the column
    labels and index of the corresponding scaled frame.
    """
    def _restore(scaled):
        # inverse_transform() output is wrapped back into a labelled frame
        return pd.DataFrame(
            scaler.inverse_transform(scaled),
            index=scaled.index,
            columns=scaled.columns,
        )

    restored_train = _restore(X_train_scaled)
    restored_test = _restore(X_test_scaled)
    return restored_train, restored_test

In [118]:
# Round-trip the scaled frames through the fitted scaler and preview the
# first five rows of the recovered training features.
X_train_unscaled, X_test_unscaled = scale_inverse(
    scaler=scaler,
    X_train_scaled=X_train_scaled,
    X_test_scaled=X_test_scaled,
)
X_train_unscaled.head(n=5)

Unnamed: 0,monthly_charges,tenure
1374,80.6,54.0
484,110.05,69.0
1394,54.85,16.0
211,25.45,61.0
1049,86.4,71.0


In [119]:
def uniform_scaler(X_train, X_test):
    """Quantile-transform X_train and X_test towards a uniform distribution.

    Non-linear transformation intended to reduce the impact of outliers and
    smooth out unusual distributions. The transformer is fitted on X_train
    only and then applied to both frames.
    Takes in X_train and X_test DataFrames.
    Returns the scaler, X_train_scaled, X_test_scaled.
    """
    settings = dict(
        n_quantiles=100,
        output_distribution='uniform',
        random_state=123,
        copy=True,
    )
    scaler = QuantileTransformer(**settings).fit(X_train)

    # Train first, then test; each result keeps its input's labels.
    X_train_scaled, X_test_scaled = [
        pd.DataFrame(scaler.transform(part), columns=part.columns, index=part.index)
        for part in (X_train, X_test)
    ]
    return scaler, X_train_scaled, X_test_scaled

In [120]:
def gaussian_scaler(X_train, X_test, method='yeo-johnson', standardize=False):
    """Apply a power transform fitted on X_train to both X_train and X_test.

    Parameters
    ----------
    X_train, X_test : DataFrames of features. The transformer is fitted on
        X_train only and then applied to both.
    method : 'yeo-johnson' (default) or 'box-cox', forwarded to sklearn's
        PowerTransformer. yeo-johnson allows negative data; box-cox allows
        strictly positive data only.
    standardize : forwarded to PowerTransformer. Defaults to False, matching
        this function's historical behaviour, so the output is power-
        transformed but NOT rescaled to zero mean / unit variance. Pass True
        to get zero-mean, unit-variance output.

    Returns
    -------
    scaler, X_train_scaled, X_test_scaled
        The fitted transformer followed by the transformed frames, each
        keeping the column labels and index of its input.
    """
    scaler = PowerTransformer(method=method,
                              standardize=standardize,
                              copy=True).fit(X_train)
    X_train_scaled = pd.DataFrame(scaler.transform(X_train),
                                  columns=X_train.columns,
                                  index=X_train.index)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test),
                                 columns=X_test.columns,
                                 index=X_test.index)
    return scaler, X_train_scaled, X_test_scaled

In [121]:
def min_max_scaler(X_train, X_test):
    """Rescale each feature with a min-max scaler configured for range (0, 1).

    The scaler is fitted on X_train only and then applied to both frames.
    Sensitive to outliers.
    Takes in X_train and X_test DataFrames.
    Returns the scaler, X_train_scaled and X_test_scaled.
    """
    scaler = MinMaxScaler(feature_range=(0, 1), copy=True)
    scaler.fit(X_train)

    # Compute the raw arrays first, then wrap them with the original labels.
    train_values = scaler.transform(X_train)
    test_values = scaler.transform(X_test)

    X_train_scaled = pd.DataFrame(train_values,
                                  index=X_train.index,
                                  columns=X_train.columns)
    X_test_scaled = pd.DataFrame(test_values,
                                 index=X_test.index,
                                 columns=X_test.columns)
    return scaler, X_train_scaled, X_test_scaled

In [122]:
def iqr_robust_scaler(X_train, X_test):
    """Scale features with statistics that are robust to outliers.

    Removes the median and scales to the interquartile range (25th-75th
    percentile). The scaler is fitted on X_train only and then applied to
    both frames.
    Takes in X_train and X_test DataFrames.
    Returns the scaler and X_train_scaled and X_test_scaled.
    """
    scaler = RobustScaler(
        with_centering=True,
        with_scaling=True,
        quantile_range=(25.0, 75.0),
        copy=True,
    ).fit(X_train)

    # Keyed by split name; dicts preserve insertion order, so train is
    # transformed before test.
    splits = (("train", X_train), ("test", X_test))
    scaled = {
        name: pd.DataFrame(scaler.transform(frame),
                           columns=frame.columns,
                           index=frame.index)
        for name, frame in splits
    }
    return scaler, scaled["train"], scaled["test"]