# pipline V2

***Macros***

In [113]:
PATH = "Datasets/housing.csv"
STRAT_COL = "median_income"

In [278]:
# imports 
import os
import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

try: 
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer



In [279]:
#imports the dataset from filepath retrns a df object
def import_file(filepath):
    file = pd.read_csv(filepath)
    return file

In [280]:
# feature scaling of median_income reduces it to 5 bin catagories returns a new df object
def income_cat(df): 
    df["income_cat"] = pd.cut(df["median_income"], bins=[0., 1.5, 3.0, 4.5, 6.0, np.inf], labels=[1, 2, 3, 4, 5])
    df = df.drop(["median_income"], axis="columns")
    return df

In [281]:
# Splits the data into the training and test sets. They will have an equal income_cat distribution
def strat_split_data(file):
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(file, file["income_cat"]):
        strat_train_set = file.loc[train_index]
        strat_test_set = file.loc[test_index]
    return strat_train_set, strat_test_set

In [282]:
def plot_data(df):
    plot = df.plot(kind="scatter", figsize=(12,7), x='latitude',
                   y='longitude', c="median_house_value", cmap=plt.get_cmap("jet"),
                   s=df['population']/100, sharex=False)
    plot.legend()
    return plot

In [283]:
def plot_features(df):
    return df.hist(bins=50, figsize=(20,15))

In [284]:
def split_data_labels(df): 
    data = df.drop(["median_house_value"], axis='columns')
    labels = dfp["median_house_value"]
    return data, lables

## create pipeline


* Split DataFrame into catagorical and numeric parts

In [290]:
df = import_file(PATH)
list(df.keys())

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value',
 'ocean_proximity']

In [291]:
def split_cat_num(df): 
    cat_df = df["ocean_proximity"]
    num_df = df.drop(["ocean_proximity"], axis="columns")
    return cat_df, num_df

In [292]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        rooms_ix, bedrooms_ix, population_ix, household_ix = [
        list(X.columns).index(col)
        for col in ("total_rooms", "total_bedrooms", "population", "households")]

        rooms_per_household = X.iloc[:, rooms_ix] / X.iloc[:, household_ix]
        population_per_household = X.iloc[:,household_ix] / X.iloc[:, population_ix]
        if self.add_bedrooms_per_room: 
            bedrooms_per_room = X.iloc[:, rooms_ix] / X.iloc[:, bedrooms_ix]
            return np.c_[X, bedrooms_per_room, population_per_household, rooms_per_household]
        else: 
            return np.c_[X, population_per_household, rooms_per_household]

In [293]:
class DataFrameSelecotr(BaseEstimator, TransformerMixin):
    def __init__(self, column_names):
        self.column_names = column_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.column_names].values

In [295]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [306]:
np.char.isnumeric(df["latitude"].iloc[0])

TypeError: isnumeric is only available for Unicode strings and arrays

In [299]:
key_list = list(df.columns)
for key in key_list:
    if df[key].iloc[0].isalpha():
        print("alphanumeric")
    else: 
        print("numeric")

AttributeError: 'numpy.float64' object has no attribute 'isalpha'