# <span style="color: green">Part II – Pre-Processing</span>

In [1]:
import pandas as pd
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

%matplotlib inline

In [2]:
# Load the raw training set; read_csv's defaults already assume a comma
# separator and a header in the first row.
raw_data = pd.read_csv('train.csv')

In [None]:
# Work on an independent copy so that raw_data is left untouched by the
# cleaning steps below.
bikeDataPP = raw_data.copy(deep=True)

## Scatter Matrix of weather features:

In [None]:
# Pairwise scatter plots of the four weather columns, before any cleaning.
scatter = scatter_matrix(
    frame=bikeDataPP[['temp', 'atemp', 'humidity', 'windspeed']],
    figsize=(16, 10),
)

### Removing outliers:

In [None]:
# Give every row a plotting colour: 'lavender' by default, 'black' for rows
# that match at least one of the outlier rules combined below.
# color lavender found here: https://matplotlib.org/examples/color/named_colors.html
outlier_rows = (
    (bikeDataPP['humidity'] < 10)
    | (bikeDataPP['windspeed'] > 45)
    | ((bikeDataPP['atemp'] < 15) & (bikeDataPP['temp'] > 25))
    | ((bikeDataPP['atemp'] > 24) & (bikeDataPP['atemp'] < 27) & (bikeDataPP['temp'] > 23))
    | ((bikeDataPP['atemp'] > 44) & (bikeDataPP['humidity'] < 37.5))
    | ((bikeDataPP['atemp'] > 42) & (bikeDataPP['humidity'] < 32))
    | ((bikeDataPP['atemp'] < 5) & (bikeDataPP['humidity'] > 55))
    | ((bikeDataPP['atemp'] < 2) & (bikeDataPP['humidity'] < 40))
    | ((bikeDataPP['atemp'] < 6) & (bikeDataPP['humidity'] < 30))
    | ((bikeDataPP['atemp'] > 25) & (bikeDataPP['humidity'] < 11) & (bikeDataPP['humidity'] > 9))
)
bikeDataPP['color'] = 'lavender'
bikeDataPP.loc[outlier_rows, 'color'] = 'black'

## Scatter Matrix of weather features showing outliers:

In [None]:
# Same scatter matrix, with each point drawn in the colour stored in the
# 'color' column so that flagged rows stand out.
scatter = scatter_matrix(
    frame=bikeDataPP[['temp', 'atemp', 'humidity', 'windspeed']],
    figsize=(16, 10),
    color=bikeDataPP['color'],
)

In [None]:
# Remove, in place, every row whose colour marks it as an outlier.
bikeDataPP.drop(index=bikeDataPP.loc[bikeDataPP['color'] == 'black'].index, inplace=True)

In [None]:
# The helper colour column is no longer needed once the flagged rows are gone.
bikeDataPP.drop(columns='color', inplace=True)

## Scatter Matrix of weather features after removing outliers:

In [None]:
# Scatter matrix of the weather columns on the cleaned frame.
scatter = scatter_matrix(
    frame=bikeDataPP[['temp', 'atemp', 'humidity', 'windspeed']],
    figsize=(16, 10),
)

## <U>Custom transformers:</U>

### Column Renamer Transformer

In [None]:
class Column_Renamer(BaseEstimator, TransformerMixin):
    """Stateless transformer that renames a single DataFrame column.

    Parameters
    ----------
    colname : label
        Current name of the column.
    new_name : label
        Name the column carries in the transformed output.
    """

    def __init__(self, colname, new_name):
        self.colname = colname
        self.new_name = new_name

    def fit(self, X, y=None):
        """Learn nothing from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a renamed copy of ``X``; ``X`` itself is left unchanged."""
        mapping = {self.colname: self.new_name}
        # A non-inplace rename hands back a new frame.
        return X.rename(columns=mapping)

In [None]:
# Renames the 'count' column to 'total'.
count_to_total_column_renamer = Column_Renamer(colname='count', new_name='total')

### Column Copier Transformer

In [None]:
class Copy_Column(BaseEstimator, TransformerMixin):
    """Stateless transformer that duplicates a column under a second name.

    Parameters
    ----------
    colname : label
        Column whose values are duplicated.
    new_name : label
        Name of the column that receives the duplicated values.
    """

    def __init__(self, colname, new_name):
        self.colname = colname
        self.new_name = new_name

    def fit(self, X, y=None):
        """Learn nothing from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``new_name`` holding ``colname``'s values."""
        duplicated = X.copy()
        source_values = duplicated[self.colname]
        duplicated[self.new_name] = source_values
        return duplicated

In [None]:
# Two copies of 'hour' are created because each one is later binned with its
# own set of edges (see the hour_reg / hour_cas digitizers).
copy_hour_to_hour_reg_transformer = Copy_Column(colname='hour', new_name='hour_reg')
copy_hour_to_hour_cas_transformer = Copy_Column(colname='hour', new_name='hour_cas')

### Drop Column Transformer

In [None]:
class Drop_Column(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a column from a DataFrame.

    Parameters
    ----------
    colname : label
        Column to remove.
    """

    def __init__(self, colname):
        self.colname = colname

    def fit(self, X, y=None):
        """Learn nothing from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` without ``colname``; ``X`` is left unchanged."""
        # Copy first so that the returned frame never shares data with X.
        return X.copy().drop(columns=self.colname)

In [None]:
# One dropper per column that is removed by the pipeline.
drop_temp_column_transformer = Drop_Column(colname='temp')
drop_hour_column_transformer = Drop_Column(colname='hour')
drop_weekday_column_transformer = Drop_Column(colname='weekday')

### Binned Data Digitizer Transformer

In [None]:
class Digitizer(BaseEstimator, TransformerMixin):
    """Replace a numeric column by the string label of the bin each value falls in.

    Labels have the form ``'<low>_<high>'``. The first label starts at the first
    bin edge; every following label starts one above the previous edge, e.g.
    ``bins=[0, 10, 20, 30, 45]`` gives ``'0_10', '11_20', '21_30', '31_45'``.

    Parameters
    ----------
    colname : label
        Column to discretise.
    bins : sequence of numbers
        Bin edges handed to ``pd.cut``.
    """

    def __init__(self, colname, bins):
        self.colname = colname
        self.bins = bins

    def fit(self, X, y=None):
        """Build the bin labels from ``bins`` (the data itself is not inspected)."""
        first_edge = self.bins[0]
        lower_bounds = []
        for edge in self.bins[:-1]:
            # Only the very first edge is used as-is; later bins start one above it.
            lower_bounds.append(edge if edge == first_edge else edge + 1)
        upper_bounds = self.bins[1:]
        self.labels = ['{}_{}'.format(low, high)
                       for low, high in zip(lower_bounds, upper_bounds)]
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose ``colname`` holds bin labels as strings."""
        result = X.copy()
        categories = pd.cut(result[self.colname],
                            bins=self.bins,
                            labels=self.labels,
                            include_lowest=True)
        # NOTE(review): values outside the outermost edges get no bin from
        # pd.cut and presumably become the string 'nan' here -- confirm.
        result[self.colname] = categories.astype(str)
        return result

In [None]:
# Bin edges for wind speed and for the two copies of the hour column.
windspeed_digitizer_transformer = Digitizer(colname='windspeed', bins=[0, 10, 20, 30, 45])
hour_reg_digitizer_transformer = Digitizer(colname='hour_reg', bins=[0, 6, 9, 15, 20, 23])
hour_cas_digitizer_transformer = Digitizer(colname='hour_cas', bins=[0, 8, 20, 23])

### Transformer that adds date-derived columns (month, hour, weekday, isweekend)

In [None]:
def add_detail_date_columns(df):
    """Return a copy of ``df`` with parsed timestamps and date-derived columns.

    The input frame is not modified, in line with the other transformers in
    this notebook, which all work on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``datetime`` column that pandas can parse as timestamps.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which ``datetime`` is converted to timestamps and
        these columns are added:

        * ``month``     -- month number of the timestamp
        * ``hour``      -- hour of day of the timestamp
        * ``weekday``   -- day of week, 0 is Mon, 1 is Tue etc...
        * ``isweekend`` -- 1 when ``weekday`` is 5 or 6, otherwise 0
    """
    out = df.copy()
    # to_datetime on a Series keeps the Series' index, so the parsed values
    # stay aligned with their rows even when the index has gaps.
    timestamps = pd.to_datetime(out['datetime'])
    out['datetime'] = timestamps
    # Vectorised .dt accessors replace the former per-row apply(lambda ...).
    out['month'] = timestamps.dt.month
    out['hour'] = timestamps.dt.hour
    out['weekday'] = timestamps.dt.dayofweek
    out['isweekend'] = out['weekday'].isin([5, 6]).astype(int)
    return out
    
# Wrap the plain function so it can be used as a pipeline step; validate=False
# is passed so the DataFrame reaches the function without input validation.
add_detail_date_columns_transformer = FunctionTransformer(add_detail_date_columns, validate=False)

### Categorical data binarizer transformers

In [None]:
class Binarizer(BaseEstimator, TransformerMixin):
    """One-hot encode a column and replace it by its indicator columns.

    The set of categories is learned in ``fit``. ``transform`` then always
    emits one indicator column per learned category, so frames transformed
    after fitting (e.g. a test set) get the same columns as the training
    frame even when some categories are absent from them. Values that were
    not seen during ``fit`` get zeros in every indicator column.

    Parameters
    ----------
    colname : label
        Column to encode; it is removed from the output.
    drop_first : bool
        Passed to ``pd.get_dummies``: when true, the indicator column of the
        first category is left out.
    """

    def __init__(self, colname, drop_first):
        self.colname = colname
        self.drop_first = drop_first

    def fit(self, X, y=None):
        """Record the categories present in ``X[colname]``."""
        # pd.Categorical infers the same set of levels that pd.get_dummies
        # derives from the raw column, so fit_transform on one frame produces
        # the same columns as calling get_dummies on it directly.
        self.categories_ = pd.Categorical(X[self.colname]).categories
        return self

    def transform(self, X, y=None):
        """Return ``X`` with ``colname`` replaced by indicator columns."""
        values = X[self.colname]
        categories = getattr(self, 'categories_', None)
        if categories is not None:
            # Pin the levels to those learned in fit so the output columns do
            # not depend on which categories happen to occur in this frame.
            values = pd.Series(pd.Categorical(values, categories=categories),
                               index=X.index,
                               name=self.colname)
        # Without a prior fit the categories are taken from X itself.
        dummies = pd.get_dummies(values,
                                 prefix=self.colname,
                                 drop_first=self.drop_first)
        X_no_col = X.drop(self.colname, axis=1)
        return X_no_col.join(dummies)

In [None]:
# One encoder per categorical column; every category keeps its own indicator
# column (drop_first=False).
windspeed_binarizer_transformer = Binarizer('windspeed', drop_first=False)
season_binarizer_transformer = Binarizer('season', drop_first=False)
weather_binarizer_transformer = Binarizer('weather', drop_first=False)
month_binarizer_transformer = Binarizer('month', drop_first=False)
hour_reg_binarizer_transformer = Binarizer('hour_reg', drop_first=False)
hour_cas_binarizer_transformer = Binarizer('hour_cas', drop_first=False)

### Standard Scaler Transformer

In [None]:
class Standard_Scaler(BaseEstimator, TransformerMixin):
    """Apply scikit-learn's ``StandardScaler`` to selected DataFrame columns.

    All other columns are passed through unchanged.

    Parameters
    ----------
    colnames : list of labels
        Columns to scale.
    """

    def __init__(self, colnames):
        self.colnames = colnames

    def fit(self, X, y=None):
        """Fit a ``StandardScaler`` on ``X[colnames]`` and keep it on ``self.scaler``."""
        self.scaler = StandardScaler().fit(X.loc[:, self.colnames])
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose ``colnames`` hold the scaled values."""
        X_copy = X.copy()
        scaled = self.scaler.transform(X_copy.loc[:, self.colnames])
        # Replace the columns as a whole instead of writing into them with
        # `.loc[:, cols] = ...`: the scaled values are floats, and writing
        # floats into an existing integer column through .loc is a deprecated
        # incompatible-dtype assignment in recent pandas releases.
        X_copy[self.colnames] = scaled
        return X_copy

In [None]:
# Scales the two numeric weather columns that the pipeline does not bin.
standard_scaler_transformer = Standard_Scaler(colnames=['atemp', 'humidity'])

# Pipeline:

In [None]:
# Ordered (name, transformer) pairs. The order matters: the date columns must
# exist before they are copied or dropped, and every digitizer has to run
# before the binarizer that encodes its labels.
steps = [
    ('count_to_total_column_renamer', count_to_total_column_renamer),
    # date-derived columns
    ('add_detail_date_columns_transformer', add_detail_date_columns_transformer),
    ('drop_weekday_column_transformer', drop_weekday_column_transformer),
    ('drop_temp_column_transformer', drop_temp_column_transformer),
    # 'hour' is duplicated twice, then the original is removed
    ('copy_hour_to_hour_reg_transformer', copy_hour_to_hour_reg_transformer),
    ('copy_hour_to_hour_cas_transformer', copy_hour_to_hour_cas_transformer),
    ('drop_hour_column_transformer', drop_hour_column_transformer),
    # binning and one-hot encoding
    ('windspeed_digitizer_transformer', windspeed_digitizer_transformer),
    ('windspeed_binarizer_transformer', windspeed_binarizer_transformer),
    ('season_binarizer_transformer', season_binarizer_transformer),
    ('weather_binarizer_transformer', weather_binarizer_transformer),
    ('month_binarizer_transformer', month_binarizer_transformer),
    ('hour_reg_digitizer_transformer', hour_reg_digitizer_transformer),
    ('hour_reg_binarizer_transformer', hour_reg_binarizer_transformer),
    ('hour_cas_digitizer_transformer', hour_cas_digitizer_transformer),
    ('hour_cas_binarizer_transformer', hour_cas_binarizer_transformer),
    # scaling of the remaining numeric weather columns
    ('standard_scaler_transformer', standard_scaler_transformer),
]

transformers_pipeline = Pipeline(steps=steps)

In [None]:
# Fit every pipeline step on the cleaned frame and rebind bikeDataPP to the
# transformed result.
# NOTE(review): because the name is rebound to the output, re-running this
# cell feeds already-transformed data back in (e.g. 'temp' has been dropped
# by then) -- re-run from the data-loading cell instead; confirm.
bikeDataPP = transformers_pipeline.fit_transform(bikeDataPP)

In [None]:
# Preview the first five rows of the transformed frame.
bikeDataPP.head(n=5)

In [None]:
# Print a summary of the transformed frame (columns, dtypes, non-null counts).
bikeDataPP.info()

## Saving the prepared data

In [None]:
# Write the prepared frame to disk; the row index is written as well (the
# to_csv default) under the header 'Index'.
bikeDataPP.to_csv('bikeDataPP.csv', index_label='Index')