# Recap
- we have seen how to build Pipelines of Transformers and Estimators with scikit-learn (`Pipeline`, `make_pipeline`)
- we have seen how to create a concatenation of transformers with scikit-learn (`FeatureUnion`, `make_union`)

Now let's do some exercise with some data!

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('../data/house_train.csv')

In [3]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

## Some pandas Recap

- Identify the types of column
- Identify columns containing missing values. 
- For each of these columns calculate percentage of data containing missing values. What would you do with these columns? 

<div>

for item in train.columns[train.isnull().any()].tolist():
    print(item,'-', train[item].dtype,'-', len(train[train[item].isnull()])/len(train))
</div>

In [5]:
data.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
Alley             object
LotShape          object
LandContour       object
Utilities         object
LotConfig         object
LandSlope         object
Neighborhood      object
Condition1        object
Condition2        object
BldgType          object
HouseStyle        object
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
RoofStyle         object
RoofMatl          object
Exterior1st       object
Exterior2nd       object
MasVnrType        object
MasVnrArea       float64
ExterQual         object
ExterCond         object
Foundation        object
                  ...   
BedroomAbvGr       int64
KitchenAbvGr       int64
KitchenQual       object
TotRmsAbvGrd       int64
Functional        object
Fireplaces         int64
FireplaceQu       object
GarageType        object
GarageYrBlt      float64


In [6]:
col_na = data.columns[data.isna().any()].tolist()

In [7]:
data.isnull().sum() / len(data)

Id               0.000000
MSSubClass       0.000000
MSZoning         0.000000
LotFrontage      0.177397
LotArea          0.000000
Street           0.000000
Alley            0.937671
LotShape         0.000000
LandContour      0.000000
Utilities        0.000000
LotConfig        0.000000
LandSlope        0.000000
Neighborhood     0.000000
Condition1       0.000000
Condition2       0.000000
BldgType         0.000000
HouseStyle       0.000000
OverallQual      0.000000
OverallCond      0.000000
YearBuilt        0.000000
YearRemodAdd     0.000000
RoofStyle        0.000000
RoofMatl         0.000000
Exterior1st      0.000000
Exterior2nd      0.000000
MasVnrType       0.005479
MasVnrArea       0.005479
ExterQual        0.000000
ExterCond        0.000000
Foundation       0.000000
                   ...   
BedroomAbvGr     0.000000
KitchenAbvGr     0.000000
KitchenQual      0.000000
TotRmsAbvGrd     0.000000
Functional       0.000000
Fireplaces       0.000000
FireplaceQu      0.472603
GarageType  

In [8]:
for col in col_na:
    perc = data[col].isnull().sum() / len(data)
    print(f'col: {col}: {perc:.2f}')


col: LotFrontage: 0.18
col: Alley: 0.94
col: MasVnrType: 0.01
col: MasVnrArea: 0.01
col: BsmtQual: 0.03
col: BsmtCond: 0.03
col: BsmtExposure: 0.03
col: BsmtFinType1: 0.03
col: BsmtFinType2: 0.03
col: Electrical: 0.00
col: FireplaceQu: 0.47
col: GarageType: 0.06
col: GarageYrBlt: 0.06
col: GarageFinish: 0.06
col: GarageQual: 0.06
col: GarageCond: 0.06
col: PoolQC: 1.00
col: Fence: 0.81
col: MiscFeature: 0.96


In [9]:
for item in data.columns[data.isnull().any()].tolist():
    print(item, '_', data[item].dtype, '-', len(data[data[item].isnull()])/len(data))

LotFrontage _ float64 - 0.1773972602739726
Alley _ object - 0.9376712328767123
MasVnrType _ object - 0.005479452054794521
MasVnrArea _ float64 - 0.005479452054794521
BsmtQual _ object - 0.025342465753424658
BsmtCond _ object - 0.025342465753424658
BsmtExposure _ object - 0.026027397260273973
BsmtFinType1 _ object - 0.025342465753424658
BsmtFinType2 _ object - 0.026027397260273973
Electrical _ object - 0.0006849315068493151
FireplaceQu _ object - 0.4726027397260274
GarageType _ object - 0.05547945205479452
GarageYrBlt _ float64 - 0.05547945205479452
GarageFinish _ object - 0.05547945205479452
GarageQual _ object - 0.05547945205479452
GarageCond _ object - 0.05547945205479452
PoolQC _ object - 0.9952054794520548
Fence _ object - 0.8075342465753425
MiscFeature _ object - 0.963013698630137


This is what we are going to do in this exercise

<img src='../images/diagram_exercise.png'>

In [10]:
# drop MiscFeature, Fence, PoolQC, Alley
data.drop(labels=['MiscFeature','Fence','PoolQC','Alley'],axis=1,inplace=True)

In [11]:
data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'WoodD

In [12]:
# drop rows
data = data[(data.MasVnrArea.isnull()==False) & (data.MasVnrType.isnull()==False) & (data.Electrical.isnull()==False)]

In [13]:
len(data)

1451

In [14]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


Exercises:
-----------
1. create a Tranformer ```Selector``` that selects columns from the dataframe (*hint*: the Transformer takes a list of columns as input and returns the dataframe with only the selected columns
2. What kind of transformer is it? Stateful? Stateless?

In [15]:
class Selector():
    
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, df):
        return df.loc[:,self.columns]
        

In [16]:
select_id = Selector(columns = ['Id'])

In [17]:
select_id.transform(data).head()

Unnamed: 0,Id
0,1
1,2
2,3
3,4
4,5


In [18]:
# this is a stateless transformer

**Solution**
<div>
from sklearn.base import BaseEstimator, TransformerMixin

class PandasSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns):
        self.columns = columns
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X):
        return X.loc[:,self.columns]
</div>

Exercise:
----------
1. create a Transformer that takes a list of categorical columns and replace the missing values with a new value (ex.: 'Not-given'). The new value can be given as an attribute 
2. create a Transformer that takes a list of categorical columns and replace the missing values with the most common value
3. verify the Transformers above on one test column

In [19]:
from sklearn.base import BaseEstimator, TransformerMixin

class Replacer(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns, new_value):
        self.columns = columns
        self.new_value = new_value
    
    def fit(self, df, y = None):
        return self
    
    def transform(self, df):
        df_ = df
        df_[self.columns] = df.loc[:,self.columns].fillna(self.new_value)
        return df_
    

In [20]:
for item in data.columns[data.isnull().any()].tolist():
    print(item, '_', data[item].dtype, '-', len(data[data[item].isnull()])/len(data))

LotFrontage _ float64 - 0.17711922811853895
BsmtQual _ object - 0.025499655410062026
BsmtCond _ object - 0.025499655410062026
BsmtExposure _ object - 0.026188835286009647
BsmtFinType1 _ object - 0.025499655410062026
BsmtFinType2 _ object - 0.026188835286009647
FireplaceQu _ object - 0.4720882150241213
GarageType _ object - 0.055823569951757405
GarageYrBlt _ float64 - 0.055823569951757405
GarageFinish _ object - 0.055823569951757405
GarageQual _ object - 0.055823569951757405
GarageCond _ object - 0.055823569951757405


In [21]:
replace_nan_GarageType = Replacer(columns=['GarageType'], new_value='none')

In [22]:
rep = replace_nan_GarageType.transform(data)

In [23]:
for item in rep.columns[rep.isnull().any()].tolist():
    print(item, '_', rep[item].dtype, '-', len(rep[data[item].isnull()])/len(rep))

LotFrontage _ float64 - 0.17711922811853895
BsmtQual _ object - 0.025499655410062026
BsmtCond _ object - 0.025499655410062026
BsmtExposure _ object - 0.026188835286009647
BsmtFinType1 _ object - 0.025499655410062026
BsmtFinType2 _ object - 0.026188835286009647
FireplaceQu _ object - 0.4720882150241213
GarageYrBlt _ float64 - 0.055823569951757405
GarageFinish _ object - 0.055823569951757405
GarageQual _ object - 0.055823569951757405
GarageCond _ object - 0.055823569951757405


In [24]:
class ReplacerCommon(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self, X, y = None):
        assert X.dtypes.all() == 'object'
        
        self.fill = pd.Series([X[c].value_counts().index[0] for c in X], index=X.columns)
        return self
    
    def transform(self, X):
        X_ = X.fillna(self.fill)
        return X_

In [25]:
replacer = ReplacerCommon()

In [26]:
X = data[['BsmtQual', 'BsmtCond', 'BsmtExposure']]
replacer.fit(X)

ReplacerCommon()

In [27]:
replacer.transform(X).head()

Unnamed: 0,BsmtQual,BsmtCond,BsmtExposure
0,Gd,TA,No
1,Gd,TA,Gd
2,Gd,TA,Mn
3,TA,Gd,No
4,Gd,TA,Av


**Solution**

<div>
class CategoricanImputerNew(BaseEstimator,TransformerMixin):
    
    def __init__(self, value):
        self.value = value
    
    def fit(self, X, y=None):
        assert X.dtypes.all() == 'object'
        return self
    
    def transform(self, X):
        X_ = X.fillna(self.value)
        return X_
        
class CategoricalImputerMostCommon(BaseEstimator,TransformerMixin):
    
    def __init__(self):
        """assert that all columns are of categorical type"""
    
    def fit(self, X, y=None):
        assert X.dtypes.all() == 'object'
        self.fill = pd.Series([X[c].value_counts().index[0] for c in X],
            index=X.columns)
        return self
    
    def transform(self, X):
        X_ = X.fillna(self.fill)
        return X_
</div>

In [28]:
#train.groupby(['BsmtQual']).size()
#c = CategoricalImputerMostCommon()
#c.fit(train[['BsmtQual']])
#c_t = c.transform(train[['BsmtQual']])

Exercise:
----------
1. What would you do for missing values in numerical columns? Have a look at scikit-learn `Imputer`
2. Construct the first piece of the pipeline by combining the Columns selector and the inputer for Numerical and Categorical columns

In [29]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, make_union
from pandas.api.types import is_numeric_dtype
from category_encoders.one_hot import OneHotEncoder

num_cols = []
cat_cols = []
for col in data.columns:
    if is_numeric_dtype(data[col]):
        num_cols.append(col)
    else:
        cat_cols.append(col)

# we need to remove te SalePrice because this is the label
num_cols.remove('SalePrice')

In [30]:
data[num_cols].head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,548,0,61,0,0,0,0,0,2,2008
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,460,298,0,0,0,0,0,0,5,2007
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,608,0,42,0,0,0,0,0,9,2008
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,642,0,35,272,0,0,0,0,2,2006
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,836,192,84,0,0,0,0,0,12,2008


In [31]:
data[cat_cols].head()

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,Gd,Typ,,Attchd,RFn,TA,TA,Y,WD,Normal
1,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,TA,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal
2,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,Gd,Typ,Gd,Detchd,Unf,TA,TA,Y,WD,Abnorml
4,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal


In [32]:
imp = SimpleImputer(strategy='median')


In [33]:
tf = imp.fit_transform(data[num_cols])

In [34]:
first_union = make_union (
    #some numerical columns
        make_pipeline(
            Selector(num_cols),
            SimpleImputer(strategy='median')
        ),
    #some categorical columns
        make_pipeline(
            Selector(cat_cols),
            ReplacerCommon(),
            OneHotEncoder(),
        ))

In [35]:
first_union.fit(data)

FeatureUnion(n_jobs=None,
       transformer_list=[('pipeline-1', Pipeline(memory=None,
     steps=[('selector', <__main__.Selector object at 0x11fa08390>), ('simpleimputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0))])), ('pipeline-2', Pipeline(memory=None,
     steps=[('sel...e_unknown='impute', impute_missing=True,
       return_df=True, use_cat_names=False, verbose=0))]))],
       transformer_weights=None)

In [36]:
X_ = first_union.transform(data)

In [37]:
np.shape(data)

(1451, 77)

In [38]:
np.shape(X_)

(1451, 316)

In [39]:
X_

array([[1.000e+00, 6.000e+01, 6.500e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.000e+00, 2.000e+01, 8.000e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [3.000e+00, 6.000e+01, 6.800e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [1.458e+03, 7.000e+01, 6.600e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.459e+03, 2.000e+01, 6.800e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.460e+03, 2.000e+01, 7.500e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

**Solution**
<div>
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Imputer

num_columns = ['OverallQual',
'GrLivArea',
'GarageCars',
'GarageArea',
'TotalBsmtSF',
'1stFlrSF',
'FullBath',
'TotRmsAbvGrd',
'YearBuilt',
'YearRemodAdd',
'GarageYrBlt',
'MasVnrArea',
'Fireplaces',
'BsmtFinSF1',
'LotFrontage']
cat_columns = ['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1']

first_union = make_union(
    #some numerical columns
        make_pipeline(
            PandasSelector(num_columns),
            Imputer(strategy='median')
        ),
    #some categorical columns
        make_pipeline(
            PandasSelector(cat_columns),
            CategoricalImputerMostCommon()
            ))
</div>

In [40]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


Exercise:
----------
1. Have a look at the categorical features. How many unique values do they contain?
2. Have a look at OneHotEncoder from category_encoders: `from category_encoders.one_hot import OneHotEncoder`

In [41]:
# HERE! Discuss difference between pd.get_dummies and OneHotEncoder (scikit-learn vs category_encoders)

In [42]:
data[cat_cols].nunique()

MSZoning          5
Street            2
LotShape          4
LandContour       4
Utilities         2
LotConfig         5
LandSlope         3
Neighborhood     25
Condition1        9
Condition2        8
BldgType          5
HouseStyle        8
RoofStyle         6
RoofMatl          8
Exterior1st      15
Exterior2nd      16
MasVnrType        4
ExterQual         4
ExterCond         5
Foundation        6
BsmtQual          4
BsmtCond          4
BsmtExposure      4
BsmtFinType1      6
BsmtFinType2      6
Heating           6
HeatingQC         5
CentralAir        2
Electrical        5
KitchenQual       4
Functional        7
FireplaceQu       5
GarageType        7
GarageFinish      3
GarageQual        5
GarageCond        5
PavedDrive        3
SaleType          9
SaleCondition     6
dtype: int64

Exercise:
----------
Add OneHotEncoder to to the Union

**Click here to see the solution**
<div>
from category_encoders.one_hot import OneHotEncoder
final_union = make_union(
    #some numerical columns
        make_pipeline(
            PandasSelector(num_columns),
            Imputer(strategy='median')
        ),
    #some categorical columns
        make_pipeline(
            PandasSelector(cat_columns),
            CategoricalImputerMostCommon(),
            OneHotEncoder()
            ))
</div>

The final step would be to add an estimator (a predictor) to the pipeline above.

Exercise
---------

1. Run and understand the code below (explain each line)
2. The problem is to predict future prices is cross validation a good method?
3. Try to use a train/test split based on YrSold

In [43]:
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
import math

est = RandomForestRegressor()

predictions = cross_val_predict(make_pipeline(first_union, est), # the pipeline
                                data,                            # X
                                np.log1p(data.SalePrice),        # y, the ground truth
                                cv=5)                            # 5-fold cross validation

print(math.sqrt(mean_squared_error(data.SalePrice, np.expm1(predictions))))



32271.620423229902


In [44]:
training_data = data[data.YrSold <= 2009]
test_data = data[data.YrSold > 2009]

model = make_pipeline(first_union, est)

model.fit(training_data, np.log1p(training_data.SalePrice)) # train the model on the training data

predictions = model.predict(test_data)                      # make predictions

print(math.sqrt(mean_squared_error(np.log(test_data.SalePrice), np.log(np.expm1(predictions)))))



0.1338373724175


**Solution**

<div>
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
#from sklearn.metrics import rmse
from sklearn.metrics import mean_squared_error

est = RandomForestRegressor()
training_data = data[data.YrSold <= 2009]
test_data = data[data.YrSold > 2009]
model = make_pipeline(final_union, est)
model.fit(training_data, np.log1p(training_data.SalePrice))
predictions = model.predict(test_data)
print(math.sqrt(mean_squared_error(np.log(test_data.SalePrice), np.log(np.expm1(predictions)))))
</div>

# Experiments with Time Series

The dataframe contains house prices in a time range between 2006 and 2010. Let's have a look at the sale price based on time.

Exercise:
----------
- Create a pandas dataframe containing the average house prices per Year-Month (*hint*: first create a new datetype colum that concatenate YrSold and MoSold)
- Create a plot of the price by Month. Do you see some seasonality? Trend?

In [51]:
data['MonthSold'] = data['YrSold'].apply(str) + '-' + data['MoSold'].apply(str)
data.groupby(['MonthSold'])['SalePrice'].mean()

data['MonthSold'] = pd.to_datetime(data.YrSold*10000+data.MoSold*100, format='%Y%m')

MonthSold
2006-1     201090.000000
2006-10    172356.708333
2006-11    213285.000000
2006-12    185545.000000
2006-2     194322.222222
2006-3     184982.200000
2006-4     174312.814815
2006-5     158928.289474
2006-6     172283.333333
2006-7     183211.059701
2006-8     196239.956522
2006-9     206538.071429
2007-1     183234.615385
2007-10    215428.812500
2007-11    197614.086957
2007-12    214414.500000
2007-2     176301.750000
2007-3     175474.318182
2007-4     170772.608696
2007-5     169873.511628
2007-6     179725.813559
2007-7     195396.843137
2007-8     183941.075000
2007-9     195533.818182
2008-1     178504.692308
2008-10    166690.636364
2008-11    210981.058824
2008-12    175107.692308
2008-2     159370.000000
2008-3     175064.411765
2008-4     159293.346154
2008-5     188423.888889
2008-6     169730.941176
2008-7     174562.653061
2008-8     196076.965517
2008-9     169626.470588
2009-1     189735.500000
2009-10    175206.592593
2009-11    156381.818182
2009-12    1640

In [54]:
data['MonthSold'] = pd.to_datetime(data['YrSold']*10000 + data['MoSold']*100, format='%Y%m')

TypeError: 'int' object is unsliceable

<div>
train.loc[:,'Mo-YrSold'] = pd.to_datetime(train.YrSold.astype('str')+train.MoSold.astype('str'), format = '%Y%m')
ts_agg = train.groupby('Mo-YrSold')['SalePrice'].mean().reset_index(name='meanSalePrice')

from matplotlib import pyplot as plt
%matplotlib inline

ts_agg.plot(x='Mo-YrSold',y='meanSalePrice')
</div>

From the plot above we see a huge drop in summer 2010. Maybe this is due to too much noise since we are considering ALL the houses together? Let's try to build this plot for different overall conditions

Exercise:
----------
Consider the dataframe created above that contains the (average) price of houses by Month-Year. In this exercise we construct predictor for simple time series forecasting methods.
The predictor should have three methods: ```fit``` (to fit the data), ```predict``` (to return the prediction) and ```score``` (to calculate the error). 

1. Simple Average: Build an estimator that when fitted on the dataframe computes the avg price overall and returns it as prediction. Which metric would you choose for score?
2. Simple Moving Average: Generalize the estimator above so that a simple moving average of the last k points is possible (*hint* k should be given as argument in __init__ method)
3. Weighted Moving Average: Generalize the estimator above so that a weighted moving average of the last k points is given (*hint* beside k, a list of weights should be given as argument)

Plot the predictions for the three predictors above

In [46]:
from sklearn.base import RegressorMixin

**Solution**
<div>
import math

class AvgForecast(BaseEstimator,RegressorMixin):
    """Calculate avg and return it as prediction"""
        
    def fit(self, X):
        """Calculate avg of X values"""
        self.avg = np.mean(X)
        return self
    
    def predict(self, X):
        self.predict = np.array([self.avg for x in X])
        return self.predict
    
    def score(self, X):
        """Calculate rmse"""
        return math.sqrt(np.sum((X-self.predict)**2)/len(X))
 
class MovAvgForecast(BaseEstimator,RegressorMixin):
    """Calculate avg and return it as prediction"""
    
    def __init__(self, k):
        self.k = k
        
    def fit(self, X):
        """Calculate avg of X values"""
        k = self.k
        n = len(X)
        X_k = X[n-k:n]
        self.avg = np.mean(X_k)
        return self
    
    def predict(self, X):
        self.predict = np.array([self.avg for x in X])
        return self.predict
    
    def score(self, X):
        """Calculate rmse"""
        return math.sqrt(np.sum((X-self.predict)**2)/len(X))

    
class MovWeightAvgForecast(BaseEstimator,RegressorMixin):
    """Calculate avg and return it as prediction"""
    
    def __init__(self, k, weights):
        self.k = k
        self.weights = weights
        assert len(weights)==k
        assert np.sum(weights)==1
        
    def fit(self, X):
        """Calculate avg of X values"""
        k = self.k
        weights = self.weights
        n = len(X)
        X_k = X[n-k:n]
        self.avg = np.average(X_k,axis=0,weights=weights)
        return self
    
    def predict(self, X):
        self.predict = np.array([self.avg for x in X])
        return self.predict
    
    def score(self, X):
        """Calculate rmse"""
        return math.sqrt(np.sum((X-self.predict)**2)/len(X))
</div>

In [47]:
#x = np.array(ts_agg['meanSalePrice'])
#pred1 = pred1[0:10]  #pred1 is the result of predict for the first estimator
#x1 = np.append(x,pred1)
#pred2 = pred2[0:10]
#x2 = np.append(x,pred2)
#pred3 = pred3[0:10]
#x3 = np.append(x,pred3)

In [48]:
#fig = plt.figure()
#
#plt.plot(x1, color='r')
#plt.plot(x2, color='g')
#plt.plot(x3, color='b')


___________________________________

In [49]:
fig, ax = plt.subplots()
fig.set_figheight(5)
fig.set_figwidth(15)
for c in cond_unique:
    c_train= train[train['OverallCond']==c].groupby('Mo-YrSold')['SalePrice'].mean().reset_index(name='meanSalePrice')
    ax.plot(c_train['Mo-YrSold'], c_train['meanSalePrice'], label='OverallCond-'+str(c))
leg = ax.legend();


NameError: name 'plt' is not defined