In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/sales.csv')

In [3]:
df.dropna(subset=['price'], inplace=True)

In [4]:
df.head()

Unnamed: 0,cost,price,weight,purchase_date,product_type,product_level,maker,ingredient,height,width,depth
0,$333k,"$300,492",3 Ton 90 Kg,Dec 19 2008,"Q,B",advanced,M14122,"IN732052,IN732053",2.76 meters,97 cm,26 cm
1,,"$430,570",3 Ton 30 Kg,Sep 10 1997,"J,D",basic,,"IN732054,IN732055,IN732056,IN732057,IN732058",2.67 meters,98 cm,26 cm
2,$270k,"$213,070",3 Ton 40 Kg,Sep 05 2001,"J,D",basic,,"IN732054,IN732059,IN732060",3.0 meters,93 cm,24 cm
3,,"$229,174",3 Ton 50 Kg,Dec 23 2016,U,advanced,M14123,"IN732061,IN732062,IN732063",2.5 meters,102 cm,27 cm
4,$97k,"$122,659",2 Ton 970 Kg,Jan 12 2000,"D,R",advanced,,"IN732064,IN732065,IN732066",2.47 meters,101 cm,26 cm


In [5]:
df['year'] = pd.to_datetime(df.purchase_date).dt.year

train_raw = df[df.year < 2015].reset_index(drop=True)
test_raw = df[df.year >= 2015].reset_index(drop=True)

# 1. Dummy Transformer

<font color='red'>Assignment:</font> Build a **Dummy Value Transformer** and wrap it up with **LinearRegression** and **RandomForestRegressor** respectively to predict **price** using **product_type**. Compare the performance of these two models.

In [8]:
class Dummy_Transformer(object):
    
    def __init__(self):
        self.keys = set()    
    
    def fit(self, X, y=None):
        self.keys = self.get_keys(X)
    
    def transform(self, X, y=None):
        res = {}
        for key in self.keys:
            res[key] = [0]*len(X)    
        for i, item in enumerate(X):
            if type(item) == str:
                my_str = item.split(",")
                for str1 in my_str:
                    if str1 in self.keys:
                        res[str1][i] = 1
        return pd.DataFrame(res)
    
    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)
    
    def get_keys(self, X):
        return_set = set()
        for item in X:
            if type(item) == str:
                my_str = item.split(",")
                for str1 in my_str:
                    return_set.add(str1)
        return return_set    

In [12]:
y_train = train_raw['price'].map(lambda x: x if type(x) == float else float(x.strip('$').replace(',', '')))
y_test = test_raw['price'].map(lambda x: x if type(x) == float else float(x.strip('$').replace(',', '')))

In [48]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

steps = [('dt', Dummy_Transformer()),
         ('lr', LinearRegression())]

model = Pipeline(steps)

model.fit(train_raw['product_type'], y_train)

Pipeline(steps=[('dt',
                 <__main__.Dummy_Transformer object at 0x000001C86C2BFA30>),
                ('lr', LinearRegression())])

In [50]:
y_train_pred = model.predict(train_raw['product_type'])
print('train MAE: {0:.2e}'.format(mean_absolute_error(y_train, y_train_pred)))
print('train MSE: {0:.2e}'.format(mean_squared_error(y_train, y_train_pred)))
print('train R2: {0:.3f}'.format(r2_score(y_train, y_train_pred)))

y_test_pred = model.predict(test_raw['product_type'])
print('train MAE: {0:.2e}'.format(mean_absolute_error(y_test, y_test_pred)))
print('train MSE: {0:.2e}'.format(mean_squared_error(y_test, y_test_pred)))
print('train R2: {0:.3f}'.format(r2_score(y_test, y_test_pred)))

train MAE: 1.30e+05
train MSE: 3.74e+10
train R2: 0.123
train MAE: 1.48e+05
train MSE: 6.79e+10
train R2: 0.213


In [51]:
steps = [('dt', Dummy_Transformer()),
         ('rfr', RandomForestRegressor(max_depth=6))]

model = Pipeline(steps)

model.fit(train_raw['product_type'], y_train)

Pipeline(steps=[('dt',
                 <__main__.Dummy_Transformer object at 0x000001C86C2D8A90>),
                ('rfr', RandomForestRegressor(max_depth=6))])

In [52]:
y_train_pred = model.predict(train_raw['product_type'])
print('train MAE: {0:.2e}'.format(mean_absolute_error(y_train, y_train_pred)))
print('train MSE: {0:.2e}'.format(mean_squared_error(y_train, y_train_pred)))
print('train R2: {0:.3f}'.format(r2_score(y_train, y_train_pred)))

y_test_pred = model.predict(test_raw['product_type'])
print('train MAE: {0:.2e}'.format(mean_absolute_error(y_test, y_test_pred)))
print('train MSE: {0:.2e}'.format(mean_squared_error(y_test, y_test_pred)))
print('train R2: {0:.3f}'.format(r2_score(y_test, y_test_pred)))

train MAE: 1.27e+05
train MSE: 3.59e+10
train R2: 0.157
train MAE: 1.47e+05
train MSE: 6.41e+10
train R2: 0.256


# 2. Frequence Transformer

<font color='red'>Assignment:</font> Repeat above steps using **Frequence Transformer**.

In [17]:
class Frequence_Transformer(object):
    
    def __init__(self):
        self.keys = set()    
    
    def fit(self, X, y=None):
        self.keys = self.get_keys(X)
    
    def transform(self, X, y=None):
        res = {}
        for key in self.keys:
            res[key] = [0]*len(X)    
        for i, item in enumerate(X):
            if type(item) == str:
                my_str = item.split(",")
                for str1 in my_str:
                    if str1 in self.keys:
                        res[str1][i] = 1
        df = pd.DataFrame(res)
        for col in list(df.columns):
            freq = df[col].sum()
            df[col] = df[col].map(lambda x: x*freq)
        return df
    
    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)
    
    def get_keys(self, X):
        return_set = set()
        for item in X:
            if type(item) == str:
                my_str = item.split(",")
                for str1 in my_str:
                    return_set.add(str1)
        return return_set   

In [19]:
ft=Frequence_Transformer()
ft_rlt=ft.fit_transform(train_raw['product_type'])
ft_rlt.head(10)

Unnamed: 0,Q,E,O,I,C,W,D,N,A,P,...,B,K,V,T,J,H,G,F,L,U
0,90,0,0,0,0,0,0,0,0,0,...,181,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,992,0,0,0,...,0,0,0,0,186,0,0,0,0,0
2,0,0,0,0,0,0,992,0,0,0,...,0,0,0,0,186,0,0,0,0,0
3,0,0,0,0,0,0,992,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,520,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,427,0,...,181,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,520,0,0,0,0
8,0,0,0,0,0,0,992,0,0,0,...,0,99,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,290,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
steps = [('dt', Dummy_Transformer()),
         ('lr', LinearRegression())]

model = Pipeline(steps)

model.fit(train_raw['product_type'], y_train)

y_train_pred = model.predict(train_raw['product_type'])
print('train MAE: {0:.2e}'.format(mean_absolute_error(y_train, y_train_pred)))
print('train MSE: {0:.2e}'.format(mean_squared_error(y_train, y_train_pred)))
print('train R2: {0:.3f}'.format(r2_score(y_train, y_train_pred)))

y_test_pred = model.predict(test_raw['product_type'])
print('train MAE: {0:.2e}'.format(mean_absolute_error(y_test, y_test_pred)))
print('train MSE: {0:.2e}'.format(mean_squared_error(y_test, y_test_pred)))
print('train R2: {0:.3f}'.format(r2_score(y_test, y_test_pred)))

train MAE: 1.30e+05
train MSE: 3.74e+10
train R2: 0.123
train MAE: 1.48e+05
train MSE: 6.79e+10
train R2: 0.213


In [54]:
steps = [('dt', Dummy_Transformer()),
         ('rfr', RandomForestRegressor(max_depth=6))]

model = Pipeline(steps)

model.fit(train_raw['product_type'], y_train)

y_train_pred = model.predict(train_raw['product_type'])
print('train MAE: {0:.2e}'.format(mean_absolute_error(y_train, y_train_pred)))
print('train MSE: {0:.2e}'.format(mean_squared_error(y_train, y_train_pred)))
print('train R2: {0:.3f}'.format(r2_score(y_train, y_train_pred)))

y_test_pred = model.predict(test_raw['product_type'])
print('train MAE: {0:.2e}'.format(mean_absolute_error(y_test, y_test_pred)))
print('train MSE: {0:.2e}'.format(mean_squared_error(y_test, y_test_pred)))
print('train R2: {0:.3f}'.format(r2_score(y_test, y_test_pred)))

train MAE: 1.27e+05
train MSE: 3.59e+10
train R2: 0.157
train MAE: 1.47e+05
train MSE: 6.39e+10
train R2: 0.259


# 3. Label Encoder

In [44]:
from sklearn.preprocessing import LabelEncoder

<font color='red'>Question:</font> What does **LabelEncoder** do?

In [45]:
# LabelEncoder can be used to normalize numerical labels or be used to transform non-numeerical labels to numerical labels.

**Example:** The **LabelEncoder** module in **sklearn** can't be used in a **Pipeline** dicrectly, because it only takes one variable (X) as input. We can build a new module which is a **child class** of the **sklearn** module for **Pipeline**. You don't have to understand it now.

In [46]:
class newLabelEncoder(LabelEncoder):
    
    def fit(self, X, y=None):
        return super(newLabelEncoder, self).fit(X)
    
    def transform(self, X, y=None):
        return pd.DataFrame({'label': super(newLabelEncoder, self).transform(X)})
    
    def fit_transform(self, X, y=None):
        return pd.DataFrame({'label': super(newLabelEncoder, self).fit_transform(X)})

In [67]:
ft=newLabelEncoder()
ft_rlt=ft.fit_transform(train_raw['product_type'])
ft_rlt.head(10)

Unnamed: 0,label
0,40
1,23
2,23
3,9
4,17
5,46
6,2
7,17
8,26
9,31


<font color='red'>Assignment:</font> Wrap up **newLabelEncoder** with **LinearRegression** and **RandomForestRegressor** respectively to predict **price** using **product_type**. Compare the performance of these two models.

In [55]:
steps = [('nle', newLabelEncoder()),
         ('lr', LinearRegression())]

model = Pipeline(steps)

model.fit(train_raw['product_type'], y_train)

y_train_pred = model.predict(train_raw['product_type'])
print('train MAE: {0:.2e}'.format(mean_absolute_error(y_train, y_train_pred)))
print('train MSE: {0:.2e}'.format(mean_squared_error(y_train, y_train_pred)))
print('train R2: {0:.3f}'.format(r2_score(y_train, y_train_pred)))

y_test_pred = model.predict(test_raw['product_type'])
print('train MAE: {0:.2e}'.format(mean_absolute_error(y_test, y_test_pred)))
print('train MSE: {0:.2e}'.format(mean_squared_error(y_test, y_test_pred)))
print('train R2: {0:.3f}'.format(r2_score(y_test, y_test_pred)))

train MAE: 1.36e+05
train MSE: 4.25e+10
train R2: 0.002
train MAE: 1.69e+05
train MSE: 8.65e+10
train R2: -0.003


In [66]:
steps = [('nle', newLabelEncoder()),
         ('rfr', RandomForestRegressor(max_depth=6))]


model = Pipeline(steps)

model.fit(train_raw['product_type'], y_train)

y_train_pred = model.predict(train_raw['product_type'])
print('train MAE: {0:.2e}'.format(mean_absolute_error(y_train, y_train_pred)))
print('train MSE: {0:.2e}'.format(mean_squared_error(y_train, y_train_pred)))
print('train R2: {0:.3f}'.format(r2_score(y_train, y_train_pred)))

y_test_pred = model.predict(test_raw['product_type'])
print('train MAE: {0:.2e}'.format(mean_absolute_error(y_test, y_test_pred)))
print('train MSE: {0:.2e}'.format(mean_squared_error(y_test, y_test_pred)))
print('train R2: {0:.3f}'.format(r2_score(y_test, y_test_pred)))

train MAE: 1.26e+05
train MSE: 3.58e+10
train R2: 0.159
train MAE: 1.46e+05
train MSE: 6.48e+10
train R2: 0.248


<font color='red'>Question:</font> Do you think it is a good idea to use Lable Encoder in linear regression? Why does it work for tree-based models?

In [None]:
# It is not a good idea to use in linear regression because the label encoder maps the order of the original labels to the 
# numerical labels so the relation between X (labels) and Y is not necessary linear. Tree-based models works here because it
# does not require linear relationship between X and Y.