In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
from datetime import date
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [11]:
# To consider Brazilian calendar and hollidays
!pip install workalendar
from workalendar.america import Brazil
cal = Brazil()

In [12]:
# Load the Olist public dataset (v2) CSV into the main working frame `df`.
df=pd.read_csv('../input/olist-public-dataset/olist_public_dataset_v2.csv')

In [13]:
# The purchase timestamp is parsed as a full datetime (time of day is kept).
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])

# The remaining date columns are parsed and then reduced to plain calendar dates.
for date_col in ('order_aproved_at',
                 'order_estimated_delivery_date',
                 'order_delivered_customer_date'):
    df[date_col] = pd.to_datetime(df[date_col]).dt.date

df.head()

In [14]:
# Load the product category name translation table and preview its first rows.
trans=pd.read_csv('../input/python-for-ds-datasets-venkat/Brazil_Ecom_data_All_Data_Files/product_category_name_translation.csv')
trans.head()

In [15]:
# Join the translation table on the original category name (merge defaults to an
# inner join, so rows without a matching category are not kept), then discard the
# join-key column.
df = df.merge(trans, on='product_category_name')
df = df.drop(columns='product_category_name')

In [20]:
# Preview a hand-picked subset of columns from the merged frame.
preview_columns = [
    'order_status',
    'order_products_value',
    'order_freight_value',
    'order_items_qty',
    'order_sellers_qty',
    'order_purchase_timestamp',
    'order_aproved_at',
    'order_estimated_delivery_date',
    'order_delivered_customer_date',
    'customer_city',
    'customer_state',
    'customer_zip_code_prefix',
    'product_name_lenght',
    'product_description_lenght',
    'product_photos_qty',
    'product_id',
    'review_score',
    'review_comment_title',
    'review_comment_message',
    'review_creation_date',
    'review_answer_timestamp',
    'product_category_name_english',
]
df[preview_columns].head()

In [22]:
# Number of rows per review score, ordered by score.
df['review_score'].value_counts().sort_index()

In [23]:
# Columns kept for modelling: order values/quantities, dates, customer state,
# product attributes and the review score (the prediction target further down).
order_columns = [
    'order_status',
    'order_products_value',
    'order_freight_value',
    'order_items_qty',
    'order_sellers_qty',
    'order_purchase_timestamp',
    'order_aproved_at',
    'order_estimated_delivery_date',
    'order_delivered_customer_date',
    'customer_state',
    'product_category_name_english',
    'product_name_lenght',
    'product_description_lenght',
    'product_photos_qty',
    'review_score',
]
orders = df[order_columns]

In [24]:
# Column dtypes and non-null counts of the modelling frame.
orders.info()

In [26]:
# Display the modelling frame.
orders

In [112]:
# Share of each review score as a rounded percentage of all rows, ordered by score.
score_counts = orders['review_score'].value_counts()
score_share_pct = score_counts / len(orders['review_score']) * 100
round(score_share_pct).sort_index()

In [113]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 split on review_score, seeded for reproducibility.
strat = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row indices, so rows must be selected with .iloc.
# Label-based .loc only gives the same rows while the index happens to be a
# default 0..n-1 range, and silently picks wrong rows (or raises) otherwise.
train_i, test_i = next(strat.split(orders, orders['review_score']))
strat_train = orders.iloc[train_i]
strat_test = orders.iloc[test_i]

In [114]:
# Correlation heatmap of the training set.
# Only numeric/boolean columns are passed to corr(): the frame also holds string and
# date columns, which recent pandas versions refuse to correlate (they raise instead
# of silently dropping them as older versions did).
fig, ax = plt.subplots(figsize=(12, 5))
sns.heatmap(strat_train.select_dtypes(include=['number', 'bool']).corr(), annot=True, ax=ax);

In [115]:
# First rows of the training split.
strat_train.head()

In [116]:
# Inspect the approval-date column of the training split.
strat_train['order_aproved_at']

In [117]:
# Independent copy of the training split.
sample=strat_train.copy()

In [128]:
from sklearn.base import BaseEstimator, TransformerMixin

class AttributesAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives order-level features and drops the date columns.

    ``transform`` adds, in this order:

    * ``is_late`` -- boolean, True when the customer delivery date is after the
      estimated delivery date.
    * ``average_product_value`` -- products value divided by the item quantity.
    * ``total_order_value`` -- products value plus freight value.
    * ``order_freight_ratio`` -- freight value divided by products value.
    * ``purchase_dayofweek`` -- ``.dt.dayofweek`` of the purchase timestamp.

    It then removes ``order_purchase_timestamp``, ``order_aproved_at``,
    ``order_estimated_delivery_date`` and ``order_delivered_customer_date``.
    """

    def __init__(self):
        # The transformer has no parameters.
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with the derived features added and the date columns removed.

        ``X`` itself is left untouched; ``y`` is accepted but not used.
        """
        date_columns = ['order_purchase_timestamp', 'order_aproved_at',
                        'order_estimated_delivery_date', 'order_delivered_customer_date']

        # assign() works on a copy of X and appends the new columns in keyword order.
        enriched = X.assign(
            is_late=X.order_delivered_customer_date > X.order_estimated_delivery_date,
            average_product_value=X.order_products_value / X.order_items_qty,
            total_order_value=X.order_products_value + X.order_freight_value,
            order_freight_ratio=X.order_freight_value / X.order_products_value,
            purchase_dayofweek=X.order_purchase_timestamp.dt.dayofweek,
        )

        # The raw dates are no longer needed once the features above exist.
        return enriched.drop(columns=date_columns)

In [129]:
# Apply the feature-engineering transformer to the training split and preview the result.
attr_adder = AttributesAdder()
feat_eng = attr_adder.transform(strat_train)
feat_eng.head(3)

In [130]:
# Correlation of every engineered feature with the review score, strongest positive first.
# feat_eng still contains string columns (status, state, category); restrict to
# numeric/boolean columns because recent pandas versions raise on non-numeric data
# in corr() instead of silently ignoring it.
corr_matrix = feat_eng.select_dtypes(include=['number', 'bool']).corr()
corr_matrix['review_score'].sort_values(ascending=False)

In [131]:
# Column dtypes and non-null counts after feature engineering.
feat_eng.info()

In [134]:
# Separate the target (review_score) from the predictors for the training split.
orders_labels = strat_train['review_score'].copy()
orders_features = strat_train.drop(columns='review_score')

In [145]:
# Separate the target (review_score) from the predictors for the test split.
orders_labels_test = strat_test['review_score'].copy()
orders_features_test = strat_test.drop(columns='review_score')

In [135]:
# Categorical columns are listed by hand; every other feature column (this includes
# the date columns consumed by AttributesAdder) goes down the numeric pipeline.
cat_attribs = ['order_status', 'customer_state', 'product_category_name_english']
num_attribs = orders_features.columns.drop(cat_attribs)

In [136]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks a fixed set of columns out of a DataFrame."""

    def __init__(self, attribute_names):
        # Column labels that transform() will select.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the configured column labels; ``y`` is not used."""
        wanted = self.attribute_names
        return X[wanted]

In [137]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Categorical columns are not used for now; adding them is planned for a later version.
# Steps: select the non-categorical columns -> add derived features -> standardise.
num_steps = [
    ('selector', DataFrameSelector(num_attribs)),
    ('attribs_adder', AttributesAdder()),
    ('std_scaller', StandardScaler()),
]
num_pipeline = Pipeline(num_steps)

In [138]:
# Fit the preprocessing pipeline on the training features and transform them.
orders_features_prepared = num_pipeline.fit_transform(orders_features)
orders_features_prepared

In [149]:
# Transform the test features with the pipeline fitted on the training data (no re-fit).
# NOTE(review): the name is rebound from the raw feature frame to the pipeline output,
# so this cell is presumably not safe to run twice in the same session -- confirm.
orders_features_test=num_pipeline.transform(orders_features_test)

In [140]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# A fixed seed makes the forest reproducible between runs; without it every
# Restart & Run All produces a different model and different scores.
# 42 matches the seed used for the train/test split.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(orders_features_prepared, orders_labels)

# RMSE on the *training* data: this measures how well the model fits the data it
# has already seen, not how well it generalises.
predictions = forest_reg.predict(orders_features_prepared)
forest_mse = mean_squared_error(orders_labels, predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

In [144]:
# Model score (R^2 for scikit-learn regressors) on the training data.
forest_reg.score(orders_features_prepared,orders_labels)

In [152]:
# RMSE on the held-out test split; forest_mse / forest_rmse are rebound to the test values.
predictions1 = forest_reg.predict(orders_features_test)
forest_mse = mean_squared_error(y_true=orders_labels_test, y_pred=predictions1)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

In [153]:
# Model score (R^2 for scikit-learn regressors) on the held-out test split.
forest_reg.score(orders_features_test,orders_labels_test)

In [142]:
# Take the first eight training rows and push them through the fitted pipeline
# for a quick side-by-side look at predictions and true labels.
some_labels = orders_labels.head(8)
some_data = orders_features.head(8)
some_data_prepared = num_pipeline.transform(some_data)

In [143]:
# Show the model's predictions next to the true review scores for the sample rows.
predicted_scores = list(forest_reg.predict(some_data_prepared))
actual_scores = list(some_labels.values)
print('Predicted: {} \n Labels: {}'.format(predicted_scores, actual_scores))