In [16]:

import os
import sys
from IPython.display import Markdown, display, Image
import numpy as np
import pandas as pd
import random
import math
import dvc.api
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from plotsClass import Plot
from logger import App_Logger
from sklearn import preprocessing
import mlflow
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, mean_absolute_error 
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Logger shared by the helpers in this notebook. App_Logger is imported from the
# local `logger` module; presumably "Reading.log" is its output file -- confirm there.
logger= App_Logger("Reading.log").get_app_logger()

# Apply seaborn's default plot styling to the figures drawn later.
sns.set()

In [18]:
def read_csv(csv_path, missing_values=None):
    """Load a CSV file into a DataFrame, reporting the outcome on stdout and in the log.

    Args:
        csv_path: Path of the CSV file to read.
        missing_values: Optional extra strings to treat as NaN, forwarded to
            ``pd.read_csv(na_values=...)``. ``None`` (the default) keeps only
            pandas' built-in NaN markers.

    Returns:
        The loaded DataFrame, or ``None`` when ``csv_path`` does not exist.
        Callers must check for ``None`` before using the result.
    """
    # Only the read itself is guarded, so the handler cannot mask an unrelated
    # FileNotFoundError raised elsewhere.
    try:
        df = pd.read_csv(csv_path, na_values=missing_values)
    except FileNotFoundError:
        print("file not found")
        logger.error(f"file not found, path:{csv_path}")
        # A missing file is reported, not raised; make the None result explicit.
        return None

    print("file read as csv")
    logger.info(f"file read as csv from {csv_path}")
    return df

In [31]:

# Load the train_store and test CSVs from ../data.
# read_csv prints "file not found" and returns None when a path is missing,
# so a None here means the corresponding file was not found.
df_train_store = read_csv("../data/train_store.csv")
df_test = read_csv("../data/test.csv")

file read as csv
file read as csv


In [32]:
# Preview the first rows of the loaded train_store frame.
df_train_store.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,...,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Month,Year,SalesperCustomer
0,1,5,2015-07-31,5263,555,1,1,0,1,c,...,1270.0,9.0,2008.0,0,0.0,0.0,0,7,2015,9.482883
1,2,5,2015-07-31,6064,625,1,1,0,1,a,...,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",7,2015,9.7024
2,3,5,2015-07-31,8314,821,1,1,0,1,a,...,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct",7,2015,10.126675
3,4,5,2015-07-31,13995,1498,1,1,0,1,c,...,620.0,9.0,2009.0,0,0.0,0.0,0,7,2015,9.342457
4,5,5,2015-07-31,4822,559,1,1,0,1,a,...,29910.0,4.0,2015.0,0,0.0,0.0,0,7,2015,8.626118


In [34]:
class TransformingTrainStoreData:
    """Dtype clean-up for the train/store frame.

    ``Transformed`` is the entry point: it casts the discrete columns to
    ``category`` and parses ``Date`` into datetimes. Both steps modify the
    frame they receive and return that same object.
    """

    # Columns cast directly to ``category`` by ``to_category``.
    # ``StateHoliday`` is handled separately because it is cast to str first.
    CATEGORY_COLUMNS = (
        "Open",
        "DayOfWeek",
        "Promo",
        "SchoolHoliday",
        "StoreType",
        "Assortment",
        "CompetitionOpenSinceMonth",
        "CompetitionOpenSinceYear",
        "Promo2",
        "Promo2SinceYear",
        "PromoInterval",
        "Year",
        "Month",
    )

    def __init__(self):
        pass

    def to_category(self, df):
        """Cast the discrete columns of ``df`` to the ``category`` dtype.

        Every column keeps its own values; in particular ``DayOfWeek`` is
        converted from ``DayOfWeek`` itself and is not overwritten with ``Open``.

        Args:
            df: Frame containing ``StateHoliday`` and every column named in
                ``CATEGORY_COLUMNS``.

        Returns:
            The same ``df`` object, modified in place.

        Raises:
            KeyError: If one of the expected columns is missing.
        """
        for column in self.CATEGORY_COLUMNS:
            df[column] = df[column].astype("category")

        # Cast to str first so every label in the column ends up as a string
        # (e.g. the integer 0 becomes "0") before it is turned into a category.
        df["StateHoliday"] = df["StateHoliday"].astype("str").astype("category")
        return df

    def convert_to_datetime(self, df):
        """Parse the ``Date`` column of ``df`` into datetimes.

        The conversion reads the ``Date`` column of the frame passed in, not
        of any notebook-level variable. Errors are allowed to propagate:
        swallowing them made this method return ``None`` and only deferred
        the failure to the next pipeline step.

        Args:
            df: Frame with a ``Date`` column that ``pd.to_datetime`` can parse.

        Returns:
            The same ``df`` object, modified in place.

        Raises:
            KeyError: If ``df`` has no ``Date`` column.
            ValueError: If a value in ``Date`` cannot be parsed
                (raised by ``pd.to_datetime``).
        """
        df["Date"] = pd.to_datetime(df["Date"])
        return df

    def sort_by_date(self, df):
        """Return a copy of ``df`` sorted by ``Date`` in descending order.

        Not called by ``Transformed``; available as a separate helper.
        """
        return df.sort_values(by=["Date"], ascending=False)

    def Transformed(self, df):
        """Run ``to_category`` followed by ``convert_to_datetime`` on ``df``.

        Returns:
            The same ``df`` object with categorical columns and a datetime
            ``Date`` column.
        """
        df = self.to_category(df)
        df = self.convert_to_datetime(df)

        return df

In [21]:
class ExtractingCOlumns:
    """Derive calendar helper columns from the prepared frame.

    ``process`` runs the steps in dependency order. Each step edits the frame
    it is given and returns that same object.
    """

    def __init__(self):
        pass

    def transform_date(self, df):
        """Replace ``Date`` with a categorical day-of-month column ``Day``.

        ``Date`` is dropped afterwards because the frame already carries its
        Year and Month.
        """
        day_of_month = df["Date"].dt.day
        df['Day'] = day_of_month.astype("category")
        # Remove Date only after Day has been derived from it.
        df.pop("Date")
        return df

    def to_month_category(self, df):
        """Add ``Monthcategory``, labelling each ``Day`` by its part of the month.

        Days below 11 are 'BegMonth', days below 21 are 'Midmonth', and the
        rest are 'EndMonth'.
        """
        def part_of_month(day):
            if day < 11:
                return 'BegMonth'
            if day < 21:
                return 'Midmonth'
            return 'EndMonth'

        df["Monthcategory"] = df["Day"].apply(part_of_month)
        return df

    def add_weekday_col(self, df):
        """Add the 0/1 flags ``Weekends`` (DayOfWeek > 5) and ``Weekdays`` (DayOfWeek <= 5)."""
        def weekend_flag(day_of_week):
            return 1 if day_of_week > 5 else 0

        def weekday_flag(day_of_week):
            return 1 if day_of_week <= 5 else 0

        df["Weekends"] = df["DayOfWeek"].apply(weekend_flag)
        df["Weekdays"] = df["DayOfWeek"].apply(weekday_flag)
        return df

    def process(self, df):
        """Apply ``transform_date``, ``add_weekday_col`` and ``to_month_category`` in that order."""
        # to_month_category needs the Day column created by transform_date.
        steps = (self.transform_date, self.add_weekday_col, self.to_month_category)
        for step in steps:
            df = step(df)
        return df

In [28]:
class Preprocess:
    """Label-encode the non-numeric columns of the train/store frame."""

    # Columns replaced by integer codes, listed in the order they are encoded.
    LABEL_ENCODED_COLUMNS = (
        'StateHoliday',
        'StoreType',
        'Assortment',
        'PromoInterval',
        'Monthcategory',
    )

    def __init__(self):
        pass

    def encode_train_store_data(self, df):
        """Replace each column in ``LABEL_ENCODED_COLUMNS`` with LabelEncoder codes.

        A fresh ``LabelEncoder`` is fitted per column and then discarded, so
        the label-to-code mapping is not kept for later reuse.

        Args:
            df: Frame containing every column in ``LABEL_ENCODED_COLUMNS``.

        Returns:
            The same ``df`` object, modified in place.

        Raises:
            KeyError: If one of the columns is missing; columns earlier in
                the tuple have already been encoded at that point.
        """
        for column in self.LABEL_ENCODED_COLUMNS:
            df[column] = preprocessing.LabelEncoder().fit_transform(df[column])
        return df

    def process(self, df):
        """Entry point: run ``encode_train_store_data`` on ``df`` and return the result."""
        return self.encode_train_store_data(df)

In [35]:
# Run the three preparation stages in order: dtype clean-up, calendar feature
# extraction, then label encoding. Each stage rebinds df_train_store.
# NOTE: this cell cannot be re-run on its own. The extraction stage deletes the
# `Date` column that the first stage needs, so reload the CSV before re-running.
df_train_store = TransformingTrainStoreData().Transformed(df_train_store)
df_train_store = ExtractingCOlumns().process(df_train_store)
df_train_store = Preprocess().process(df_train_store)