In [9]:
import warnings
warnings.filterwarnings(action='ignore')

from datetime import datetime

from mlxtend.frequent_patterns import association_rules, apriori, fpgrowth
from mlxtend.preprocessing import TransactionEncoder

import sklearn
from sklearn.decomposition import TruncatedSVD

import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns 
from matplotlib import font_manager, rc
plt.rcParams['axes.unicode_minus']= False 

import platform
if platform.system() == 'Darwin':
    # matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*'
    # and later releases dropped the old aliases. Use whichever name this
    # matplotlib registers; fall back to the original name so an
    # installation that offers neither fails exactly as before.
    darkgrid_style = next(
        (name for name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
         if name in plt.style.available),
        'seaborn-darkgrid',
    )
    plt.style.use(darkgrid_style)
    # Set the font after the style sheet: applying a style rewrites rcParams.
    rc('font', family = 'AppleGothic')

In [13]:
from glob import glob
# Paths of the sales workbooks to load. Only January 2020 is selected here;
# widen the pattern to "../data/dataset/supply_chain/*.xls" to load every
# workbook in the folder.
# glob() returns matches in arbitrary order, so sort them to keep the load
# order (and therefore the row order of the combined table) reproducible.
files = sorted(glob("../data/dataset/supply_chain/2020.01.xls"))

In [14]:
# Display the matched paths; an empty list means the pattern found nothing.
files

['../data/dataset/supply_chain/2020.01.xls']

In [15]:
class DataConnector:
    """Load sales workbooks into one DataFrame and clean it for analysis.

    Attributes:
        df: Raw rows of every workbook passed to the constructor, stacked in
            the order given. Each workbook keeps its own row labels, so index
            values can repeat when several files are loaded.
    """

    # Raw-sheet columns discarded by df_preprocess().
    _UNUSED_COLUMNS = ["No","품목명","규격","원산지","단위","단가","금액","세액","비고"]
    # Raw-sheet column name -> name used in the preprocessed table.
    _COLUMN_NAMES = {"일자":"day","매출처":"customer","코드":"product","수량":"quantity", "합계":"aggregate"}
    # Values that appear in the date column on summary lines
    # ("소계" = subtotal, "합계" = total) instead of on sales rows.
    _SUMMARY_MARKERS = ["소계", "합계"]

    def __init__(self, files):
        """Read every workbook in ``files`` and stack them into ``self.df``.

        The column dtypes are fixed at read time so later preprocessing does
        not have to guess them.

        Args:
            files: Iterable of paths (or anything ``pd.read_excel`` accepts).
                An empty iterable yields an empty DataFrame.
        """
        print('Connecting')

        # Read everything first and concatenate once: concatenating inside
        # the loop re-copies all previously loaded rows for every file.
        frames = [
            pd.read_excel(file,dtype={'일자':str,'매출처':str,'코드':str,'수량': float,'합계': float})
            for file in files
        ]
        self.df = pd.concat(frames) if frames else pd.DataFrame()

    def df_preprocess(self):
        """Return a cleaned copy of the raw data, ready for any algorithm.

        Steps, in order:
          1. Replace missing values with "".
          2. Remove rows whose remark ("비고") contains "취소" (cancelled)
             or "반품" (returned).
          3. Drop the unused columns and rename the rest to English.
          4. Remove rows with a quantity of 0 and the summary rows whose
             date cell is "소계" or "합계".
          5. Parse ``day`` as a datetime and add ``ym`` ("YYYYMM" string).

        ``self.df`` is left untouched, so the method can be called repeatedly.

        Returns:
            pd.DataFrame with columns day, customer, product, quantity,
            aggregate and ym. Row labels are those of the raw data.

        Raises:
            KeyError: If an expected raw column is missing (for example when
                no file was loaded).
        """
        # fillna() without inplace returns a new frame; the raw data stays raw.
        sales = self.df.fillna("")

        # `.eq(False)` rather than `~`: contains() yields NaN for remarks
        # that are not text, and such rows are filtered out as well.
        is_void = sales["비고"].str.contains("취소|반품")
        sales = sales[is_void.eq(False)]

        sales = sales.drop(columns=self._UNUSED_COLUMNS).rename(columns=self._COLUMN_NAMES)
        sales = sales[sales["quantity"]!=0]

        # Filter summary rows with a mask, not by index label: labels repeat
        # across files, so dropping by label would also delete genuine sales
        # rows that share a label with a summary row of another file.
        sales = sales[~sales["day"].isin(self._SUMMARY_MARKERS)].copy()

        sales["day"] = pd.to_datetime(sales["day"])
        sales["ym"] = sales["day"].dt.strftime("%Y%m")

        return sales
    

In [16]:
# Load the workbooks listed in `files`, preprocess them and display the
# result. The frame is only displayed here, not stored in a variable.
DataConnector(files).df_preprocess()

Connecting


Unnamed: 0,day,customer,product,quantity,aggregate,ym
0,2020-01-01,예수병원[본관],1022496,5.0,55000.0,202001
1,2020-01-01,예수병원[본관],1050170,27.0,175500.0,202001
2,2020-01-01,예수병원[본관],1077916,2.0,6800.0,202001
3,2020-01-01,예수병원[본관],1039183,16.0,224000.0,202001
4,2020-01-01,예수병원[본관],1007792,1.0,8700.0,202001
...,...,...,...,...,...,...
23758,2020-01-31,xx토리앤쿡1호본점임시,1048988,12.0,94320.0,202001
23759,2020-01-31,xx토리앤쿡1호본점임시,1081656,2.0,8184.0,202001
23760,2020-01-31,xx토리앤쿡1호본점임시,1040906,2.0,24800.0,202001
23761,2020-01-31,xx토리앤쿡1호본점임시,1046221,5.0,26600.0,202001
