# Demand forecast analysis
---
- integrate multiple files into one
- fill or drop the missing observations
- identify any trends and show them in a graph
---

### Import libraries and define universal variables

In [11]:
import numpy as np
import pandas as pd
import re
import math
import matplotlib.pyplot as plt
import codecs
from lib import utility as util
from functools import partial

### Import data
***Please don't proceed to the next cell until the data tables are shown below this cell.***

In [12]:
# file paths
size_file = "../raw_data/サイズ表記.xlsx"
category_file = "../raw_data/商品分類表_180410.xlsx"
color_file = "../raw_data/商品カラー略号表180407.xlsx"
sale_file="../raw_data/売上データ2014-2017.csv"
subcategory_file = "../processed_data/subcategory.csv"
material_file = "../processed_data/material.csv"
design_file = "../processed_data/design.csv"

# The sales export is decoded as Shift-JIS (not UTF-8); byte sequences that
# cannot be decoded are dropped ("ignore") instead of raising.
with codecs.open(sale_file, "r", "Shift-JIS", "ignore") as file:
    sales_chunk = pd.read_csv( file,
                               parse_dates=['sales_date'],
                               header=0,  # the file's own header row is replaced by `names`
                               iterator=True,
                               chunksize=1000,
                               # A 0 in these columns is read as missing (NaN).
                               # Keys must match entries of `names`: the former
                               # 'sales_proceed' key matched no column and was
                               # silently ignored, so it is now 'sales_proceeds'.
                               # NOTE(review): 'area_id' is not in `names`, so that
                               # entry currently has no effect - confirm whether it
                               # can be dropped.
                               na_values={'sales_proceeds': 0, 'gross_profit': 0, 'area_id': 0},
                               index_col=['product_id'],
                               names=['sales_id',
                                      'product_id',
                                      'product_name',
                                      'retail_price',
                                      'sales_proceeds',
                                      'gross_profit',
                                      'number_of_sales',
                                      'sales_date',
                                      'raw_cost',
                                      'most_recent_purchase_date',
                                      'most_recent_order_date',
                                      'number_of_inventory-effective_number',
                                      'manufacture_country_id',
                                      'manufacture_country'] )

    # The reader is lazy: concatenating here, while the file is still open,
    # pulls every chunk into memory as a single dataframe.
    sales_df = pd.concat(sales_chunk)
    print("the length of sales data: {}".format(len(sales_df)))
    

# ---- Excel reference tables ----

# Size table: the first sheet column becomes the index and is labelled product_id.
size_df = pd.read_excel(size_file, sheet_name='Q_最終メモ', header=0, index_col=0).rename_axis('product_id')
size_df.columns = ['size','quality','brand']
print(size_df.head(10))

# Category table: same layout, first column is the product_id index.
category_df = pd.read_excel(category_file, sheet_name='Sheet1', index_col=0).rename_axis('product_id')
category_df.columns = [
    'product_name',
    'material_type_id',
    'material_type',
    'manufacture_country_id',
    'manufacture_country',
    'category_id',
    'category',
]
print(category_df.head(2))

# Colour table: indexed by the second sheet column (labelled color_id);
# only the first two remaining columns are kept.
color_df = (
    pd.read_excel(color_file, sheet_name='商品カラー略号表180407', index_col=1)
    .iloc[:, :2]
    .rename_axis('color_id')
)
color_df.columns = ['english_name','japanese_name']
print(color_df.head(2))

# ---- Pre-processed CSV lookup tables (first column is the index) ----

subcategory_df = pd.read_csv(subcategory_file, header=0, index_col=0)
print(subcategory_df.head(2))

material_df = pd.read_csv(material_file, header=0, index_col=0)
print(material_df.head(2))

design_df = pd.read_csv(design_file, header=0, index_col=0)
print(design_df.head(2))


the length of sales data: 876339
           size quality brand
product_id                   
000001      NaN     NaN   NaN
011007BE    NaN     NaN   NaN
011007GN    NaN     NaN   NaN
011007LB    NaN     NaN   NaN
011007LG    NaN     NaN   NaN
011007NT    NaN     NaN   NaN
011007NV    NaN     NaN   NaN
011007OR    NaN     NaN   NaN
011007PK    NaN     NaN   NaN
011007PU    NaN     NaN   NaN
               product_name  material_type_id material_type  \
product_id                                                    
AAH080-C    アルファベットチャームＡＤ                 0.0           その他   
AAH080-E    アルファベットチャームＡＤ                 0.0           その他   

            manufacture_country_id manufacture_country  category_id category  
product_id                                                                    
AAH080-C                      13.0                国内仕入         60.0   アクセサリー  
AAH080-E                      13.0                国内仕入         60.0   アクセサリー  
         english_name japanese_name
co

### Integrate all dataframes into one

In [13]:
# Start from the sales columns that are carried forward.
sales_columns = [
    'sales_id',
    'product_name',
    'retail_price',
    'sales_proceeds',
    'gross_profit',
    'number_of_sales',
    'sales_date',
    'raw_cost',
    'manufacture_country',
]
integrated_df = sales_df.loc[:, sales_columns]

# Pull attributes from the lookup tables. Assigning a Series as a column
# aligns it on the index (product_id), so each sale row receives the value
# recorded for its product.
for source_df, column_names in ((category_df, ['category', 'material_type']),
                                (size_df, ['size', 'quality', 'brand'])):
    picked = [source_df[name] for name in column_names]  # read all before writing any
    for name, values in zip(column_names, picked):
        integrated_df[name] = values

integrated_df.head(2)


Unnamed: 0_level_0,sales_id,product_name,retail_price,sales_proceeds,gross_profit,number_of_sales,sales_date,raw_cost,manufacture_country,category,material_type,size,quality,brand
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
103591,20239746,アンティークシダー　フォールディング テーブル　（Ｌ）,18000,9900,6770,1,2014-08-01,3130.16,中国　部門１,インテリア雑貨,木製品,８８×６０×７２ｃｍ,,
102862,20239747,アンティークシダー　カードホルダー,2200,1210,742,1,2014-08-01,507.62,中国　部門１,文具・デスク用品,木製品,１８．５×２４．５×８．５ｃｍ,（ＣＥＤＡＲ）,


### Extracting data from values
- extract color information from product_id 
- extract sub-category and size_letter information from product_name
- extract wholesale price from retail_price, sales_proceeds, raw_cost

#### Extract color name from product_id by comparing with color_df

In [14]:
def get_colors_from_product_ids(color_df, lang, product_id):
    """Look up the colour name encoded in the last two characters of a product id.

    Parameters
    ----------
    color_df : pandas.DataFrame
        Colour table indexed by colour id.
    lang : str
        Name of the ``color_df`` column to read the colour name from
        (e.g. ``'japanese_name'``).
    product_id : object
        Product id. Non-string ids (for example integers produced when a
        chunk of purely numeric ids is parsed) are converted with ``str``.

    Returns
    -------
    The value of ``color_df.loc[color_id, lang]`` when the two-character
    suffix is present in ``color_df.index``; ``None`` when it is not, or when
    ``product_id`` is missing (``None`` / NaN).
    """
    # None or NaN (NaN is the only value that is not equal to itself)
    if product_id is None or product_id != product_id:
        return None

    # the last two characters of product_id are compared with color_id
    color_id = str(product_id)[-2:]
    if color_id not in color_df.index:
        # no matching colour code: report "no colour"
        return None
    return color_df.loc[color_id, lang]

# Bind the colour table and the name column up front, so the chunk helper only
# has to hand each product_id to the lookup.
japanese_color_of = partial(get_colors_from_product_ids, color_df, 'japanese_name')
color_chunks = util.get_data_in_chunks(integrated_df.index.values, japanese_color_of)

# Store the looked-up colour names as a new column and show two sample rows
integrated_df['color'] = util.convert_chunks_to_list(color_chunks)
integrated_df[13:15]

Extracting data currently: 50000
Extracting data currently: 100000
Extracting data currently: 150000
Extracting data currently: 200000
Extracting data currently: 250000
Extracting data currently: 300000
Extracting data currently: 350000
Extracting data currently: 400000
Extracting data currently: 450000
Extracting data currently: 500000
Extracting data currently: 550000
Extracting data currently: 600000
Extracting data currently: 650000
Extracting data currently: 700000
Extracting data currently: 750000
Extracting data currently: 800000
Extracting data currently: 850000
876339 data has been extracted


Unnamed: 0_level_0,sales_id,product_name,retail_price,sales_proceeds,gross_profit,number_of_sales,sales_date,raw_cost,manufacture_country,category,material_type,size,quality,brand,color
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
190094OR,20239751,ピアスセット　ジェルフラワー＆スワロフスキー,2800,3360,2066,2,2014-08-01,647.0,韓国,アクセサリー,金属製品,フラワー：φ１．５ｃｍ スワロ：φ０．５ｃｍ,ＺＩＮＣ ＥＰＯＸＹ ＩＭＩＴＡＴＩＯＮ　ＰＥＡＲＬＳＷＡＲＯＶＳＫＩ ＰＯＳＴ：ＳＩ...,＜ＶＬＶ＞,オレンジ
190094WH,20239751,ピアスセット　ジェルフラワー＆スワロフスキー,2800,3360,2066,2,2014-08-01,647.0,韓国,アクセサリー,金属製品,フラワー：φ１．５ｃｍ スワロ：φ０．５ｃｍ,ＺＩＮＣ ＥＰＯＸＹ ＩＭＩＴＡＴＩＯＮ　ＰＥＡＲＬＳＷＡＲＯＶＳＫＩ ＰＯＳＴ：ＳＩ...,＜ＶＬＶ＞,ホワイト


#### Extract subcategory from product_name by comparing with subcategory_df

In [15]:
# Match every product name against subcategory_df; the matching itself is
# implemented in util.find_matched_group_and_name_from_df.
match_subcategory = partial(util.find_matched_group_and_name_from_df, subcategory_df)
product_names = util.convert_col_to_list(integrated_df['product_name'])
subcategory_chunks = util.get_data_in_chunks(product_names, match_subcategory)
subcategory_list = util.convert_chunks_to_list(subcategory_chunks)

# Position 0 of each result goes to subcategory_group, position 1 to subcategory
for position, column_name in enumerate(['subcategory_group', 'subcategory']):
    integrated_df[column_name] = util.get_column(subcategory_list, position)

Extracting data currently: 50000
Extracting data currently: 100000
Extracting data currently: 150000
Extracting data currently: 200000
Extracting data currently: 250000
Extracting data currently: 300000
Extracting data currently: 350000
Extracting data currently: 400000
Extracting data currently: 450000
Extracting data currently: 500000
Extracting data currently: 550000
Extracting data currently: 600000
Extracting data currently: 650000
Extracting data currently: 700000
Extracting data currently: 750000
Extracting data currently: 800000
Extracting data currently: 850000
876339 data has been extracted


#### Extract size_letter info from product_name

In [16]:
def get_size_letter_from_name( name ):
    """Return the first character inside the first full-width parentheses of a name.

    The character is lower-cased, e.g. ``'...テーブル　（Ｌ）'`` gives ``'ｌ'``.

    Parameters
    ----------
    name : object
        Product name. Anything that is not a ``str`` (``None``, NaN, ...) is
        treated as having no size information.

    Returns
    -------
    str or None
        The lower-cased character, or ``None`` when ``name`` is not a string
        or contains no non-empty ``（...）`` group.
    """
    if not isinstance(name, str):
        return None

    # At least one character is required between the parentheses, so an empty
    # "（）" no longer yields the closing parenthesis as the "size letter".
    # "\n" is excluded to keep the old "." behaviour of not matching across lines.
    found = re.search(r"（([^）\n]+)）", name)
    if found:
        size_letter = found.group(1)[0].lower()
    else:
        size_letter = None

    return size_letter

        
# Feed every product name through get_size_letter_from_name via the chunk helper
product_names = util.convert_col_to_list(
    integrated_df['product_name']
)
size_letter_chunks = util.get_data_in_chunks(
    product_names,
    get_size_letter_from_name,
)

# Store the extracted letters as the size_letter column
integrated_df['size_letter'] = util.convert_chunks_to_list(size_letter_chunks)

Extracting data currently: 50000
Extracting data currently: 100000
Extracting data currently: 150000
Extracting data currently: 200000
Extracting data currently: 250000
Extracting data currently: 300000
Extracting data currently: 350000
Extracting data currently: 400000
Extracting data currently: 450000
Extracting data currently: 500000
Extracting data currently: 550000
Extracting data currently: 600000
Extracting data currently: 650000
Extracting data currently: 700000
Extracting data currently: 750000
Extracting data currently: 800000
Extracting data currently: 850000
876339 data has been extracted


#### Extract material from product_name and quality by comparing with material_df
- [TODO] don't forget to get the numerical value after the listed material, e.g. cotton 100%

In [17]:
# One matcher, bound to material_df, is shared by both text sources.
match_material = partial(util.find_matched_group_and_name_from_df, material_df)

# Materials found in the product_name column
product_names = util.convert_col_to_list(integrated_df['product_name'])
pmaterial_chunks = util.get_data_in_chunks(product_names, match_material)
pmaterial_list = util.convert_chunks_to_list(pmaterial_chunks)
pmaterial_group, pmaterial = (
    util.get_column(pmaterial_list, position) for position in (0, 1)
)

# Materials found in the quality column
qualities = util.convert_col_to_list(integrated_df['quality'])
qmaterial_chunks = util.get_data_in_chunks(qualities, match_material)
qmaterial_list = util.convert_chunks_to_list(qmaterial_chunks)
qmaterial_group, qmaterial = (
    util.get_column(qmaterial_list, position) for position in (0, 1)
)

# Combine both sources; the quality-based lists are passed first.
material_groups = util.combine_two_lists(qmaterial_group, pmaterial_group)
materials = util.combine_two_lists(qmaterial, pmaterial)

# Add the combined results to the integrated dataframe as new columns
integrated_df['material_group'] = material_groups
integrated_df['material'] = materials

Extracting data currently: 50000
Extracting data currently: 100000
Extracting data currently: 150000
Extracting data currently: 200000
Extracting data currently: 250000
Extracting data currently: 300000
Extracting data currently: 350000
Extracting data currently: 400000
Extracting data currently: 450000
Extracting data currently: 500000
Extracting data currently: 550000
Extracting data currently: 600000
Extracting data currently: 650000
Extracting data currently: 700000
Extracting data currently: 750000
Extracting data currently: 800000
Extracting data currently: 850000
876339 data has been extracted
Extracting data currently: 50000
Extracting data currently: 100000
Extracting data currently: 150000
Extracting data currently: 200000
Extracting data currently: 250000
Extracting data currently: 300000
Extracting data currently: 350000
Extracting data currently: 400000
Extracting data currently: 450000
Extracting data currently: 500000
Extracting data currently: 550000
Extracting data cur

#### Check integrated data

In [18]:
# Show the first 10 rows of the integrated dataframe as a visual sanity check.
integrated_df.head(10)

Unnamed: 0_level_0,sales_id,product_name,retail_price,sales_proceeds,gross_profit,number_of_sales,sales_date,raw_cost,manufacture_country,category,material_type,size,quality,brand,color,subcategory_group,subcategory,size_letter,material_group,material
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
103591,20239746,アンティークシダー　フォールディング テーブル　（Ｌ）,18000,9900,6770,1,2014-08-01,3130.16,中国　部門１,インテリア雑貨,木製品,８８×６０×７２ｃｍ,,,,[テーブル],[テーブル],ｌ,,
102862,20239747,アンティークシダー　カードホルダー,2200,1210,742,1,2014-08-01,507.62,中国　部門１,文具・デスク用品,木製品,１８．５×２４．５×８．５ｃｍ,（ＣＥＤＡＲ）,,,"[カード, ホルダー]","[カード, ホルダー]",,[cedar],[cedar]
104782,20239747,ティーフフィル　カード　ラック,5500,3025,1986,1,2014-08-01,1111.26,中国　部門１,実用小物・消耗品,金属製品,４２×２０×３６ｃｍ,ＩＲＯＮ,,,"[カード, ラック]","[カード, ラック]",,[iron],[iron]
999000,20239747,立替　送料,0,600,600,1,2014-08-01,0.0,,その他,その他,,,,,,,,,
190038,20239751,ネックレス　インターバル　ヴァリエ　ストーン,3800,4560,2592,2,2014-08-01,984.0,韓国,アクセサリー,金属製品,４５ｃｍ,ＳＩＬＶＥＲ９２５（１６－Ｋ　ＧＯＬＤ　ＰＬＡＴＥＤ）　ＰＵ：ＡＭＥＴＨＹＳＴ　Ｌ・ＧＹ：Ｌ...,,,[ネックレス],[ネックレス],,"[アメジスト, goldplated, 藍晶石, オニキス, クォーツ, シルバー]","[amethyst, goldplated, labrado, onyx, quartz, ..."
190039,20239751,ブレスレット　インターバル　ヴァリエ ストーン,3000,3600,2062,2,2014-08-01,769.0,韓国,アクセサリー,金属製品,１８．５ｃｍ,ＳＩＬＶＥＲ９２５（１６－Ｋ　ＧＯＬＤ　ＰＬＡＴＥＤ）　ＰＵ：ＡＭＥＴＨＹＳＴ　Ｌ・ＧＹ：Ｌ...,,,"[ブレス, ブレス]","[ブレス, ブレスレット]",,"[アメジスト, goldplated, 藍晶石, オニキス, クォーツ, シルバー]","[amethyst, goldplated, labrado, onyx, quartz, ..."
190044,20239751,ブレスレット　アレンジ　ウォーターパール,3000,3600,2224,2,2014-08-01,688.0,韓国,アクセサリー,金属製品,１８ｃｍ,ＳＩＬＶＥＲ９２５（１６－Ｋ　ＧＯＬＤ　ＰＬＡＴＥＤ）　ＷＡＴＥＲ　ＰＥＡＲＬ ＊ポーチ付,,,"[ブレス, ブレス]","[ブレス, ブレスレット]",,"[goldplated, pear, pearl, シルバー, パール]","[goldplated, pear, pearl, silver925, パール]"
190046,20239751,ネックレス　ＢＲＡＳＳ　ティップ,4200,5040,3090,2,2014-08-01,975.0,韓国,アクセサリー,金属製品,５０ｃｍ,ＳＩＬＶＥＲ９２５（１６－Ｋ　ＧＯＬＤ　ＰＬＡＴＥＤ）　ＢＲＡＳＳ＝ＰＩＰＥ,,,[ネックレス],[ネックレス],,"[brass, goldplated, シルバー]","[brass, goldplated, silver925]"
190047,20239751,ピアス　ＢＲＡＳＳ　ティップ,2800,3360,2212,2,2014-08-01,574.0,韓国,アクセサリー,金属製品,１．５×４．５ｃｍ,ＳＩＬＶＥＲ９２５（１６－Ｋ　ＧＯＬＤ　ＰＬＡＴＥＤ）　ＢＲＡＳＳ＝ＰＩＰＥ,,,[ピアス],[ピアス],,"[brass, goldplated, シルバー]","[brass, goldplated, silver925]"
190064,20239751,リング　セット　ＬＩＮＥ　１１号,5500,3300,1881,1,2014-08-01,1419.0,韓国,アクセサリー,金属製品,１１号,ＳＩＬＶＥＲ９２５　１６－Ｋ　ＧＯＬＤ　ＰＬＡＴＥＤ ＣＵＢＩＣ　ＺＩＲＣＯＮＩＡ ＊ポ...,＜ＶＬＶ＞,,[リング],[リング],,"[goldplated, シルバー]","[goldplated, silver925]"


### Export the integrated dataframe as a CSV in the processed_data directory

In [20]:
# Write the integrated dataframe to CSV; the product_id index is written as the
# first column (to_csv default).
# NOTE(review): list-valued cells (e.g. subcategory, material) are written as
# their string representation - confirm that whatever reads this file parses
# them back as intended.
integrated_df.to_csv('../processed_data/integrated.csv')
