# Data Segmentation Workbook

## Purpose
Segment the sales data into 10 tables, one for each state–store combination.


# Setup

## Library import
We import all the required Python libraries.

In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Pandas display limits: render at most 50 columns and 30 rows of a frame
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 30)

# Visualizations
import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as ply
# Switch plotly into offline notebook mode so figures render inline.
# NOTE(review): connected=True is understood to load plotly.js from a CDN
# rather than embedding it in the notebook — confirm against the plotly docs.
plotly.offline.init_notebook_mode(connected=True)


# Data import
We retrieve all the required data for the analysis.

In [6]:
# Load the raw evaluation sales file and preview its first rows.
SALES_CSV_PATH = '../data/raw/walmart_sales_data/sales_train_evaluation.csv'
sales = pd.read_csv(SALES_CSV_PATH)
sales.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,...,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,0,3,5,0,0,1,1,0,2,1,2,2,1,0,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,2,1,0,0,0,0,2,1,3,0,0,1,0,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,4,1,6,4,0,0,0,2,2,4,2,1,1,1,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,3,1,0,3,2,3,1,1,3,2,3,2,2,2,2,0,0,0,2,1,0,0,2,1,0


In [9]:
# Inspect column dtypes: the identifier columns (id, item_id, dept_id, cat_id,
# store_id, ...) are object, while the d_* columns are int64.
sales.dtypes

id          object
item_id     object
dept_id     object
cat_id      object
store_id    object
             ...  
d_1937       int64
d_1938       int64
d_1939       int64
d_1940       int64
d_1941       int64
Length: 1947, dtype: object

# Data processing

In [45]:
# Collapse the per-item rows into one column of daily totals per store:
# rows are the d_* day columns, columns are store ids.
states = ['CA', 'WI', 'TX']
cats = ['HOBBIES', 'HOUSEHOLD', 'FOODS']  # not used in this cell; kept in case other cells reference it
stores = [1, 2, 3, 4]

# Candidate store ids in display order (state by state, then store number).
candidate_store_ids = [f'{state}_{store}' for state in states for store in stores]

# Sum every int64 column (the d_* day counts) per store in a single grouped
# pass, instead of filtering store by store and growing a frame with
# pd.concat inside a loop.
store_totals = sales.select_dtypes(include='int64').groupby(sales['store_id']).sum()

# Keep only the candidates that actually occur in the data (the result has no
# WI_4 or TX_4, for example), preserving the requested order.
present_store_ids = [sid for sid in candidate_store_ids if sid in store_totals.index]

# Transpose so that days are rows and stores are columns.
all_store_sales = store_totals.loc[present_store_ids].T
# groupby leaves 'store_id' as the axis name; clear it so the frame looks the
# same as one assembled column by column.
all_store_sales.columns.name = None

all_store_sales

Unnamed: 0,CA_1,CA_2,CA_3,CA_4,WI_1,WI_2,WI_3,TX_1,TX_2,TX_3
d_1,4337,3494,4739,1625,2704,2256,4038,2556,3852,3030
d_2,4155,3046,4827,1777,2194,1922,4198,2687,3937,3006
d_3,2816,2121,3785,1386,1562,2018,3317,1822,2731,2225
d_4,3051,2324,4232,1440,1251,2522,3211,2258,2954,2169
d_5,2630,1942,3817,1536,2,1175,2132,1694,2492,1726
...,...,...,...,...,...,...,...,...,...,...
d_1937,3995,3789,5375,2519,3242,4533,3268,3147,3518,3710
d_1938,4136,4037,5580,2544,3478,4628,3398,2748,3126,3288
d_1939,4433,4751,5542,2704,3813,4880,4126,3664,4249,4390
d_1940,5764,7120,7073,3146,5002,5213,4519,4167,4802,4712


In [52]:
# Overlay the daily totals of every store on one line chart; the traces are
# semi-transparent so overlapping stores remain distinguishable.
fig = px.line(all_store_sales, x=all_store_sales.index, y=all_store_sales.columns)
fig.update_traces(opacity=0.5)
# Label the figure so it can be read on its own when the notebook is skimmed.
fig.update_layout(
    title='Daily sales per store',
    xaxis_title='Day',
    yaxis_title='Total sales',
    legend_title_text='Store',
)
fig.show()