In [1]:
import numpy as np
import pandas as pd

Processing anonymized sales data

In [None]:
# Load the anonymized sales archive and trim surrounding whitespace from the
# free-text purchase description in the same step.
archive_df = pd.read_csv('wine_sales_archive_anonymous.csv').assign(
    **{'WINE PURCHASED': lambda frame: frame['WINE PURCHASED'].str.strip()}
)

In [None]:
# Preview the first five rows of the loaded archive.
archive_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,BATCH #,DATE,WINE PURCHASED,PRICE,CUST_ID
0,0,101,2019-01-03 0:00,wash. riesling wv,160.0,822
1,1,102,2019-01-03 0:00,n.z. sauv blanc ec,243.0,228
2,2,103,2019-01-04 0:00,cal. white zin vr + res pak,155.0,647
3,3,104,2019-01-04 0:00,n/s sauv blanc vr,5.2,382
4,4,105,2019-01-04 0:00,chil. corazon le18,230.0,811


Split Wine Purchased feature into Wine Kit + Customization

In [None]:
# Split each purchase description on its first '+': the text before it is the
# wine kit, the text after it (if any) is the customization.
winesplit_df = archive_df['WINE PURCHASED'].str.split(pat='+', n=1, expand=True)
winesplit_df = winesplit_df.rename(columns={0: 'WINE KIT', 1: 'CUSTOMIZATION'})

# Entries are written as "kit + customization", so the split leaves a trailing
# space on the kit and a leading space on the customization. Strip both so that
# identical values are not counted as different ones later.
winesplit_df = winesplit_df.apply(lambda part: part.str.strip())
winesplit_df.head()

Unnamed: 0,WINE KIT,CUSTOMIZATION
0,wash. riesling wv,
1,n.z. sauv blanc ec,
2,cal. white zin vr,res pak
3,n/s sauv blanc vr,
4,chil. corazon le18,


In [None]:
# Keep every archive column except the leftover saved index ('Unnamed: 0') and
# the raw purchase text, which winesplit_df replaces.
arch_slice_df = archive_df.drop(['Unnamed: 0', 'WINE PURCHASED'], axis='columns')
arch_slice_df.head(n=5)

Unnamed: 0,BATCH #,DATE,PRICE,CUST_ID
0,101,2019-01-03 0:00,160.0,822
1,102,2019-01-03 0:00,243.0,228
2,103,2019-01-04 0:00,155.0,647
3,104,2019-01-04 0:00,5.2,382
4,105,2019-01-04 0:00,230.0,811


In [None]:
# Place the split wine columns next to the remaining archive columns (rows are
# matched on the index), then arrange the columns in the order used from here on.
archive_exp_df = pd.concat([winesplit_df, arch_slice_df], axis='columns').loc[
    :, ['DATE', 'WINE KIT', 'PRICE', 'CUST_ID', 'BATCH #', 'CUSTOMIZATION']
]
archive_exp_df.head(n=5)

Unnamed: 0,DATE,WINE KIT,PRICE,CUST_ID,BATCH #,CUSTOMIZATION
0,2019-01-03 0:00,wash. riesling wv,160.0,822,101,
1,2019-01-03 0:00,n.z. sauv blanc ec,243.0,228,102,
2,2019-01-04 0:00,cal. white zin vr,155.0,647,103,res pak
3,2019-01-04 0:00,n/s sauv blanc vr,5.2,382,104,
4,2019-01-04 0:00,chil. corazon le18,230.0,811,105,


In [None]:
# Count how often each distinct wine kit name occurs.
kit_valcounts = archive_exp_df.loc[:, 'WINE KIT'].value_counts()

# Write the expanded archive and the kit counts to CSV.
# NOTE(review): both files are overwritten on every run, so hand edits made to
# wine_sales_exp.csv are lost if this cell is re-run -- confirm this is intended.
archive_exp_df.to_csv(path_or_buf='wine_sales_exp.csv')
kit_valcounts.to_csv(path_or_buf='kit_valcounts.csv')

Use the `kit_valcounts.csv` file to find archive entries that are missing the `+` separator that marks a customization, as well as any other entries that do not fit the expected format.

In [18]:
# Run this cell to refresh the expanded archive dataframe and kit value counts
# from wine_sales_exp.csv after that file has been cleaned by hand (e.g. in Excel).
archive_exp_df = pd.read_csv('wine_sales_exp.csv')

# The file is saved with its row index as an unnamed first column, which
# read_csv loads as 'Unnamed: 0'. Drop every such column before writing the
# file back; otherwise each run of this cell adds one more 'Unnamed: N' column.
archive_exp_df = archive_exp_df.drop(
    columns=[col for col in archive_exp_df.columns if str(col).startswith('Unnamed:')]
)

archive_exp_df['WINE KIT'] = archive_exp_df['WINE KIT'].str.strip()
# Written with the index, so the file keeps exactly one leading index column.
archive_exp_df.to_csv('wine_sales_exp.csv')

kit_valcounts = archive_exp_df['WINE KIT'].value_counts()
kit_valcounts.to_csv('kit_valcounts.csv')

Expand the wine kit feature into origin, wine style, and product line features. The origin and product line can be separated by splitting on the first and last spaces in the wine kit name.

In [27]:
# Expand WINE KIT into ORIGIN, WINE STYLE and PRODUCT LINE, then export the
# expanded archive together with value counts for each new feature.
archive_exp_df = pd.read_csv('wine_sales_exp.csv')

# Everything before the first space is the origin; the remainder stays in
# WINE KIT for the next split.
archive_exp_df[['ORIGIN', 'WINE KIT']] = archive_exp_df['WINE KIT'].str.split(" ", n=1, expand=True)
# Everything after the last remaining space is the product line; what is left
# in front of it is the wine style.
archive_exp_df[['WINE STYLE', 'PRODUCT LINE']] = archive_exp_df['WINE KIT'].str.rsplit(" ", n=1, expand=True)

# Drop the now-redundant WINE KIT column and every saved-index column
# ('Unnamed: 0', 'Unnamed: 0.1', ...) picked up from CSV round trips, not only
# the first one.
archive_exp_df = archive_exp_df.drop(
    columns=[
        col for col in archive_exp_df.columns
        if col == 'WINE KIT' or str(col).startswith('Unnamed:')
    ]
)

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form does not update the dataframe
# under pandas Copy-on-Write.
archive_exp_df['CUSTOMIZATION'] = archive_exp_df['CUSTOMIZATION'].fillna('None')

archive_exp_df['ORIGIN'].value_counts().to_csv('origin_valcounts.csv')
archive_exp_df['WINE STYLE'].value_counts().to_csv('winestyle_valcounts.csv')
archive_exp_df['PRODUCT LINE'].value_counts().to_csv('productline_valcounts.csv')

# The row index is still written as the first column, as before.
archive_exp_df.to_csv('wine_sales_expanded.csv')