# Project 3

#### Importing Packages and Loading the Data

In [1]:
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the 10% sample of Iowa liquor sales.
# "Zip Code" is forced to text: it is later used as a groupby/merge key, and a single
# string dtype guarantees that the same zip can never appear once as a number and once
# as a string (which type inference on a large CSV can otherwise produce).
df = pd.read_csv("Iowa_Liquor_sales_sample_10pct.csv", dtype={"Zip Code": str})
df.head()

Unnamed: 0,Date,Store Number,City,Zip Code,County Number,County,Category,Category Name,Vendor Number,Item Number,Item Description,Bottle Volume (ml),State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Volume Sold (Gallons)
0,11/04/2015,3717,SUMNER,50674,9.0,Bremer,1051100.0,APRICOT BRANDIES,55,54436,Mr. Boston Apricot Brandy,750,$4.50,$6.75,12,$81.00,9.0,2.38
1,03/02/2016,2614,DAVENPORT,52807,82.0,Scott,1011100.0,BLENDED WHISKIES,395,27605,Tin Cup,750,$13.75,$20.63,2,$41.26,1.5,0.4
2,02/11/2016,2106,CEDAR FALLS,50613,7.0,Black Hawk,1011200.0,STRAIGHT BOURBON WHISKIES,65,19067,Jim Beam,1000,$12.59,$18.89,24,$453.36,24.0,6.34
3,02/03/2016,2501,AMES,50010,85.0,Story,1071100.0,AMERICAN COCKTAILS,395,59154,1800 Ultimate Margarita,1750,$9.50,$14.25,6,$85.50,10.5,2.77
4,08/18/2015,3654,BELMOND,50421,99.0,Wright,1031080.0,VODKA 80 PROOF,297,35918,Five O'clock Vodka,1750,$7.20,$10.80,12,$129.60,21.0,5.55


In [3]:
df.shape

(270955, 18)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270955 entries, 0 to 270954
Data columns (total 18 columns):
Date                     270955 non-null object
Store Number             270955 non-null int64
City                     270955 non-null object
Zip Code                 270955 non-null object
County Number            269878 non-null float64
County                   269878 non-null object
Category                 270887 non-null float64
Category Name            270323 non-null object
Vendor Number            270955 non-null int64
Item Number              270955 non-null int64
Item Description         270955 non-null object
Bottle Volume (ml)       270955 non-null int64
State Bottle Cost        270955 non-null object
State Bottle Retail      270955 non-null object
Bottles Sold             270955 non-null int64
Sale (Dollars)           270955 non-null object
Volume Sold (Liters)     270955 non-null float64
Volume Sold (Gallons)    270955 non-null float64
dtypes: float64(4), int64(

## Cleaning the data

Task List:
* Remove "$" characters from prices and convert values to floats.
* Convert dates to pandas datetime objects
* Convert category floats to integers
* Drop or fill in bad values

##### Remove "$" characters from prices and convert values to floats

In [5]:
# Price columns arrive as text such as "$4.50"; convert them to floats.
dollar_removal_cols = ["State Bottle Cost", "State Bottle Retail", "Sale (Dollars)"]

for col in dollar_removal_cols:
    # Strip only a leading "$" (rather than blindly dropping the first character), so a
    # value without the symbol keeps all of its digits. Going through str first also
    # makes the cell safe to re-run on columns that are already floats.
    df[col] = df[col].astype(str).str.lstrip("$").astype(float)

In [6]:
df.head(2)

Unnamed: 0,Date,Store Number,City,Zip Code,County Number,County,Category,Category Name,Vendor Number,Item Number,Item Description,Bottle Volume (ml),State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Volume Sold (Gallons)
0,11/04/2015,3717,SUMNER,50674,9.0,Bremer,1051100.0,APRICOT BRANDIES,55,54436,Mr. Boston Apricot Brandy,750,4.5,6.75,12,81.0,9.0,2.38
1,03/02/2016,2614,DAVENPORT,52807,82.0,Scott,1011100.0,BLENDED WHISKIES,395,27605,Tin Cup,750,13.75,20.63,2,41.26,1.5,0.4


##### Convert dates to pandas datetime objects

In [7]:
# Parse the month/day/year strings into datetime values using an explicit format.
parsed_dates = pd.to_datetime(df["Date"], format="%m/%d/%Y")
df["Date"] = parsed_dates
df.head(2)

Unnamed: 0,Date,Store Number,City,Zip Code,County Number,County,Category,Category Name,Vendor Number,Item Number,Item Description,Bottle Volume (ml),State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Volume Sold (Gallons)
0,2015-11-04,3717,SUMNER,50674,9.0,Bremer,1051100.0,APRICOT BRANDIES,55,54436,Mr. Boston Apricot Brandy,750,4.5,6.75,12,81.0,9.0,2.38
1,2016-03-02,2614,DAVENPORT,52807,82.0,Scott,1011100.0,BLENDED WHISKIES,395,27605,Tin Cup,750,13.75,20.63,2,41.26,1.5,0.4


In [8]:
# Most recent transaction date in the data set.
df["Date"].max()

Timestamp('2016-03-31 00:00:00')

In [9]:
# Earliest transaction date in the data set.
df["Date"].min()

Timestamp('2015-01-05 00:00:00')

##### Drop or replace 'bad' values

In [10]:
# Keep only rows that are complete in every column, then re-check the null counts.
df = df.dropna(axis=0, how="any")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 269258 entries, 0 to 270954
Data columns (total 18 columns):
Date                     269258 non-null datetime64[ns]
Store Number             269258 non-null int64
City                     269258 non-null object
Zip Code                 269258 non-null object
County Number            269258 non-null float64
County                   269258 non-null object
Category                 269258 non-null float64
Category Name            269258 non-null object
Vendor Number            269258 non-null int64
Item Number              269258 non-null int64
Item Description         269258 non-null object
Bottle Volume (ml)       269258 non-null int64
State Bottle Cost        269258 non-null float64
State Bottle Retail      269258 non-null float64
Bottles Sold             269258 non-null int64
Sale (Dollars)           269258 non-null float64
Volume Sold (Liters)     269258 non-null float64
Volume Sold (Gallons)    269258 non-null float64
dtypes: datetim

##### Convert category floats to integers

In [11]:
# Cast the two float-typed identifier columns to integers in a single call.
df = df.astype({"County Number": int, "Category": int})
df.head(2)

Unnamed: 0,Date,Store Number,City,Zip Code,County Number,County,Category,Category Name,Vendor Number,Item Number,Item Description,Bottle Volume (ml),State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Volume Sold (Gallons)
0,2015-11-04,3717,SUMNER,50674,9,Bremer,1051100,APRICOT BRANDIES,55,54436,Mr. Boston Apricot Brandy,750,4.5,6.75,12,81.0,9.0,2.38
1,2016-03-02,2614,DAVENPORT,52807,82,Scott,1011100,BLENDED WHISKIES,395,27605,Tin Cup,750,13.75,20.63,2,41.26,1.5,0.4


## Filter the Data

Some stores may have opened or closed in 2015. These data points will heavily skew our models, so we need to filter them out or find a way to deal with them.

You'll need to provide a summary in your project report about these data points. You may also consider using the monthly sales in your model and including other information (number of months or days each store is open) in your data to handle these unusual cases.

Let's record the first and last sales dates for each store. We'll save this information for later when we fit our models.

##### Determine which stores were open all of 2015 - finding the first and last sale date

In [12]:
# First and last sale date for every store.
# The reducers are given by name ("min"/"max") instead of as NumPy callables, and the
# second header level is then pinned to "amin"/"amax": the labels derived from
# np.min/np.max are not the same across library versions, while the following cells
# look the columns up as "Date amin" / "Date amax".
dates = df.groupby(by=["Store Number"], as_index=False).agg({"Date": ["min", "max"]})
_stat_labels = {"min": "amin", "max": "amax"}
dates.columns = pd.MultiIndex.from_tuples(
    [(top, _stat_labels.get(stat, stat)) for top, stat in dates.columns]
)

##### For formatting purposes, this flattens the two-level column header produced by the aggregation into single-level names

In [13]:
# Collapse the two-level header from the aggregation into flat names such as
# "Date amin"; strip() trims the trailing space left by an empty second level.
# The guard keeps the cell safe to re-run: joining an already-flat name would insert
# a space between each of its characters.
if isinstance(dates.columns, pd.MultiIndex):
    dates.columns = [' '.join(col).strip() for col in dates.columns.values]
dates.head()

Unnamed: 0,Store Number,Date amin,Date amax
0,2106,2015-01-08,2016-03-31
1,2113,2015-01-07,2016-03-23
2,2130,2015-01-08,2016-03-31
3,2152,2015-01-08,2016-02-25
4,2178,2015-01-07,2016-03-30


##### This code filters out stores that opened or closed during 2015 using a mask

Stores whose first recorded sale falls on or after March 1, 2015, and stores whose last recorded sale falls on or before October 1, 2015, are treated as partial-year stores.

In [14]:
# A store passes the filter only when its first recorded sale is before the lower
# cutoff and its last recorded sale is after the upper cutoff.
lower_cutoff = pd.Timestamp("20150301")
upper_cutoff = pd.Timestamp("20151001")

opened_early = dates['Date amin'].lt(lower_cutoff)
closed_late = dates['Date amax'].gt(upper_cutoff)
mask = opened_early & closed_late

good_stores = dates.loc[mask, "Store Number"]

# Transactions belonging to the stores that passed (mask kept for later, per John's suggestion).
good_stores_df = df.loc[df["Store Number"].isin(good_stores)]

In [15]:
good_stores_df.head()

Unnamed: 0,Date,Store Number,City,Zip Code,County Number,County,Category,Category Name,Vendor Number,Item Number,Item Description,Bottle Volume (ml),State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Volume Sold (Gallons)
0,2015-11-04,3717,SUMNER,50674,9,Bremer,1051100,APRICOT BRANDIES,55,54436,Mr. Boston Apricot Brandy,750,4.5,6.75,12,81.0,9.0,2.38
1,2016-03-02,2614,DAVENPORT,52807,82,Scott,1011100,BLENDED WHISKIES,395,27605,Tin Cup,750,13.75,20.63,2,41.26,1.5,0.4
2,2016-02-11,2106,CEDAR FALLS,50613,7,Black Hawk,1011200,STRAIGHT BOURBON WHISKIES,65,19067,Jim Beam,1000,12.59,18.89,24,453.36,24.0,6.34
3,2016-02-03,2501,AMES,50010,85,Story,1071100,AMERICAN COCKTAILS,395,59154,1800 Ultimate Margarita,1750,9.5,14.25,6,85.5,10.5,2.77
4,2015-08-18,3654,BELMOND,50421,99,Wright,1031080,VODKA 80 PROOF,297,35918,Five O'clock Vodka,1750,7.2,10.8,12,129.6,21.0,5.55


In [16]:
# Rows left after restricting to the stores that passed the first/last sale cutoffs.
good_stores_df.shape

(256181, 18)

In [17]:
# Row count of the unfiltered cleaned frame, for comparison with good_stores_df.
df.shape

(269258, 18)

## Computing New Columns and Tables

Since we're trying to predict sales and/or profits, we'll want to compute some intermediate data. There are a lot of ways to do this and good use of pandas is crucial. For example, for each transaction we may want to know:
* unit margin: retail price minus bottle cost
* gross margin: unit margin * bottles sold
* price per bottle
* price per liter

We'll need to make a new dataframe that indexes quantities by store:
* sales per store for all of 2015
* sales per store for Q1 2015
* sales per store for Q1 2016
* total volumes sold
* mean transaction revenue, gross margin, price per bottle, price per liter, etc.
* average sales per day
* number of days open (count of sales per store)

Make sure to retain other variables that we'll want to use to build our models, such as zip code, county number, city, etc. We recommend that you spend some time thinking about the model you may want to fit and computing enough of the suggested quantities to give you a few options.

Bonus tasks:
* Restrict your attention to stores that were open for all of 2015 and Q1 2016. Stores that opened or closed in 2015 will introduce outliers into your data.
* For each transaction we have the item category. You may be able to determine the store type (primarily wine, liquor, all types of alcohol, etc.) by the most common transaction category for each store. This could be a useful categorical variable for modelling. 

##### Calculating Unit Margin

In [18]:
# Per-bottle margin: state retail price minus state cost.
df["Unit_Margin"] = df["State Bottle Retail"].sub(df["State Bottle Cost"])

##### Calculating Gross Margin

In [19]:
# Gross margin of a transaction: per-bottle margin times the number of bottles sold.
per_bottle_margin = df["State Bottle Retail"].sub(df["State Bottle Cost"])
df["Gross_Margin"] = per_bottle_margin.mul(df["Bottles Sold"])

##### Revenue per Liter

In [20]:
# Sale dollars earned per liter sold in each transaction.
df["Revenue_per_Liter"] = df["Sale (Dollars)"].div(df["Volume Sold (Liters)"])

##### Gross Margin per Liter

In [21]:
# Transaction gross margin (per-bottle margin * bottles sold) divided by liters sold.
transaction_margin = df["State Bottle Retail"].sub(df["State Bottle Cost"]).mul(df["Bottles Sold"])
df["Gross_Margin_per_Liter"] = transaction_margin.div(df["Volume Sold (Liters)"])

Checking our DataFrame after new column additions

In [22]:
df.head(2)

Unnamed: 0,Date,Store Number,City,Zip Code,County Number,County,Category,Category Name,Vendor Number,Item Number,...,State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Volume Sold (Gallons),Unit_Margin,Gross_Margin,Revenue_per_Liter,Gross_Margin_per_Liter
0,2015-11-04,3717,SUMNER,50674,9,Bremer,1051100,APRICOT BRANDIES,55,54436,...,4.5,6.75,12,81.0,9.0,2.38,2.25,27.0,9.0,3.0
1,2016-03-02,2614,DAVENPORT,52807,82,Scott,1011100,BLENDED WHISKIES,395,27605,...,13.75,20.63,2,41.26,1.5,0.4,6.88,13.76,27.506667,9.173333


##### Calculating Sales per Store for 2015

Step 1 - Filtering by start and end dates (January 1, 2015 and December 31, 2015, respectively)

In [23]:
# Order the transactions by store, then chronologically within each store.
df = df.sort_values(by=["Store Number", "Date"])

# Calendar-year 2015 window; both end points are included.
start_date = pd.Timestamp("20150101")
end_date = pd.Timestamp("20151231")

sales_mask_2015 = df['Date'].between(start_date, end_date)
sales_2015_df = df.loc[sales_mask_2015]

Step 2 - Group by Store Number

In [24]:
# Group the 2015 transactions by store. After this line `sales_2015_df` is a GroupBy
# object, not a DataFrame, so this cell cannot be re-run without re-running the filter
# cell first. as_index=False keeps "Store Number" as a regular column once aggregated.
sales_2015_df = sales_2015_df.groupby("Store Number", as_index=False)

Step 3 - Compute sums, means for key metrics and collapse the column indices

In [25]:
# Aggregate the grouped 2015 transactions down to one row per store.
# Reducers are named with strings ("sum"/"mean") instead of NumPy callables: the
# resulting column labels are the same, and newer pandas releases warn when
# np.sum / np.mean are handed to agg.
# `first_value` has to stay a lambda: the flattened column names end in "<lambda>",
# and later cells rename/clean the columns based on that suffix.
# NOTE(review): Category, Category Name, Vendor Number and Item Number are taken from
# each store's first row only, so they describe one transaction rather than the store.
first_value = lambda x: x.iloc[0]  # just extract once, should be the same per store

sales_2015_df = sales_2015_df.agg({"Sale (Dollars)": ["sum", "mean"],
                                   "Volume Sold (Liters)": ["sum", "mean"],
                                   "Unit_Margin": "mean",
                                   "Gross_Margin": ["sum", "mean"],
                                   "Revenue_per_Liter": "mean",
                                   "Gross_Margin_per_Liter": "mean",
                                   "Zip Code": first_value,
                                   "City": first_value,
                                   "County Number": first_value,
                                   "County": first_value,
                                   "Category": first_value,
                                   "Category Name": first_value,
                                   "Vendor Number": first_value,
                                   "Item Number": first_value,
                                   "State Bottle Cost": "mean",
                                   "State Bottle Retail": "mean",
                                   "Bottles Sold": ["sum", "mean"],
                                  })

# Flatten the (column, statistic) header into names such as "Sale (Dollars) sum".
sales_2015_df.columns = [' '.join(col).strip() for col in sales_2015_df.columns.values]

sales_2015_df.head(2)

Unnamed: 0,Store Number,Gross_Margin_per_Liter mean,City <lambda>,Sale (Dollars) sum,Sale (Dollars) mean,Revenue_per_Liter mean,Vendor Number <lambda>,Zip Code <lambda>,Unit_Margin mean,Category <lambda>,...,Gross_Margin mean,County Number <lambda>,Item Number <lambda>,Category Name <lambda>,State Bottle Cost mean,Volume Sold (Liters) sum,Volume Sold (Liters) mean,State Bottle Retail mean,Bottles Sold sum,Bottles Sold mean
0,2106,5.957151,CEDAR FALLS,146038.7,277.640114,17.844997,35,50613,5.160951,1062200,...,92.665779,7,43127,PUERTO RICO & VIRGIN ISLANDS RUM,10.298783,9719.85,18.478802,15.459734,10355,19.686312
1,2113,6.1739,GOWRIE,9310.22,63.33483,18.5077,260,50543,5.445102,1062310,...,21.149932,94,43334,SPICED RUM,10.870544,659.85,4.488776,16.315646,671,4.564626


In [26]:
sales_2015_df.columns

Index([u'Store Number', u'Gross_Margin_per_Liter mean', u'City <lambda>',
       u'Sale (Dollars) sum', u'Sale (Dollars) mean',
       u'Revenue_per_Liter mean', u'Vendor Number <lambda>',
       u'Zip Code <lambda>', u'Unit_Margin mean', u'Category <lambda>',
       u'County <lambda>', u'Gross_Margin sum', u'Gross_Margin mean',
       u'County Number <lambda>', u'Item Number <lambda>',
       u'Category Name <lambda>', u'State Bottle Cost mean',
       u'Volume Sold (Liters) sum', u'Volume Sold (Liters) mean',
       u'State Bottle Retail mean', u'Bottles Sold sum', u'Bottles Sold mean'],
      dtype='object')

In [27]:
# Number of stores in the per-store 2015 summary.
sales_2015_df['Store Number'].count()

1372

In [28]:
sales_2015_df.shape

(1372, 22)

In [29]:
# Give the zip column a clean name. Only the column is renamed: the previous call also
# passed index=str, which converted every row label to a string as a side effect.
sales_2015_df = sales_2015_df.rename(columns={"Zip Code <lambda>": "Zip_Code"})

In [30]:
# Preview: how many stores fall in each zip code.
sales_2015_df.groupby("Zip_Code", as_index=False).agg({"Store Number": "count"})

Unnamed: 0,Zip_Code,Store Number
0,50002,2
1,50003,3
2,50006,2
3,50009,9
4,50010,21
5,50014,4
6,50020,2
7,50021,12
8,50022,6
9,50023,9


In [31]:
# Store count per zip code; the "Store Number" column of the result holds the count.
stores_grouped_by_zip = sales_2015_df.groupby("Zip_Code", as_index=False)
sales_by_zip_df = stores_grouped_by_zip.agg({"Store Number": "count"})
sales_by_zip_df

Unnamed: 0,Zip_Code,Store Number
0,50002,2
1,50003,3
2,50006,2
3,50009,9
4,50010,21
5,50014,4
6,50020,2
7,50021,12
8,50022,6
9,50023,9


In [32]:
sales_by_zip_df.shape

(409, 2)

In [33]:
sales_by_zip_df.columns

Index([u'Zip_Code', u'Store Number'], dtype='object')

In [34]:
type(sales_by_zip_df)

pandas.core.frame.DataFrame

In [35]:
# Sanity check: the per-zip store counts should add up to the total number of stores.
sales_by_zip_df['Store Number'].sum()

1372

##### Merging 2 dataframes of different size

In [36]:
# Attach the per-zip store count to every store row. Both frames carry a
# "Store Number" column, so the merge suffixes them: "_x" comes from the per-store
# summary (the store id) and "_y" from the per-zip table (the store count).
# Still needed: another column, competitors = stores in zip minus 1.
test1_df = pd.merge(sales_2015_df, sales_by_zip_df, on='Zip_Code', how='inner')
test1_df

Unnamed: 0,Store Number_x,Gross_Margin_per_Liter mean,City <lambda>,Sale (Dollars) sum,Sale (Dollars) mean,Revenue_per_Liter mean,Vendor Number <lambda>,Zip_Code,Unit_Margin mean,Category <lambda>,...,County Number <lambda>,Item Number <lambda>,Category Name <lambda>,State Bottle Cost mean,Volume Sold (Liters) sum,Volume Sold (Liters) mean,State Bottle Retail mean,Bottles Sold sum,Bottles Sold mean,Store Number_y
0,2106,5.957151,CEDAR FALLS,146038.70,277.640114,17.844997,35,50613,5.160951,1062200,...,7,43127,PUERTO RICO & VIRGIN ISLANDS RUM,10.298783,9719.85,18.478802,15.459734,10355,19.686312,16
1,2572,6.285143,CEDAR FALLS,98831.43,75.965742,18.800332,410,50613,5.007233,1082900,...,7,64736,MISC. IMPORTED CORDIALS & LIQUEURS,9.967925,6080.74,4.673897,14.975158,6549,5.033820,16
2,2835,4.842733,CEDAR FALLS,11751.32,102.185391,14.460961,434,50613,5.394435,1041100,...,7,31658,AMERICAN DRY GINS,10.714000,991.50,8.621739,16.108435,790,6.869565,16
3,3769,4.508190,CEDAR FALLS,55042.92,174.739429,13.478561,259,50613,4.978190,1062310,...,7,43026,SPICED RUM,9.897619,4584.00,14.552381,14.875810,3936,12.495238,16
4,3995,6.029937,CEDAR FALLS,43059.99,219.693827,18.072485,260,50613,5.384694,1031200,...,7,77648,VODKA FLAVORED,10.755816,3022.00,15.418367,16.140510,3242,16.540816,16
5,4119,6.532637,CEDAR FALLS,7169.42,183.831282,19.591941,434,50613,5.784103,1031080,...,7,36308,VODKA 80 PROOF,11.563077,403.00,10.333333,17.347179,424,10.871795,16
6,4147,5.051960,CEDAR FALLS,32616.23,221.879116,15.117512,85,50613,4.858163,1011300,...,7,86670,TENNESSEE WHISKIES,9.663605,3340.50,22.724490,14.521769,2342,15.931973,16
7,4590,4.474135,CEDAR FALLS,2834.86,109.033077,13.305957,434,50613,3.552308,1031080,...,7,36308,VODKA 80 PROOF,6.978077,230.40,8.861538,10.530385,284,10.923077,16
8,4631,4.973834,CEDAR FALLS,3995.99,102.461282,14.880061,421,50613,4.793077,1081600,...,7,64858,WHISKEY LIQUEUR,9.547692,362.05,9.283333,14.340769,403,10.333333,16
9,4723,4.848254,CEDAR FALLS,1405.32,156.146667,14.489312,115,50613,4.835556,1012100,...,7,11788,CANADIAN WHISKIES,9.580000,105.00,11.666667,14.415556,108,12.000000,16


In [37]:
# Shape check after the merge (compare the row count with the number of stores).
test1_df.shape

(1372, 23)

### Groupbys and Pivot Tables

In [38]:
# old_names = ['$a', '$b', '$c', '$d', '$e'] 
# new_names = ['a', 'b', 'c', 'd', 'e']
# df.rename(columns=dict(zip(old_names, new_names)), inplace=True)


In [39]:
test1_df.columns

Index([u'Store Number_x', u'Gross_Margin_per_Liter mean', u'City <lambda>',
       u'Sale (Dollars) sum', u'Sale (Dollars) mean',
       u'Revenue_per_Liter mean', u'Vendor Number <lambda>', u'Zip_Code',
       u'Unit_Margin mean', u'Category <lambda>', u'County <lambda>',
       u'Gross_Margin sum', u'Gross_Margin mean', u'County Number <lambda>',
       u'Item Number <lambda>', u'Category Name <lambda>',
       u'State Bottle Cost mean', u'Volume Sold (Liters) sum',
       u'Volume Sold (Liters) mean', u'State Bottle Retail mean',
       u'Bottles Sold sum', u'Bottles Sold mean', u'Store Number_y'],
      dtype='object')

In [40]:
# Trim any leading whitespace from the column labels.
test1_df.columns = [label.lstrip() for label in test1_df.columns]
test1_df.head(2)

Unnamed: 0,Store Number_x,Gross_Margin_per_Liter mean,City <lambda>,Sale (Dollars) sum,Sale (Dollars) mean,Revenue_per_Liter mean,Vendor Number <lambda>,Zip_Code,Unit_Margin mean,Category <lambda>,...,County Number <lambda>,Item Number <lambda>,Category Name <lambda>,State Bottle Cost mean,Volume Sold (Liters) sum,Volume Sold (Liters) mean,State Bottle Retail mean,Bottles Sold sum,Bottles Sold mean,Store Number_y
0,2106,5.957151,CEDAR FALLS,146038.7,277.640114,17.844997,35,50613,5.160951,1062200,...,7,43127,PUERTO RICO & VIRGIN ISLANDS RUM,10.298783,9719.85,18.478802,15.459734,10355,19.686312,16
1,2572,6.285143,CEDAR FALLS,98831.43,75.965742,18.800332,410,50613,5.007233,1082900,...,7,64736,MISC. IMPORTED CORDIALS & LIQUEURS,9.967925,6080.74,4.673897,14.975158,6549,5.03382,16


In [41]:
test1_df.columns

Index([u'Store Number_x', u'Gross_Margin_per_Liter mean', u'City <lambda>',
       u'Sale (Dollars) sum', u'Sale (Dollars) mean',
       u'Revenue_per_Liter mean', u'Vendor Number <lambda>', u'Zip_Code',
       u'Unit_Margin mean', u'Category <lambda>', u'County <lambda>',
       u'Gross_Margin sum', u'Gross_Margin mean', u'County Number <lambda>',
       u'Item Number <lambda>', u'Category Name <lambda>',
       u'State Bottle Cost mean', u'Volume Sold (Liters) sum',
       u'Volume Sold (Liters) mean', u'State Bottle Retail mean',
       u'Bottles Sold sum', u'Bottles Sold mean', u'Store Number_y'],
      dtype='object')

In [42]:
# Drop the " <lambda>" suffix that the lambda aggregations left on the column labels.
test1_df.columns = [label.replace(' <lambda>', '') for label in test1_df.columns]
test1_df.columns

Index([u'Store Number_x', u'Gross_Margin_per_Liter mean', u'City',
       u'Sale (Dollars) sum', u'Sale (Dollars) mean',
       u'Revenue_per_Liter mean', u'Vendor Number', u'Zip_Code',
       u'Unit_Margin mean', u'Category', u'County', u'Gross_Margin sum',
       u'Gross_Margin mean', u'County Number', u'Item Number',
       u'Category Name', u'State Bottle Cost mean',
       u'Volume Sold (Liters) sum', u'Volume Sold (Liters) mean',
       u'State Bottle Retail mean', u'Bottles Sold sum', u'Bottles Sold mean',
       u'Store Number_y'],
      dtype='object')

In [43]:
test1_df.head(2)

Unnamed: 0,Store Number_x,Gross_Margin_per_Liter mean,City,Sale (Dollars) sum,Sale (Dollars) mean,Revenue_per_Liter mean,Vendor Number,Zip_Code,Unit_Margin mean,Category,...,County Number,Item Number,Category Name,State Bottle Cost mean,Volume Sold (Liters) sum,Volume Sold (Liters) mean,State Bottle Retail mean,Bottles Sold sum,Bottles Sold mean,Store Number_y
0,2106,5.957151,CEDAR FALLS,146038.7,277.640114,17.844997,35,50613,5.160951,1062200,...,7,43127,PUERTO RICO & VIRGIN ISLANDS RUM,10.298783,9719.85,18.478802,15.459734,10355,19.686312,16
1,2572,6.285143,CEDAR FALLS,98831.43,75.965742,18.800332,410,50613,5.007233,1082900,...,7,64736,MISC. IMPORTED CORDIALS & LIQUEURS,9.967925,6080.74,4.673897,14.975158,6549,5.03382,16


In [44]:
# "Store Number_y" is the per-zip store count brought in by the merge; give it a
# descriptive name. Only the column is renamed: the previous call also passed
# index=str, which converted every row label to a string as a side effect.
test1_df = test1_df.rename(columns={"Store Number_y": "Competitors_per_Zip"})
test1_df.head(2)

Unnamed: 0,Store Number_x,Gross_Margin_per_Liter mean,City,Sale (Dollars) sum,Sale (Dollars) mean,Revenue_per_Liter mean,Vendor Number,Zip_Code,Unit_Margin mean,Category,...,County Number,Item Number,Category Name,State Bottle Cost mean,Volume Sold (Liters) sum,Volume Sold (Liters) mean,State Bottle Retail mean,Bottles Sold sum,Bottles Sold mean,Competitors_per_Zip
0,2106,5.957151,CEDAR FALLS,146038.7,277.640114,17.844997,35,50613,5.160951,1062200,...,7,43127,PUERTO RICO & VIRGIN ISLANDS RUM,10.298783,9719.85,18.478802,15.459734,10355,19.686312,16
1,2572,6.285143,CEDAR FALLS,98831.43,75.965742,18.800332,410,50613,5.007233,1082900,...,7,64736,MISC. IMPORTED CORDIALS & LIQUEURS,9.967925,6080.74,4.673897,14.975158,6549,5.03382,16


In [45]:
# Competitors = number of other stores sharing the store's zip code.
# The value is recomputed from the rows themselves (the frame holds one row per store)
# rather than decremented in place, so running this cell more than once can no longer
# subtract 1 repeatedly.
stores_in_zip = test1_df.groupby('Zip_Code')['Store Number_x'].transform('count')
test1_df['Competitors_per_Zip'] = stores_in_zip - 1

In [46]:
test1_df.head(2)

Unnamed: 0,Store Number_x,Gross_Margin_per_Liter mean,City,Sale (Dollars) sum,Sale (Dollars) mean,Revenue_per_Liter mean,Vendor Number,Zip_Code,Unit_Margin mean,Category,...,County Number,Item Number,Category Name,State Bottle Cost mean,Volume Sold (Liters) sum,Volume Sold (Liters) mean,State Bottle Retail mean,Bottles Sold sum,Bottles Sold mean,Competitors_per_Zip
0,2106,5.957151,CEDAR FALLS,146038.7,277.640114,17.844997,35,50613,5.160951,1062200,...,7,43127,PUERTO RICO & VIRGIN ISLANDS RUM,10.298783,9719.85,18.478802,15.459734,10355,19.686312,15
1,2572,6.285143,CEDAR FALLS,98831.43,75.965742,18.800332,410,50613,5.007233,1082900,...,7,64736,MISC. IMPORTED CORDIALS & LIQUEURS,9.967925,6080.74,4.673897,14.975158,6549,5.03382,15


In [47]:
test1_df.columns

Index([u'Store Number_x', u'Gross_Margin_per_Liter mean', u'City',
       u'Sale (Dollars) sum', u'Sale (Dollars) mean',
       u'Revenue_per_Liter mean', u'Vendor Number', u'Zip_Code',
       u'Unit_Margin mean', u'Category', u'County', u'Gross_Margin sum',
       u'Gross_Margin mean', u'County Number', u'Item Number',
       u'Category Name', u'State Bottle Cost mean',
       u'Volume Sold (Liters) sum', u'Volume Sold (Liters) mean',
       u'State Bottle Retail mean', u'Bottles Sold sum', u'Bottles Sold mean',
       u'Competitors_per_Zip'],
      dtype='object')

##### This allows you to display all of the columns

In [49]:
# Raise the column display limit so wide frames are shown without the "..." gap.
pd.options.display.max_columns = 999


In [50]:
test1_df.head(2)

Unnamed: 0,Store Number_x,Gross_Margin_per_Liter mean,City,Sale (Dollars) sum,Sale (Dollars) mean,Revenue_per_Liter mean,Vendor Number,Zip_Code,Unit_Margin mean,Category,County,Gross_Margin sum,Gross_Margin mean,County Number,Item Number,Category Name,State Bottle Cost mean,Volume Sold (Liters) sum,Volume Sold (Liters) mean,State Bottle Retail mean,Bottles Sold sum,Bottles Sold mean,Competitors_per_Zip
0,2106,5.957151,CEDAR FALLS,146038.7,277.640114,17.844997,35,50613,5.160951,1062200,Black Hawk,48742.2,92.665779,7,43127,PUERTO RICO & VIRGIN ISLANDS RUM,10.298783,9719.85,18.478802,15.459734,10355,19.686312,15
1,2572,6.285143,CEDAR FALLS,98831.43,75.965742,18.800332,410,50613,5.007233,1082900,Black Hawk,33027.37,25.386141,7,64736,MISC. IMPORTED CORDIALS & LIQUEURS,9.967925,6080.74,4.673897,14.975158,6549,5.03382,15


### Groupby and Pivot Tables

In [58]:
# Need to clean up column names

In [53]:
# Roll the per-store summary up to one row per zip code.
# Reducers are named with strings instead of NumPy callables: the output columns are
# unchanged, and newer pandas releases warn when np.sum / np.mean are passed to agg.
# NOTE(review): every "mean" below averages store-level values with equal weight per
# store; it is not weighted by each store's number of transactions.
zip_sum1_df = test1_df.groupby(by='Zip_Code',
                               as_index=False).agg({'Volume Sold (Liters) sum': 'sum', #zip vols
                                                    'Volume Sold (Liters) mean': 'mean', #avg store vol
                                                    'Bottles Sold sum': 'sum', #total bottles for zip
                                                    'Bottles Sold mean': 'mean', #bottles/store

                                                    'Sale (Dollars) sum': 'sum', #gross sales for zip
                                                    'Sale (Dollars) mean': 'mean', #sales per store
                                                    'State Bottle Retail mean': 'mean', #avg sales price
                                                    'Revenue_per_Liter mean': 'mean', #avg rev per liter

                                                    'Gross_Margin sum': 'sum', #Total Gross Margin $s in zip
                                                    'Gross_Margin mean': 'mean', #Average GM per store
                                                    'Gross_Margin_per_Liter mean': 'mean', #Avg GM per liter
                                                    'Unit_Margin mean': 'mean', #Average Unit Margin (bottle)
                                                    })

zip_sum1_df

Unnamed: 0,Zip_Code,Unit_Margin mean,Volume Sold (Liters) mean,Gross_Margin_per_Liter mean,Bottles Sold sum,Gross_Margin sum,Sale (Dollars) mean,Bottles Sold mean,Revenue_per_Liter mean,State Bottle Retail mean,Sale (Dollars) sum,Volume Sold (Liters) sum,Gross_Margin mean
0,50002,3.799471,8.950962,5.468338,498,1663.48,129.295865,13.971154,16.334449,11.311010,4958.04,335.85,43.327853
1,50003,3.491705,10.458894,5.010616,2918,12734.43,136.528786,14.241261,15.009619,10.447933,38095.16,2737.52,45.606403
2,50006,4.715833,10.957363,5.102517,640,3369.97,137.555330,13.791209,15.282394,14.121474,10086.97,719.24,45.921813
3,50009,5.212057,10.726930,6.251109,17953,80566.24,164.469586,11.455541,18.717742,15.599463,241028.59,16284.00,54.957015
4,50010,4.838088,10.015967,6.010113,53617,258178.25,138.746251,10.391464,17.998099,14.482519,772044.25,52520.63,46.394115
5,50014,5.672069,7.601680,6.430975,3083,13423.74,110.536308,8.165572,19.257055,16.988671,40206.10,2871.26,36.897423
6,50020,4.100506,7.779466,5.055149,704,2907.57,87.607652,9.740829,15.157084,12.294948,8716.27,686.60,29.225108
7,50021,4.887499,12.504997,5.729734,26546,140000.28,185.192100,12.252743,17.149486,14.617388,417794.83,28782.22,62.020481
8,50022,4.619031,10.235515,5.358665,6849,31164.05,120.998894,10.427229,16.053641,13.830538,93176.80,7199.92,40.544068
9,50023,4.775067,6.403562,5.840266,12093,60094.78,86.292030,7.263944,17.484060,14.287802,179657.93,11379.28,28.872319


In [57]:
# Summary statistics (count, mean, std, quartiles, min/max) across zip codes for each
# of the per-zip metrics computed above.
zip_economics = zip_sum1_df.describe()
zip_economics

Unnamed: 0,Unit_Margin mean,Volume Sold (Liters) mean,Gross_Margin_per_Liter mean,Bottles Sold sum,Gross_Margin sum,Sale (Dollars) mean,Bottles Sold mean,Revenue_per_Liter mean,State Bottle Retail mean,Sale (Dollars) sum,Volume Sold (Liters) sum,Gross_Margin mean
count,409.0,409.0,409.0,409.0,409.0,409.0,409.0,409.0,409.0,409.0,409.0,409.0
mean,4.478848,8.444306,5.475616,5312.616137,23285.925697,111.474184,9.845367,16.391895,13.398114,69629.98,4851.157335,37.312706
std,0.847453,3.865844,1.078871,10692.295033,48869.945345,54.777782,5.116654,3.240266,2.543043,146268.1,9605.821665,18.340656
min,1.515,1.283333,3.173605,6.0,30.0,24.911009,2.288184,9.474014,4.54,90.0,4.5,8.325418
25%,4.076296,6.189922,4.947954,362.0,1362.78,79.899147,6.849671,14.806887,12.15302,4086.72,326.36,26.774648
50%,4.496012,7.877163,5.3898,994.0,3959.95,104.79959,9.169748,16.131523,13.462229,11825.21,964.55,34.990641
75%,4.866614,10.015967,5.887173,5054.0,21699.11,129.440076,11.490196,17.630704,14.558562,64992.27,4932.29,43.346382
max,13.850294,42.75,18.43541,75492.0,356316.11,554.37,57.0,55.282316,41.53076,1068201.0,69711.67,184.95
