# Getting started

Once you've chosen your scenario, download the data from [the Iowa website](https://data.iowa.gov/Economy/Iowa-Liquor-Sales/m3tr-qhgy) in csv format. Start by loading the data with pandas. You may need to parse the date columns appropriately.

In [1]:
# Loading initial libraries, starting with data.table and lubridate for easy cleaning
library(data.table)
library(lubridate)
library(ggplot2)
# reading in the 10 pct CSV and converting it to a data frame (possibly redundant)
liquor = read.csv('../datasets/Iowa_Liquor_sales_sample_10pct.csv')
liquor_data = data.table(liquor)
#Printing out the head
head(liquor_data)


Attaching package: ‘lubridate’

The following objects are masked from ‘package:data.table’:

    hour, mday, month, quarter, wday, week, yday, year



Unnamed: 0,Date,Store.Number,City,Zip.Code,County.Number,County,Category,Category.Name,Vendor.Number,Item.Number,Item.Description,Bottle.Volume..ml.,State.Bottle.Cost,State.Bottle.Retail,Bottles.Sold,Sale..Dollars.,Volume.Sold..Liters.,Volume.Sold..Gallons.
1,11/04/2015,3717,SUMNER,50674,9,Bremer,1051100,APRICOT BRANDIES,55,54436,Mr. Boston Apricot Brandy,750,$4.50,$6.75,12,$81.00,9.0,2.38
2,03/02/2016,2614,DAVENPORT,52807,82,Scott,1011100,BLENDED WHISKIES,395,27605,Tin Cup,750,$13.75,$20.63,2,$41.26,1.5,0.4
3,02/11/2016,2106,CEDAR FALLS,50613,7,Black Hawk,1011200,STRAIGHT BOURBON WHISKIES,65,19067,Jim Beam,1000,$12.59,$18.89,24,$453.36,24.0,6.34
4,02/03/2016,2501,AMES,50010,85,Story,1071100,AMERICAN COCKTAILS,395,59154,1800 Ultimate Margarita,1750,$9.50,$14.25,6,$85.50,10.5,2.77
5,08/18/2015,3654,BELMOND,50421,99,Wright,1031080,VODKA 80 PROOF,297,35918,Five O'clock Vodka,1750,$7.20,$10.80,12,$129.60,21.0,5.55
6,04/20/2015,2569,CEDAR RAPIDS,52402,57,Linn,1041100,AMERICAN DRY GINS,205,31473,New Amsterdam Gin,1750,$13.32,$19.98,6,$119.88,10.5,2.77


In [2]:
#Removing the Gallons Sold column because it's redundant. Hard liquor is measured in Liters.
liquor_data$Volume.Sold..Gallons. = NULL

In [3]:
# Getting the list of names from the data frame
names(liquor_data)

In [4]:
#Renaming some of the columns. Leaving the dots as spaces for ease of use later.
names(liquor_data) = c('Date', 'Store.Number', 'City', 'Zip.Code', 'County.Number','County', 'Category','Category.Name', 'Vendor.Number',
                       'Item.Number', 'Item.Description', 'Bottle.Volume(ml)','State.Bottle.Cost',
                       'State.Bottle.Retail', 'Bottles.Sold','Sale.Dollars','Volume.Sold(Liters)')
#Reprint names
names(liquor_data)

In [5]:
#printing out the structure of the data frame
str(liquor_data)

Classes ‘data.table’ and 'data.frame':	270955 obs. of  17 variables:
 $ Date               : Factor w/ 274 levels "01/04/2016","01/05/2015",..: 237 69 48 38 189 116 181 157 1 240 ...
 $ Store.Number       : int  3717 2614 2106 2501 3654 2569 2596 3456 4757 4346 ...
 $ City               : Factor w/ 385 levels "ACKLEY","ADAIR",..: 337 82 52 12 31 53 272 63 36 313 ...
 $ Zip.Code           : Factor w/ 415 levels "50002","50003",..: 179 413 154 5 104 361 366 107 14 344 ...
 $ County.Number      : num  9 82 7 85 99 57 90 17 77 6 ...
 $ County             : Factor w/ 100 levels "","Adair","Adams",..: 10 83 8 86 100 58 91 18 78 7 ...
 $ Category           : num  1051100 1011100 1011200 1071100 1031080 ...
 $ Category.Name      : Factor w/ 72 levels "","100 PROOF VODKA",..: 12 15 60 6 67 7 8 18 37 20 ...
 $ Vendor.Number      : int  55 395 65 395 297 205 85 65 370 65 ...
 $ Item.Number        : int  54436 27605 19067 59154 35918 31473 52806 10628 34006 82610 ...
 $ Item.Description   : Factor

In [6]:
#Converting Date column to Date Time using lubridate
liquor_data$Date = mdy(liquor_data$Date)

In [None]:
head(liquor_data)

In [7]:
#Removing '$' from the money columns and changing them to numerics

liquor_data$State.Bottle.Cost = as.numeric(sub('\\$','',as.character(liquor_data$State.Bottle.Cost)))
liquor_data$Sale.Dollars = as.numeric(sub('\\$','',as.character(liquor_data$Sale.Dollars)))
liquor_data$State.Bottle.Retail = as.numeric(sub('\\$','',as.character(liquor_data$State.Bottle.Retail)))

#rechecking the structure of the data frame
# summary(liquor_data)

In [9]:
liquor_data$Net.Bottle.Sale = (liquor_data$State.Bottle.Retail - liquor_data$State.Bottle.Cost)
head(liquor_data)

Unnamed: 0,Date,Store.Number,City,Zip.Code,County.Number,County,Category,Category.Name,Vendor.Number,Item.Number,Item.Description,Bottle.Volume(ml),State.Bottle.Cost,State.Bottle.Retail,Bottles.Sold,Sale.Dollars,Volume.Sold(Liters),Net.Bottle.Sale
1,2015-11-04,3717,SUMNER,50674,9,Bremer,1051100,APRICOT BRANDIES,55,54436,Mr. Boston Apricot Brandy,750,4.5,6.75,12,81.0,9.0,2.25
2,2016-03-02,2614,DAVENPORT,52807,82,Scott,1011100,BLENDED WHISKIES,395,27605,Tin Cup,750,13.75,20.63,2,41.26,1.5,6.88
3,2016-02-11,2106,CEDAR FALLS,50613,7,Black Hawk,1011200,STRAIGHT BOURBON WHISKIES,65,19067,Jim Beam,1000,12.59,18.89,24,453.36,24.0,6.3
4,2016-02-03,2501,AMES,50010,85,Story,1071100,AMERICAN COCKTAILS,395,59154,1800 Ultimate Margarita,1750,9.5,14.25,6,85.5,10.5,4.75
5,2015-08-18,3654,BELMOND,50421,99,Wright,1031080,VODKA 80 PROOF,297,35918,Five O'clock Vodka,1750,7.2,10.8,12,129.6,21.0,3.6
6,2015-04-20,2569,CEDAR RAPIDS,52402,57,Linn,1041100,AMERICAN DRY GINS,205,31473,New Amsterdam Gin,1750,13.32,19.98,6,119.88,10.5,6.66


In [10]:
liquor_data$Net.Total.Profit = (liquor_data$Net.Bottle.Sale * liquor_data$Bottles.Sold)
head(liquor_data)

Unnamed: 0,Date,Store.Number,City,Zip.Code,County.Number,County,Category,Category.Name,Vendor.Number,Item.Number,Item.Description,Bottle.Volume(ml),State.Bottle.Cost,State.Bottle.Retail,Bottles.Sold,Sale.Dollars,Volume.Sold(Liters),Net.Bottle.Sale,Net.Total.Profit
1,2015-11-04,3717,SUMNER,50674,9,Bremer,1051100,APRICOT BRANDIES,55,54436,Mr. Boston Apricot Brandy,750,4.5,6.75,12,81.0,9.0,2.25,27.0
2,2016-03-02,2614,DAVENPORT,52807,82,Scott,1011100,BLENDED WHISKIES,395,27605,Tin Cup,750,13.75,20.63,2,41.26,1.5,6.88,13.76
3,2016-02-11,2106,CEDAR FALLS,50613,7,Black Hawk,1011200,STRAIGHT BOURBON WHISKIES,65,19067,Jim Beam,1000,12.59,18.89,24,453.36,24.0,6.3,151.2
4,2016-02-03,2501,AMES,50010,85,Story,1071100,AMERICAN COCKTAILS,395,59154,1800 Ultimate Margarita,1750,9.5,14.25,6,85.5,10.5,4.75,28.5
5,2015-08-18,3654,BELMOND,50421,99,Wright,1031080,VODKA 80 PROOF,297,35918,Five O'clock Vodka,1750,7.2,10.8,12,129.6,21.0,3.6,43.2
6,2015-04-20,2569,CEDAR RAPIDS,52402,57,Linn,1041100,AMERICAN DRY GINS,205,31473,New Amsterdam Gin,1750,13.32,19.98,6,119.88,10.5,6.66,39.96


In [None]:
#creating a new data frame with sales by date (Maybe not useful for this project, but still fun)

# datesales = liquor_data[,.(Sale.Dollars.Sum = sum(Sale.Dollars)),by=Date]
# datesales

countysales = liquor_data[,.(Sale.Dollars.Mean = mean(Sale.Dollars)),by=County]
countysales
countynames = c(countysales$county)

# Explore the data

Perform some exploratory statistical analysis and make some plots, such as histograms of transaction totals, bottles sold, etc.

In [21]:
mask = liquor_data$Sale.Dollars < 500

In [26]:
with(liquor_data[(mask)], ggplot(liquor_data$Sale.Dollars, aes(x=Sale.Dollars))+ geom_histogram(data=liquor_data, stat_bin=50))

ERROR: Error: ggplot2 doesn't know how to deal with data of class numeric


In [41]:
Sales = liquor_data[,.(Net.Total.Profit.Sum = sum(Net.Total.Profit)), (Bottles.Sold.Sum = sum(Bottles.Sold)),
                    by=c('County', 'Category.Name')]

ERROR: Error in `[.data.table`(liquor_data, , .(Net.Total.Profit.Sum = sum(Net.Total.Profit)), : Provide either 'by' or 'keyby' but not both


In [42]:
SalesY = liquor_data[,.(Net.Total.Profit.Sum = sum(Net.Total.Profit)),by=c('County', 'Category.Name')]
SalesY

ERROR: Error in `[.data.table`(liquor_data, , .(Net.Total.Profit.Sum = sum(Net.Total.Profit)), : Provide either 'by' or 'keyby' but not both


Unnamed: 0,County,Category.Name,Net.Total.Profit.Sum
1,Bremer,APRICOT BRANDIES,461.75
2,Scott,BLENDED WHISKIES,37467.82
3,Black Hawk,STRAIGHT BOURBON WHISKIES,29745.8
4,Story,AMERICAN COCKTAILS,5363.7
5,Wright,VODKA 80 PROOF,3922.99
6,Linn,AMERICAN DRY GINS,15367.24
7,Wapello,AMERICAN GRAPE BRANDIES,1046.61
8,Cerro Gordo,CANADIAN WHISKIES,39764.44
9,Polk,IMPORTED VODKA,181342.3
10,Benton,CINNAMON SCHNAPPS,129.59


In [40]:
SalesZ = liquor_data[,.(Bottles.Sold.Sum = sum(Bottles.Sold)),by=c('County','Category.Name')]
SalesZ

Unnamed: 0,County,Category.Name,Bottles.Sold.Sum
1,Bremer,APRICOT BRANDIES,166
2,Scott,BLENDED WHISKIES,12851
3,Black Hawk,STRAIGHT BOURBON WHISKIES,5105
4,Story,AMERICAN COCKTAILS,1492
5,Wright,VODKA 80 PROOF,1477
6,Linn,AMERICAN DRY GINS,5913
7,Wapello,AMERICAN GRAPE BRANDIES,432
8,Cerro Gordo,CANADIAN WHISKIES,9502
9,Polk,IMPORTED VODKA,26122
10,Benton,CINNAMON SCHNAPPS,28


In [46]:
merge(SalesY, SalesZ, by=c('County', 'Category.Name'))

Unnamed: 0,County,Category.Name,Net.Total.Profit.Sum,Bottles.Sold.Sum
1,,,773.97,144
2,,100 PROOF VODKA,66.36,54
3,,AMERICAN ALCOHOL,12.92,4
4,,AMERICAN AMARETTO,56.81,25
5,,AMERICAN COCKTAILS,858.28,240
6,,AMERICAN DRY GINS,439.63,278
7,,AMERICAN GRAPE BRANDIES,809.39,532
8,,AMERICAN SLOE GINS,32.52,12
9,,APPLE SCHNAPPS,81.9,18
10,,APRICOT BRANDIES,85.68,42


In [None]:
regplot(liquor_data, xlab = Net.Total.Profit, ylab = Net.Sales, poly = 1, position = 6, colors = TRUE, 
mean = TRUE, variable = 1, x.plateau = NULL)

In [None]:
#plot(fit)

In [None]:
# library(ggplot2)
# geom_bar(data=countysales, stat='identity')

In [None]:
liquor_num_cols = sapply(liquor_data, is.numeric)
liquor_num = liquor_data[, liquor_num_cols, with=F]

In [None]:
head(liquor_num)

In [None]:
cor(liquor_num, use='pairwise.complete.obs')

In [None]:
tmp = liquor_num[, retail_per_liter := max(State.Bottle.Retail/(get('Bottle.Volume(ml)')/1000)), by='Vendor.Number']

In [None]:
head(tmp)


In [None]:
liquor_data$multibottle_binary = NULL
head(liquor_data)

In [None]:
model = lm('Bottles.Sold ~ Category.Name', data= liquor_data)
summary(model)