In [1]:
import pandas as pd
import numpy as np

brewery_2022_interpolated = pd.read_csv("Data/brewery_production_2022_interpolated.csv")
brewery_2017_2019 = pd.read_csv("Data/brewery_production_2017_2019.csv")

In [67]:
brewery_2022_interpolated.columns

Index(['brewery', 'brewery_type', 'address', 'city', 'zipcode', 'county',
       'latitude', 'longitude', 'year_established', 'guild_member',
       'closed_since_2022', 'untappd_profile_link', 'January', 'February',
       'March', 'April', 'May', 'June', 'July', 'August', 'September',
       'October', 'November', 'December', 'total_annual'],
      dtype='object')

In [69]:
brewery_2022_total = pd.DataFrame({
    "brewery_name": brewery_2022_interpolated["brewery"], 
    "annual_production": brewery_2022_interpolated["total_annual"],
    "year": 2022,
    "estimate": 1
})


In [71]:
brewery_2017_2022 = pd.concat([brewery_2022_total, brewery_2017_2019])

In [72]:
def num_handler(s):
    if s == "DNP" or s == "Do Not Publish":
        return np.nan
    if type(s) == str:
        s = s.replace(",", "")
    return float(s)

In [89]:
brewery_2017_2022

Unnamed: 0,brewery_name,annual_production,year,estimate
0,192 Brewing Company,,2022.0,1.0
1,20 Corners Brewing Company,1481.65,2022.0,1.0
2,23rd Ave Brewery,,2022.0,1.0
3,4 Stitch Brewing Company,,2022.0,1.0
4,45 Degree Brewhouse,87.07,2022.0,1.0
...,...,...,...,...
1224,Willapa Brewing Co,155,2017.0,1.0
1225,Wingman Brewers,876,2017.0,1.0
1226,Yakima Craft Brewing Company,1500,2017.0,0.0
1227,Yakima Valley Hops,5,2017.0,1.0


In [178]:
brewery_production_trend = pd.DataFrame(brewery_2017_2022.groupby("brewery_name")["annual_production"].agg(lambda x: np.nan if len(x) == 1 else np.polyfit(np.arange(len(x)), np.array(list(map(lambda v: num_handler(v) ,x.values[::-1]))), 1)[0] / num_handler(x.iloc[-1])))
brewery_production_trend.rename(columns={"annual_production": "production_trend"}, inplace=True)
brewery_production_size = pd.DataFrame(brewery_2017_2022.groupby("brewery_name")["annual_production"].agg(lambda x: np.sum(list(map(lambda v: num_handler(v) ,x.values)))/len(x)))
# categorize brewery production by percentile
brewery_production_size["annual_production"] = pd.qcut(brewery_production_size["annual_production"], 10, labels=False)
brewery_production_size.rename(columns={"annual_production": "production_size"}, inplace=True)
brewery_production_stat = pd.concat([brewery_production_trend, brewery_production_size], axis=1)

In [179]:
rawDF = pd.merge(brewery_2022_interpolated, brewery_production_stat, left_on="brewery", right_on="brewery_name", how="inner")

In [180]:
dataDF = rawDF[["brewery", 'brewery_type', 'city', 'county', 'latitude', 'longitude',
       'year_established', 'guild_member',
       'closed_since_2022', "production_trend", "production_size"]]
dataDF['year_established'] = dataDF['year_established'].apply(lambda x: 2022 - x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataDF['year_established'] = dataDF['year_established'].apply(lambda x: 2022 - x)


In [181]:
dataDF = pd.get_dummies(dataDF[['brewery_type', 'latitude', 'longitude',
       'year_established', 'guild_member',
       'closed_since_2022', "production_trend", "production_size"]])
dataDF.dropna(inplace=True)

In [182]:
dataDF.head()

Unnamed: 0,latitude,longitude,year_established,guild_member,closed_since_2022,production_trend,production_size,brewery_type_Micro,brewery_type_alt prop,brewery_type_brewpub,brewery_type_contract brewer,brewery_type_large,brewery_type_micro,brewery_type_nano,brewery_type_planning,brewery_type_regional,brewery_type_taproom
1,47.765067,-122.151015,6,0,0,0.052361,8.0,0,0,1,0,0,0,0,0,0,0
5,47.717587,-117.43285,2,0,1,0.182832,5.0,0,0,0,0,0,0,0,0,0,1
7,45.564832,-122.327074,7,0,0,0.259029,9.0,0,0,1,0,0,0,0,0,0,0
9,47.242638,-122.439135,13,1,0,-0.110804,9.0,0,0,0,0,0,0,0,0,0,1
10,47.237965,-122.293459,1,1,0,0.229702,5.0,0,0,0,0,0,0,0,0,0,1


In [183]:
dataDF.shape

(266, 17)

In [184]:
X = pd.get_dummies(dataDF.drop(columns=["closed_since_2022"])).values
y = dataDF["closed_since_2022"].values

In [185]:
dataDF["closed_since_2022"].value_counts()

0    254
1     12
Name: closed_since_2022, dtype: int64

In [191]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

from sklearn.decomposition import PCA

pca = PCA(n_components=int(np.sqrt(X.shape[1])))
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)


In [192]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

import sklearn.model_selection as skm
from ISLP import load_data, confusion_table
from sklearn.svm import SVC

# fit a support vector classifier on the training set
svm = SVC(kernel="rbf")
kfold = skm.KFold(5, 
                  random_state=0,
                  shuffle=True)
grid = skm.GridSearchCV(svm,
                        {'C':[0.01,0.1,1,5,10,100], "gamma": [0.001, 0.01,0.1,1,5,10, 100], "kernel":["linear", "rbf"], "class_weight":["balanced"]},
                        refit=True,
                        cv=kfold,
                        scoring='accuracy')
grid.fit(X_train, y_train)
print("grid best params:", grid.best_params_)

best_ = grid.best_estimator_
# train error rate with best model
y_train_hat = best_.predict(X_train)
print("best estimator train error:", 1 - accuracy_score(y_train, y_train_hat))

# test error rate with best model
y_test_hat = best_.predict(X_test)
print("best estimator test error:", 1 - accuracy_score(y_test, y_test_hat))

grid best params: {'C': 1, 'class_weight': 'balanced', 'gamma': 100, 'kernel': 'rbf'}
best estimator train error: 0.0
best estimator test error: 0.01851851851851849


In [193]:
np.unique(y_train_hat)

array([0, 1], dtype=int64)

In [194]:
confusion_table(y_test_hat, y_test)

Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,53,1
1,0,0


In [195]:
confusion_table(y_train_hat, y_train)

Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,201,0
1,0,11
