In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
df = pd.read_csv('ticket_data.csv').drop(columns=['Unnamed: 0'])

In [3]:
pop = pd.read_csv('zipcode_population.csv',dtype='str').drop(columns=['Unnamed: 0'])

In [4]:
df.columns

Index(['datetime_local', 'is_ga', 'id', 'popularity_sg', 'short_title',
       'tm_date', 'tm_event', 'listing_count', 'average_price', 'median_price',
       'lowest_price', 'headliner', 'is_ga.1', 'event_name', 'price_min',
       'price_max', 'max_tickets', 'venue_name', 'postalCode', 'city', 'state',
       'country', 'address', 'longitude', 'latitude', 'num_markets', 'genre',
       'subgenre', 'event_date', 'is_presale', 'days_event', 'day_of_week',
       'date'],
      dtype='object')

In [5]:
pop.zip_code

0        00210
1        00211
2        00212
3        00213
4        00214
         ...  
42100    99714
42101    99716
42102    99725
42103    99775
42104    99790
Name: zip_code, Length: 42105, dtype: object

In [6]:
merged = pd.merge(left=df,right=pop,how='left', left_on='postalCode',right_on='zip_code')

In [7]:
merged.pop_2018.isna()

0       False
1       False
2       False
3       False
4       False
        ...  
6237    False
6238    False
6239    False
6240    False
6241    False
Name: pop_2018, Length: 6242, dtype: bool

In [8]:
venues = pd.read_csv('scraping/cleaned_venues.csv')

In [9]:
merged.venue_name

0       Mercury Ballroom
1        Madison Theater
2       Mercury Ballroom
3       Mercury Ballroom
4       Mercury Ballroom
              ...       
6237    Mercury Ballroom
6238       Madison Live!
6239    Mercury Ballroom
6240       Madison Live!
6241       Madison Live!
Name: venue_name, Length: 6242, dtype: object

In [10]:
from fuzzywuzzy import fuzz
name = 'temp'
temp = 0
d = {}
for merged_venue in list(set(merged.venue_name)):
    for cap_venue in list(set(venues.Venue)):
        ratio = fuzz.token_sort_ratio(merged_venue, cap_venue)
        if ratio > temp:
            temp = ratio
            name = cap_venue
    if temp >= 80:
        d[merged_venue] = name
        temp = 0

In [11]:
def try_fuzz(x):
    try:
        return d[x]
    except:
        return np.nan

In [12]:
merged['venue_fuzz'] = merged.venue_name.apply(lambda x: try_fuzz(x))

In [13]:
venue_merge = pd.merge(left=merged,right=venues,how='left', left_on='venue_fuzz',right_on='Venue')

In [14]:
venue_merge.Capacity.isna().sum()

3814

In [15]:
venue_merge.shape

(6871, 44)

In [16]:
venue_merge.columns

Index(['datetime_local', 'is_ga', 'id', 'popularity_sg', 'short_title',
       'tm_date', 'tm_event', 'listing_count', 'average_price', 'median_price',
       'lowest_price', 'headliner', 'is_ga.1', 'event_name', 'price_min',
       'price_max', 'max_tickets', 'venue_name', 'postalCode', 'city',
       'state_x', 'country', 'address', 'longitude', 'latitude', 'num_markets',
       'genre', 'subgenre', 'event_date', 'is_presale', 'days_event',
       'day_of_week', 'date', 'zip_code', 'state_y', 'msa_num', 'msa_name_x',
       'msa_name_fuzz', 'msa_name_y', 'pop_2018', 'venue_fuzz', 'Venue',
       'City', 'Capacity'],
      dtype='object')

In [17]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

client_id='38e94de386924d73b4098a6a28d45070'
client_secret='27899e6f31894a6aa3c572268f8658e5'

client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
spot = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [18]:
popularity = []
for artist in tqdm(venue_merge.headliner):
    results = spot.search(q='artist:' + artist, type='artist')
    try:
        x = results['artists']['items'][0]['popularity']
        popularity.append(x)
    except:
        popularity.append(np.nan)

100%|██████████| 6871/6871 [12:41<00:00,  9.02it/s]


In [19]:
venue_merge['spotify_popularity'] = popularity

In [20]:
venue_merge.columns

Index(['datetime_local', 'is_ga', 'id', 'popularity_sg', 'short_title',
       'tm_date', 'tm_event', 'listing_count', 'average_price', 'median_price',
       'lowest_price', 'headliner', 'is_ga.1', 'event_name', 'price_min',
       'price_max', 'max_tickets', 'venue_name', 'postalCode', 'city',
       'state_x', 'country', 'address', 'longitude', 'latitude', 'num_markets',
       'genre', 'subgenre', 'event_date', 'is_presale', 'days_event',
       'day_of_week', 'date', 'zip_code', 'state_y', 'msa_num', 'msa_name_x',
       'msa_name_fuzz', 'msa_name_y', 'pop_2018', 'venue_fuzz', 'Venue',
       'City', 'Capacity', 'spotify_popularity'],
      dtype='object')

In [21]:
train_df = venue_merge[['is_ga', 'average_price', 'median_price','lowest_price',
                        'headliner', 'price_min','price_max', 'max_tickets','longitude',
                        'latitude', 'num_markets','is_presale', 'days_event','day_of_week', 
                        'Capacity', 'spotify_popularity']].copy()

In [22]:
train_df.rename(columns = {'average_price':'resale_average_price','median_price':'resale_median_price',
                           'lowest_price':'resale_lowest_price','price_min':'tm_price_min','price_max':'tm_price_max'},
               inplace=True)

In [23]:
train_df['tm_avg_price']= (train_df.tm_price_max+train_df.tm_price_min)/2

In [24]:
train_df['target'] = (train_df.resale_median_price - train_df.tm_avg_price)/train_df.tm_avg_price

In [25]:
train_df

Unnamed: 0,is_ga,resale_average_price,resale_median_price,resale_lowest_price,headliner,tm_price_min,tm_price_max,max_tickets,longitude,latitude,num_markets,is_presale,days_event,day_of_week,Capacity,spotify_popularity,tm_avg_price,target
0,False,140.0,77.0,41.0,Cherub,20.0,20.0,10,-85.757890,38.248514,1.0,1,70.0,5.0,,58.0,20.00,2.850000
1,False,192.0,73.0,64.0,Gregory Alan Isakov,24.5,29.0,10,-84.509506,39.082688,1.0,1,91.0,5.0,,68.0,26.75,1.728972
2,False,93.0,46.0,27.0,Badfish,15.0,15.0,10,-85.757890,38.248514,1.0,1,152.0,3.0,,13.0,15.00,2.066667
3,False,38.0,28.0,20.0,The Four Horsemen,10.0,10.0,10,-85.757890,38.248514,1.0,1,70.0,5.0,,24.0,10.00,1.800000
4,False,111.0,72.0,31.0,Dzeko,20.0,20.0,10,-85.757890,38.248514,1.0,1,109.0,1.0,,71.0,20.00,2.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6866,False,75.0,58.0,29.0,The Iron Maidens,13.0,13.0,10,-85.757890,38.248514,1.0,1,154.0,5.0,,,13.00,3.461538
6867,True,44.0,39.0,38.0,Bumpin Uglies,12.0,15.0,10,-84.509719,39.082536,1.0,0,67.0,2.0,,51.0,13.50,1.888889
6868,False,40.0,34.0,24.0,That Arena Rock Show,10.0,10.0,10,-85.757890,38.248514,1.0,1,102.0,2.0,,,10.00,2.400000
6869,True,53.0,44.0,41.0,Aqueous,15.0,18.0,10,-84.509719,39.082536,1.0,0,121.0,4.0,,29.0,16.50,1.666667


In [26]:
train_df.columns

Index(['is_ga', 'resale_average_price', 'resale_median_price',
       'resale_lowest_price', 'headliner', 'tm_price_min', 'tm_price_max',
       'max_tickets', 'longitude', 'latitude', 'num_markets', 'is_presale',
       'days_event', 'day_of_week', 'Capacity', 'spotify_popularity',
       'tm_avg_price', 'target'],
      dtype='object')

In [27]:
train_df.drop(columns=['resale_average_price', 'resale_median_price',
       'resale_lowest_price','tm_price_min', 'tm_price_max'],inplace=True)

In [28]:
train_df.drop(columns=['headliner','longitude','latitude'],inplace=True)

In [29]:
train_df.is_ga = train_df.is_ga.astype('int')
train_df.max_tickets.replace('6 for presales', inplace=True)
train_df.max_tickets = train_df.max_tickets.astype('int')

In [30]:
from tpot import TPOTClassifier, TPOTRegressor
import xgboost

In [31]:
train_df['bool_target'] = train_df.target.apply(lambda x: 1 if x >= 1.3 else 0)

In [32]:
train_df.bool_target.sum()/train_df.shape[0]

0.4080919807888226

In [33]:
train_df.dropna(subset=['spotify_popularity'],inplace=True)
train_df.dropna(subset=['num_markets'],inplace=True)
train_df.dropna(subset=['target'],inplace=True)

In [34]:
X = train_df.drop(columns=['bool_target','target','day_of_week','Capacity'])
y = train_df.bool_target

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [37]:
pipeline_optimizer = TPOTClassifier(generations=10, population_size=10, cv=5,
                                    random_state=42, verbosity=2, n_jobs=-1)

In [38]:
pipeline_optimizer.fit(X_train, y_train)

HBox(children=(IntProgress(value=0, description='Optimization Progress', max=110, style=ProgressStyle(descript…

Generation 1 - Current best internal CV score: 0.7965736751556967
Generation 2 - Current best internal CV score: 0.7965736751556967
Generation 3 - Current best internal CV score: 0.7965736751556967
Generation 4 - Current best internal CV score: 0.7968012064493719
Generation 5 - Current best internal CV score: 0.7988565186571966
Generation 6 - Current best internal CV score: 0.8013671537836171
Generation 7 - Current best internal CV score: 0.8013671537836171
Generation 8 - Current best internal CV score: 0.805705320203713
Generation 9 - Current best internal CV score: 0.805705320203713
Generation 10 - Current best internal CV score: 0.805705320203713

Best pipeline: ExtraTreesClassifier(CombineDFs(input_matrix, ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.6000000000000001, min_samples_leaf=18, min_samples_split=15, n_estimators=100)), bootstrap=False, criterion=entropy, max_features=0.6500000000000001, min_samples_leaf=3, min_samples_split=20, n_esti

TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
               disable_update_check=False, early_stop=None, generations=10,
               max_eval_time_mins=5, max_time_mins=None, memory=None,
               mutation_rate=0.9, n_jobs=-1, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=10,
               random_state=42, scoring=None, subsample=1.0, template=None,
               use_dask=False, verbosity=2, warm_start=False)

In [39]:
pipeline_optimizer.export('concert_pipeline_classifier.py')

In [43]:
from concert_pipeline_classifier import exported_pipeline as pipeline

In [45]:
pipeline.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('featureunion',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('functiontransformer',
                                                 FunctionTransformer(accept_sparse=False,
                                                                     check_inverse=True,
                                                                     func=<function copy at 0x1008e6dd0>,
                                                                     inv_kw_args=None,
                                                                     inverse_func=None,
                                                                     kw_args=None,
                                                                     pass_y='deprecated',
                                                                     validate=None)),
                                                ('stackingestimator',
                                         

In [46]:
results = pipeline.predict(X_test)



In [47]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [48]:
print(classification_report(results,y_test))

              precision    recall  f1-score   support

           0       0.87      0.79      0.83       924
           1       0.69      0.80      0.74       536

    accuracy                           0.79      1460
   macro avg       0.78      0.80      0.78      1460
weighted avg       0.80      0.79      0.80      1460



In [50]:
confusion_matrix(results,y_test)

array([[728, 196],
       [106, 430]])

In [51]:
from xgboost import XGBClassifier

In [52]:
xg = XGBClassifier(n_jobs=-1,max_depth=10, n_estimators=5000)

In [53]:
xg.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, n_estimators=5000, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [56]:
xg_pred = xg.predict(X_test)

In [57]:
print(classification_report(xg_pred,y_test))

              precision    recall  f1-score   support

           0       0.84      0.80      0.82       878
           1       0.72      0.77      0.75       582

    accuracy                           0.79      1460
   macro avg       0.78      0.79      0.78      1460
weighted avg       0.79      0.79      0.79      1460



In [58]:
confusion_matrix(xg_pred,y_test)

array([[703, 175],
       [131, 451]])

In [60]:
accuracy_score(xg_pred,y_test)

0.7904109589041096