In [1]:
import pandas as pd
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

In [2]:
# Relative location of the pickled DataFrame this notebook analyses.
path = 'Pickles/output_df.pkl'
# NOTE(review): unpickling can execute arbitrary code -- only load pickles from a trusted source.
df = pd.read_pickle(path)

## Table of Contents
    1. Continuous Features (Spotify Info, Ticket Listings, Time Info, Artist Count) Regression Analysis (Pearson & Spearman Correlations)
    2. Categorical Features Analysis (TTest, ANOVA, Tukey HSD)
        2.1 Genre
        2.2 Subgenre
        2.3 Day of Week
        2.4 Promoter
        2.5 Ticket Source
        2.6 Venue State
    
### Notes: 
    1. All null hypotheses are accepted/rejected at the $\alpha = .05$ significance level for two-tailed p-values
    2. Samples in each category have unequal variances and sample sizes, suggesting the ANOVA F-test may not be the most ideal statistical tool. However, it is used for the sake of simplicity as the final product of this project is not inter-categorical differences.

In [3]:
# Numeric feature columns; each is correlated against min_markup% in section 1.
continuous = [
    'avg_ticket_listings',
    'spotify_avg_followers',
    'spotify_avg_popularity',
    'presale_length',
    'days_on_sale',
    'days_until_show',
    'artist_count',
]
# Names of the categorical feature columns.
categorical = [
    'genre',
    'subGenre',
    'day_of_week',
    'promoter',
    'min_source',
]

## 1. Correlations between Continuous Features & Min Markup
I will calculate Pearson R and Spearman R values because, as previously observed, the correlations may not necessarily be linear. <br><br>
$H_0 : r = 0$ There is no statistically significant correlation between each continuous variable and ticket minimum markup<br>
$H_a : r \neq 0$ There is a statistically significant correlation between each continuous variable and ticket minimum markup

In [4]:
# Correlate every continuous feature with min_markup% in a single pass:
# Pearson for linear association, Spearman for rank association.
pearson_r, pearson_p = [], []
spearman_r, spearman_p = [], []
for col in continuous:
    r, p = stats.pearsonr(df[col], df['min_markup%'])
    pearson_r.append(r)
    pearson_p.append(p)

    r, p = stats.spearmanr(df[col], df['min_markup%'])
    spearman_r.append(r)
    spearman_p.append(p)

# Collect the coefficients and p-values, one row per feature
r_df = pd.DataFrame({
    'Column': continuous,
    'PearsonR': pearson_r,
    'PearsonR_pvalue': pearson_p,
    'SpearmanR': spearman_r,
    'SpearmanR_pvalue': spearman_p,
})

# Label each test at the alpha = .05 level ('Reject' only when p < .05)
r_df['Pearson_NullHypothesis'] = r_df['PearsonR_pvalue'].lt(.05).map({True: 'Reject', False: 'Accept'})
r_df['Spearman_NullHypothesis'] = r_df['SpearmanR_pvalue'].lt(.05).map({True: 'Reject', False: 'Accept'})

# Put each verdict next to the statistics it belongs to
r_df = r_df[[
    'Column',
    'PearsonR', 'PearsonR_pvalue', 'Pearson_NullHypothesis',
    'SpearmanR', 'SpearmanR_pvalue', 'Spearman_NullHypothesis',
]]

r_df

Unnamed: 0,Column,PearsonR,PearsonR_pvalue,Pearson_NullHypothesis,SpearmanR,SpearmanR_pvalue,Spearman_NullHypothesis
0,avg_ticket_listings,-0.094553,6.372505e-09,Reject,-0.199221,6.176388e-35,Reject
1,spotify_avg_followers,0.049878,0.00222745,Reject,-0.092131,1.537915e-08,Reject
2,spotify_avg_popularity,0.049662,0.002327623,Reject,-0.091791,1.737195e-08,Reject
3,presale_length,-0.077672,1.874547e-06,Reject,-0.180006,9.953438e-29,Reject
4,days_on_sale,-0.045595,0.005186303,Reject,-0.167394,5.163824e-25,Reject
5,days_until_show,-0.073715,6.098756e-06,Reject,0.013097,0.4222535,Accept
6,artist_count,0.010766,0.5094595,Accept,0.010364,0.525406,Accept


## Conclusions

All Pearson R values are statistically significant excluding Artist count. All Spearman R values are statistically significant excluding Days until show and Artist count. This suggests that artist count, and potentially Days until Show, are not relevant factors.

Excluding Days Until Show and Artist Count, all Spearman R values are larger in magnitude than the Pearson values. This indicates that there are in fact associations in the movements between features, but that the movements aren't linear. This corroborates the ECDF visualisations, which suggested a logarithmic association between markup and the features. 

## 2. Correlations between Categorical Features & Min Markup


#### Define functions

In [5]:
#Define function for Welch's ttest comparing each category of a feature to that of overall markup
def t_test(input_df, alpha=.05, reference=None):
    """Welch's t-test of every column of ``input_df`` against a reference sample.

    Parameters
    ----------
    input_df : pandas.DataFrame
        One column per category; NaN entries are ignored (``nan_policy='omit'``).
    alpha : float, default .05
        Significance level used to label each test.
    reference : array-like, optional
        Sample each column is compared with. When omitted, the notebook-level
        ``df['min_markup%']`` is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``input_df`` with 'Categories', 'T Stat',
        'P Value' and 'NullHypothesis' ('Reject' when P Value < alpha,
        otherwise 'Accept').

    Notes
    -----
    In this notebook the category frames are pivoted out of the same
    ``df['min_markup%']`` that serves as the default reference, so each
    category is also part of the sample it is compared with.
    """
    # Fall back to the notebook's overall markup only when no reference is supplied,
    # so the function no longer depends on hidden global state by necessity.
    if reference is None:
        reference = df['min_markup%']

    t_list = list()
    p_list = list()
    for col in input_df.columns:
        # equal_var=False selects Welch's test (unequal variances)
        tvalue, pvalue = stats.ttest_ind(input_df[col], reference, equal_var=False, nan_policy='omit')
        t_list.append(tvalue)
        p_list.append(pvalue)
    # Create a dataframe of the categories and their respective T statistics, and P Values
    ttest_df = pd.DataFrame({'Categories' : input_df.columns,
                                   'T Stat' : t_list,
                                   'P Value': p_list})
    # Accept or reject the null hypothesis at the given alpha significance level
    ttest_df['NullHypothesis'] = ttest_df['P Value'].map(lambda x: 'Reject' if x<alpha else 'Accept')
    return ttest_df

#Define a function for Tukey HSD multicomparison
def MCTukey(category):
    """Print the Tukey HSD pairwise comparison of min_markup% across one feature.

    ``category`` names the column of the notebook-level ``df`` whose values
    define the groups. The summary is printed; nothing is returned.
    """
    comparison = MultiComparison(df['min_markup%'], df[category])
    print(comparison.tukeyhsd())

## 2.1 Genre

In [6]:
# Spread min_markup% into one column per genre (rows keep df's index), then summarise each column
genre_df = df.pivot(columns='genre', values='min_markup%')
genre_df.describe()

genre,Blues,Country,Dance/Electronic,Folk,Hip-Hop/Rap,Jazz,Metal,Other,Pop,R&B,Religious,Rock,Undefined,World
count,76.0,296.0,133.0,68.0,172.0,97.0,111.0,122.0,258.0,266.0,84.0,1511.0,361.0,202.0
mean,185.209334,179.823589,120.306847,203.602444,148.520164,130.704343,150.435346,168.526471,76.977162,145.138177,121.274377,207.432196,116.82925,101.245218
std,85.960636,476.529504,109.885481,279.84442,151.323897,76.338896,218.027912,149.22736,73.596679,128.726938,67.698992,441.677863,133.306952,71.83423
min,58.8,-72.537313,-32.2,13.333333,-6.177215,-9.963636,-5.276995,-8.108108,-43.396226,9.539326,-18.0,-70.446927,-2.857143,-70.0
25%,114.009286,74.034351,69.929577,86.022727,83.357092,88.888889,87.706522,93.679487,18.42296,81.818182,101.666667,79.868058,73.996176,62.935897
50%,162.360447,98.010473,93.7,105.263158,114.642857,100.6,110.75,123.45805,61.902216,117.00624,104.87234,106.372881,85.719225,89.337121
75%,286.666667,143.977273,121.892857,154.160792,167.118333,142.6,137.297895,200.9375,99.577465,154.330339,136.090909,159.400833,123.220339,109.659994
max,356.54321,6579.0,840.0,1676.923077,1440.0,373.287671,2210.0,1218.872727,425.0,1109.102041,583.04,4703.333333,1488.0,442.857143


### T-Test
Test if each genre's min markup is statistically different from that of the mean

$H_0 : \mu_{\bar{x}} = \mu_g$ There is no statistically significant difference between each category and the overall ticket minimum markup.<br>
$H_a : \mu_{\bar{x}} \neq \mu_g$ There is a statistically significant difference between each category and the overall ticket minimum markup.

In [7]:
# Welch's t-test of each genre column against the overall min_markup% (default alpha = .05)
ttest_genre_df = t_test(genre_df)
ttest_genre_df

Unnamed: 0,Categories,P Value,T Stat,NullHypothesis
0,Blues,0.06478643,1.863134,Accept
1,Country,0.5829145,0.549696,Accept
2,Dance/Electronic,7.626481e-05,-4.029536,Reject
3,Folk,0.2566926,1.143535,Accept
4,Hip-Hop/Rap,0.2151789,-1.242582,Accept
5,Jazz,0.0004413063,-3.571404,Reject
6,Metal,0.5171583,-0.649574,Accept
7,Other,0.7724175,0.289699,Accept
8,Pop,1.764594e-33,-12.414208,Reject
9,R&B,0.04461858,-2.012797,Reject


### T-Test Results:
The difference between each genre's markup and the markup mean is statistically significant at the $\alpha$ = .05 level for Dance/Electronic, Jazz, Pop, R&B, Religious, Rock, Undefined, and World.

### ANOVA

For each categorical feature, an ANOVA test will be used to determine the statistical significance of each category on ticket minimum markups

$H_0 : \mu_1 = \mu_2 = \mu_3 ...$ There is no statistically significant difference between each category's minimum ticket markup<br>
$H_a : \mu_i \neq \mu_j$ for at least one pair $i, j$. There is a statistically significant difference between at least two categories' minimum ticket markups

In [8]:
# One-way ANOVA across every genre column, dropping missing values per column.
# Iterating over genre_df.columns (instead of hard-coding each genre name) keeps
# the test in sync with whatever categories the pivot produced.
stats.f_oneway(*(genre_df[genre].dropna() for genre in genre_df.columns))

F_onewayResult(statistic=5.315216039384361, pvalue=1.4712118660348582e-09)

In [9]:
# Tukey HSD pairwise comparison of min_markup% between genres (prints the summary table)
MCTukey('genre')

          Multiple Comparison of Means - Tukey HSD,FWER=0.05         
     group1           group2       meandiff   lower    upper   reject
---------------------------------------------------------------------
     Blues           Country       -5.3857  -145.5703 134.7988 False 
     Blues       Dance/Electronic  -64.9025 -221.6576 91.8526  False 
     Blues             Folk        18.3931  -163.5775 200.3637 False 
     Blues         Hip-Hop/Rap     -36.6892 -186.8429 113.4645 False 
     Blues             Jazz        -54.505  -221.5031 112.4931 False 
     Blues            Metal        -34.774  -197.0797 127.5317 False 
     Blues            Other        -16.6829  -175.987 142.6213 False 
     Blues             Pop        -108.2322 -250.5102 34.0458  False 
     Blues             R&B         -40.0712 -181.8615 101.7192 False 
     Blues          Religious      -63.935  -236.5167 108.6467 False 
     Blues             Rock        22.2229  -105.9307 150.3764 False 
     Blues          

Pop & Country, Pop & Rock, Rock & Undefined, Rock & World have significantly different means, the rest of the means do not have significant differences.

## 2.2 SubGenre

### T Test

In [10]:
# Spread min_markup% into one column per subgenre (rows keep df's index), then summarise each column
subgenre_df = df.pivot(columns='subGenre', values='min_markup%')
subgenre_df.describe()

subGenre,Adult Contemporary,Alternative Rock,Blues,Club Dance,Country,Folk,Gospel,Heavy Metal,Jazz,Latin,Other,Pop,R&B,Soul,Undefined,Urban,World
count,81.0,426.0,76.0,117.0,278.0,65.0,75.0,107.0,97.0,125.0,278.0,1198.0,172.0,80.0,361.0,150.0,71.0
mean,117.300024,160.195889,185.209334,124.123362,180.968324,207.438043,118.388102,151.29349,130.704343,94.263078,150.90374,206.121614,137.830055,161.681964,116.82925,151.460394,102.218419
std,50.638642,167.388309,85.960636,116.105926,490.63628,285.684126,69.18274,222.030101,76.338896,69.488109,131.772998,488.147304,146.321178,77.32532,133.306952,157.563337,55.400666
min,17.168142,-68.0,58.8,-32.2,-72.537313,13.333333,-18.0,-5.276995,-9.963636,-22.984597,-8.108108,-70.446927,9.539326,22.773109,-2.857143,-6.177215,-70.0
25%,99.577465,81.219737,114.009286,69.929577,72.096963,86.363636,101.333333,86.556452,88.888889,54.94382,88.964646,67.307191,75.416667,125.0,73.996176,84.068554,78.888889
50%,99.577465,110.084746,162.360447,93.7,97.851478,105.263158,104.553191,109.090909,100.6,80.831933,109.090909,98.72381,90.151515,135.0,85.719225,115.333333,96.939891
75%,126.728972,166.761966,286.666667,123.08,143.293004,156.643167,119.162905,137.297895,142.6,104.216867,163.566667,144.835,134.070513,187.05,123.220339,173.343333,111.551224
max,391.891892,1333.333333,356.54321,840.0,6579.0,1676.923077,583.04,2210.0,373.287671,414.762516,1218.872727,4703.333333,1109.102041,600.0,1488.0,1440.0,395.533333


In [11]:
# Welch's t-test of each subgenre column against the overall min_markup% (default alpha = .05)
subgenre_ttest_df = t_test(subgenre_df)
subgenre_ttest_df

Unnamed: 0,Categories,P Value,T Stat,NullHypothesis
0,Adult Contemporary,4.273313e-09,-6.061783,Reject
1,Alternative Rock,0.6712812,-0.424537,Accept
2,Blues,0.06478643,1.863134,Accept
3,Club Dance,0.0009751105,-3.352771,Reject
4,Country,0.5781262,0.55674,Accept
5,Folk,0.2330972,1.203304,Accept
6,Gospel,4.051981e-06,-4.780173,Reject
7,Heavy Metal,0.5570728,-0.588851,Accept
8,Jazz,0.0004413063,-3.571404,Reject
9,Latin,3.343549e-16,-8.550041,Reject


### Results:
Reject the null hypothesis for Adult Contemporary, Club Dance, Gospel, Jazz, Latin, Pop, R&B, Undefined, and World subgenres.
### ANOVA

In [12]:
# One-way ANOVA across every subgenre column, dropping missing values per column.
# Iterating over subgenre_df.columns (instead of hard-coding each name) keeps
# the test in sync with whatever categories the pivot produced.
stats.f_oneway(*(subgenre_df[subgenre].dropna() for subgenre in subgenre_df.columns))

F_onewayResult(statistic=2.8727032334771003, pvalue=0.00010571892638891329)

In [13]:
# Tukey HSD pairwise comparison of min_markup% between subgenres (prints the summary table)
MCTukey('subGenre')

           Multiple Comparison of Means - Tukey HSD,FWER=0.05          
      group1            group2       meandiff   lower    upper   reject
-----------------------------------------------------------------------
Adult Contemporary Alternative Rock  42.8959   -93.8504 179.6421 False 
Adult Contemporary      Blues        67.9093  -112.2511 248.0697 False 
Adult Contemporary    Club Dance      6.8233  -156.2398 169.8865 False 
Adult Contemporary     Country       63.6683   -78.7747 206.1113 False 
Adult Contemporary       Folk         90.138   -97.7227 277.9987 False 
Adult Contemporary      Gospel        1.0881  -179.6909 181.867  False 
Adult Contemporary   Heavy Metal     33.9935  -132.1576 200.1446 False 
Adult Contemporary       Jazz        13.4043  -156.3967 183.2054 False 
Adult Contemporary      Latin        -23.0369 -183.9513 137.8774 False 
Adult Contemporary      Other        33.6037  -108.8393 176.0467 False 
Adult Contemporary       Pop         88.8216   -40.6943 218.3375

### Results
Latin & Pop, Pop & Soul have significantly different means at the $\alpha = .05$ significance level.

## 2.3. Day of the Week
Perform T-tests and ANOVA on markups vs days of the week

In [14]:
# Spread min_markup% into one column per day_of_week value (rows keep df's index), then summarise each column
day_df = df.pivot(columns='day_of_week', values='min_markup%')
day_df.describe()

day_of_week,0,1,2,3,4,5,6
count,351.0,198.0,328.0,430.0,561.0,959.0,930.0
mean,127.26886,127.951204,192.360805,164.204613,189.828697,167.090955,157.95753
std,132.007626,130.447346,411.685694,329.670099,463.586576,325.029789,270.099024
min,-72.537313,17.828571,-28.019802,-70.0,-68.0,-70.446927,-60.0
25%,69.722222,64.594937,80.0,73.996176,73.996176,79.5566,78.59011
50%,96.389831,95.3,101.497561,99.577465,97.885714,105.2,103.578182
75%,146.675439,122.244949,160.750836,146.623333,149.92,148.152542,143.013175
max,1676.923077,920.338983,3901.733333,2943.32,6579.0,4500.0,3223.333333


### T Test

In [15]:
# Welch's t-test of each day_of_week column against the overall min_markup% (default alpha = .05)
day_ttest_df = t_test(day_df)
day_ttest_df

Unnamed: 0,Categories,P Value,T Stat,NullHypothesis
0,0,3.1e-05,-4.190914,Reject
1,1,0.000754,-3.399404,Reject
2,2,0.230543,1.200971,Accept
3,3,0.994609,-0.00676,Accept
4,4,0.209048,1.257444,Accept
5,5,0.813866,0.235484,Accept
6,6,0.538616,-0.615033,Accept


### Results

The null hypothesis is rejected for Sunday and Monday, whose min markups are statistically significantly different from (below) the mean. For Tuesday through Saturday, I fail to reject the null hypothesis.

### ANOVA

In [16]:
# One-way ANOVA across every day_of_week column, dropping missing values per column.
# Iterating over day_df.columns (instead of hard-coding 0..6) keeps the test in
# sync with whatever categories the pivot produced.
stats.f_oneway(*(day_df[day].dropna() for day in day_df.columns))

F_onewayResult(statistic=2.2012121035066947, pvalue=0.040099700414595839)

In [17]:
# Tukey HSD pairwise comparison of min_markup% between days of the week (prints the summary table)
MCTukey('day_of_week')

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff  lower    upper   reject
-----------------------------------------------
  0      1     0.6823  -85.0354   86.4   False 
  0      2    65.0919  -8.9734  139.1573 False 
  0      3    36.9358  -32.4401 106.3116 False 
  0      4    62.5598  -3.0748  128.1944 False 
  0      5    39.8221  -20.3428  99.987  False 
  0      6    30.6887  -29.7271 91.1044  False 
  1      2    64.4096  -22.3852 151.2044 False 
  1      3    36.2534  -46.5758 119.0826 False 
  1      4    61.8775  -17.8444 141.5993 False 
  1      5    39.1398  -36.143  114.4225 False 
  1      6    30.0063  -45.477  105.4896 False 
  2      3    -28.1562 -98.8586 42.5462  False 
  2      4    -2.5321  -69.5673 64.5031  False 
  2      5    -25.2698 -86.9597  36.42   False 
  2      6    -34.4033 -96.3377 27.5312  False 
  3      4    25.6241  -36.1906 87.4388  False 
  3      5     2.8863  -53.0866 58.8593  False 
  3      6    -6.2471  -62.4895 49.99

### Results
Markups over different days of the week are not statistically significantly different from each other.

## 2.4 Promoter
Compare min markups between the mean and different promoters

In [18]:
# Spread min_markup% into one column per promoter (rows keep df's index), then summarise each column
promoter_df = df.pivot(columns='promoter', values='min_markup%')
promoter_df.describe()

promoter,AEG LIVE,CROSSROADS PRESENTS,FRANK PRODUCTIONS,HOUSE OF BLUES CONCERTS (HOB),LIVE NATION MUSIC,MASQUERADE,Other,PROMO WEST,PROMOTED BY VENUE
count,46.0,68.0,30.0,241.0,1025.0,50.0,263.0,84.0,1950.0
mean,84.502143,125.560061,118.787754,143.206437,130.760679,441.074746,131.368594,152.939228,186.338886
std,66.601313,95.075216,122.797529,166.040971,128.373196,450.354205,157.545472,118.774049,425.643091
min,-45.896907,16.260163,-12.974684,-21.621622,-45.454545,73.913043,-70.446927,13.718231,-72.537313
25%,57.15018,92.72,66.829604,83.2,74.227848,108.272727,80.0,84.223238,73.996176
50%,80.156705,106.25,98.222222,107.728814,100.0,261.449275,96.939891,113.5,100.553922
75%,111.740662,119.969618,133.210438,143.902439,137.5,609.583333,129.948951,148.076923,153.401826
max,357.142857,686.5,693.52381,1488.0,1334.615385,2210.0,1676.923077,600.0,6579.0


### T Test

In [19]:
# Welch's t-test of each promoter column against the overall min_markup% (default alpha = .05)
promoter_ttest_df = t_test(promoter_df)
promoter_ttest_df

Unnamed: 0,Categories,P Value,T Stat,NullHypothesis
0,AEG LIVE,4.916276e-10,-7.140832,Reject
1,CROSSROADS PRESENTS,0.002934446,-3.050427,Reject
2,FRANK PRODUCTIONS,0.05677725,-1.975572,Accept
3,HOUSE OF BLUES CONCERTS (HOB),0.07820644,-1.766044,Accept
4,LIVE NATION MUSIC,5.220407e-07,-5.025786,Reject
5,MASQUERADE,7.213618e-05,4.330205,Reject
6,Other,0.003116455,-2.9724,Reject
7,PROMO WEST,0.4185832,-0.81184,Accept
8,PROMOTED BY VENUE,0.04574907,1.998487,Reject


### Results
Most promoter categories have statistically significant differences from the overall mean.
The null hypothesis is rejected for AEG Live, Crossroad Presents, Live Nation, and 'Other' for having markups significantly below the mean, and Masquerade, and 'Promoted by Venue' having markups significantly greater than the mean.

In [20]:
# One-way ANOVA across every promoter column, dropping missing values per column.
# Iterating over promoter_df.columns (instead of hard-coding each name) keeps
# the test in sync with whatever categories the pivot produced.
stats.f_oneway(*(promoter_df[promoter].dropna() for promoter in promoter_df.columns))

F_onewayResult(statistic=8.0451726793583678, pvalue=8.084233883556036e-11)

In [21]:
# Tukey HSD pairwise comparison of min_markup% between promoters (prints the summary table)
MCTukey('promoter')

                       Multiple Comparison of Means - Tukey HSD,FWER=0.05                       
            group1                        group2             meandiff   lower     upper   reject
------------------------------------------------------------------------------------------------
           AEG LIVE                CROSSROADS PRESENTS       41.0579  -151.3955  233.5113 False 
           AEG LIVE                 FRANK PRODUCTIONS        34.2856   -202.292  270.8633 False 
           AEG LIVE           HOUSE OF BLUES CONCERTS (HOB)  58.7043  -103.4991  220.9077 False 
           AEG LIVE                 LIVE NATION MUSIC        46.2585  -105.6773  198.1944 False 
           AEG LIVE                     MASQUERADE           356.5726  150.6148  562.5304  True 
           AEG LIVE                       Other              46.8665  -114.2459  207.9788 False 
           AEG LIVE                     PROMO WEST           68.4371  -116.4726  253.3467 False 
           AEG LIVE           

### Results
I fail to reject the null hypothesis for all pairings except those including Masquerade. Masquerade's minimum markups are statistically significantly higher than those of any other promoter category.

## 2.5 Ticket Source
Compare minimum markups between SeatGeek and Stubhub

In [22]:
# Spread min_markup% into one column per min_source value (rows keep df's index), then summarise each column
source_df = df.pivot(columns='min_source', values='min_markup%')
source_df.describe()

min_source,Both,SG,SH
count,1180.0,1136.0,1441.0
mean,130.007615,159.970547,195.841177
std,111.400801,219.581061,478.744363
min,-46.494898,-70.0,-72.537313
25%,78.708375,84.0,64.674157
50%,103.64596,106.25,96.939891
75%,145.157353,150.0,147.111111
max,1334.615385,4500.0,6579.0


### T test

In [23]:
# Welch's t-test of each min_source column against the overall min_markup% (default alpha = .05)
source_ttest_df = t_test(source_df)
source_ttest_df

Unnamed: 0,Categories,P Value,T Stat,NullHypothesis
0,Both,4.162424e-08,-5.492507,Reject
1,SG,0.6058023,-0.516133,Accept
2,SH,0.02145163,2.30177,Reject


### Results
Events with tickets only on Stubhub have significantly higher markups than those of the dataset. Events with tickets on both Stubhub and SeatGeek have significantly lower markups.

### ANOVA

In [24]:
# One-way ANOVA across every min_source column, dropping missing values per column.
# Iterating over source_df.columns (instead of hard-coding each name) keeps the
# test in sync with whatever categories the pivot produced.
stats.f_oneway(*(source_df[source].dropna() for source in source_df.columns))

F_onewayResult(statistic=13.35853429728107, pvalue=1.6558061577801399e-06)

In [25]:
# Tukey HSD pairwise comparison of min_markup% between ticket sources (prints the summary table)
MCTukey('min_source')

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff  lower   upper  reject
---------------------------------------------
 Both    SG   29.9629  -1.8258 61.7517 False 
 Both    SH   65.8336  35.8078 95.8594  True 
  SG     SH   35.8706   5.5268 66.2144  True 
---------------------------------------------


### Results
Markups are significantly different for tickets on both platforms vs. Stubhub, and for SeatGeek vs. Stubhub. 

The difference in markups between both platforms vs. SeatGeek is not statistically significant.
## 2.6 Venue State

In [26]:
# Spread min_markup% into one column per venue_state (rows keep df's index), then summarise each column
state_df = df.pivot(columns='venue_state', values='min_markup%')
state_df.describe()

venue_state,CA,FL,GA,IL,IN,KY,LA,MA,MD,ME,...,MO,NC,NJ,NV,NY,OH,PA,TN,TX,VA
count,294.0,254.0,154.0,99.0,114.0,69.0,60.0,167.0,65.0,69.0,...,92.0,146.0,152.0,462.0,609.0,246.0,172.0,89.0,146.0,98.0
mean,130.904461,125.456031,230.395338,133.34714,180.582879,101.120705,223.175604,140.426366,195.393694,153.654532,...,217.476806,140.032476,169.100445,76.333904,303.79606,145.578582,112.653423,188.206978,108.172882,95.813852
std,100.136836,130.578813,320.089274,123.927032,429.736722,108.683445,249.381975,135.216978,154.086126,118.278616,...,685.553448,133.787264,215.952233,70.692027,650.003247,111.606937,94.504527,253.095863,62.301806,47.495154
min,-50.266667,-46.494898,-34.736842,-56.977778,-45.896907,-12.974684,-4.528302,-29.72973,43.785714,47.945205,...,-55.485714,15.877778,-11.101695,11.057143,-60.0,11.111111,-70.446927,-72.537313,-36.610169,22.067039
25%,79.25,78.329534,84.687954,76.981857,75.459394,34.4375,88.204545,80.55,96.860465,80.0,...,80.077143,88.319328,79.385793,18.42296,90.0,87.136722,82.089161,82.531646,73.294737,74.170715
50%,105.55082,100.435505,104.625,121.485714,105.263158,79.974064,140.440128,104.714286,149.857143,106.285714,...,102.152727,108.807313,105.016393,73.996176,121.428571,112.166667,100.0,120.971429,101.88437,84.87395
75%,150.464286,130.706835,202.438312,157.248276,145.448767,126.2,270.191176,144.702985,239.424242,184.638889,...,158.905639,141.694841,175.214286,93.647784,186.5,154.718478,125.0,215.297619,131.934343,100.0
max,605.538462,1676.923077,2210.0,1065.0,4500.0,614.833333,1488.0,1297.142857,926.666667,600.0,...,6579.0,1140.4,1446.666667,642.244898,4703.333333,760.0,693.52381,1239.5,328.115942,373.697479


### T test

In [27]:
# Welch's t-test of each venue_state column against the overall min_markup% (default alpha = .05)
state_ttest_df = t_test(state_df)
state_ttest_df

Unnamed: 0,Categories,P Value,T Stat,NullHypothesis
0,CA,2.64891e-05,-4.222742,Reject
1,FL,8.09541e-05,-3.97391,Reject
2,GA,0.01307807,2.508604,Reject
3,IL,0.02381858,-2.285467,Reject
4,IN,0.6894434,0.400602,Accept
5,KY,2.197385e-05,-4.472137,Reject
6,LA,0.0761422,1.803525,Accept
7,MA,0.04296772,-2.03388,Reject
8,MD,0.1215894,1.566019,Accept
9,ME,0.48501,-0.701215,Accept


### Results
CA, FL, IL, KY, MA, MI, NC, NV, OH, PA, TX, and VA have markups significantly lower than the mean. GA, LA, and NY have markups significantly higher than the mean

### ANOVA

In [28]:
# One-way ANOVA across every venue_state column, dropping missing values per column.
# Iterating over state_df.columns (instead of hard-coding 21 state codes) keeps
# the test in sync with whatever categories the pivot produced.
stats.f_oneway(*(state_df[state].dropna() for state in state_df.columns))

F_onewayResult(statistic=9.5776874748861296, pvalue=3.6710584600058138e-29)

In [29]:
# Tukey HSD pairwise comparison of min_markup% between venue states (prints the summary table)
MCTukey('venue_state')

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2  meandiff   lower    upper   reject
-------------------------------------------------
  CA     FL    -5.4484   -103.369 92.4721  False 
  CA     GA    99.4909   -14.214  213.1958 False 
  CA     IL     2.4427  -130.3821 135.2675 False 
  CA     IN    49.6784   -76.4399 175.7967 False 
  CA     KY    -29.7838 -182.6914 123.1239 False 
  CA     LA    92.2711   -69.6586 254.2009 False 
  CA     MA     9.5219  -101.2406 120.2844 False 
  CA     MD    64.4892   -92.1827 221.1611 False 
  CA     ME    22.7501  -130.1576 175.6577 False 
  CA     MI     4.3172  -100.4558 109.0901 False 
  CA     MO    86.5723   -49.9803 223.125  False 
  CA     NC     9.128   -106.6032 124.8592 False 
  CA     NJ     38.196   -75.9988 152.3907 False 
  CA     NV    -54.5706 -139.8492 30.7081  False 
  CA     NY    172.8916  91.7141  254.0691  True 
  CA     OH    14.6741   -84.097  113.4452 False 
  CA     PA    -18.251   -127.982 91.4799  False 

### Results
CA & NY, FL & NY, GA & NV, IL & OH, IN & NY, KY & NY, MA & NY, ME & NY, MI & NY, MO & NV, NC & NY, NJ & NY, NV & NY, NY & OH, NY & PA, NY & TX, NY & VA have statistically different means. New York has significantly higher markups than most other states, while Nevada has significantly lower markups than most states.