In [22]:
import pandas as pd
import statsmodels.api as sm
from scipy.stats import chi2

In [12]:
df = pd.read_csv("preprocessed_data.csv") 


In [13]:
df.columns

Index(['song_name', 'artist_name', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'year', 'country', 'language', 'style', 'gender',
       'main_singers', 'backing_dancers', 'backing_singers',
       'backing_instruments', 'semi_place', 'eng_lan', 'inc_eng_lan', 'Male',
       'Female', 'Pop', 'Rock', 'Ballad', 'Traditional', 'Dance', 'is_final'],
      dtype='object')

## Check significance of all countries

In [14]:
dummy_countries = pd.get_dummies(df['country'], prefix='country')
# Convert boolean values to integers (0/1)
dummy_countries = dummy_countries.astype(int)
df = pd.concat([df, dummy_countries], axis=1)

In [18]:
X = df[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'Pop', 'Rock', 'Ballad', 'Traditional', 'Dance',
        'main_singers', 'backing_dancers', 'backing_singers','backing_instruments',
        'eng_lan', 'inc_eng_lan',
        'country_Albania', 'country_Andorra',
       'country_Armenia', 'country_Australia', 'country_Austria',
       'country_Azerbaijan', 'country_Belarus', 'country_Belgium',
       'country_Bosnia and Herzegovina', 'country_Bulgaria', 'country_Croatia',
       'country_Cyprus', 'country_Czech Republic', 'country_Denmark',
       'country_Estonia', 'country_Finland', 
       'country_Georgia',  'country_Greece',
       'country_Hungary', 'country_Iceland', 'country_Ireland',
       'country_Israel',  'country_Latvia',
       'country_Lithuania', 'country_Malta', 'country_Moldova',
       'country_Montenegro', 'country_Netherlands', 'country_North Macedonia',
       'country_Norway', 'country_Poland', 'country_Portugal',
       'country_Romania', 'country_Russia', 'country_San Marino',
       'country_Serbia', 'country_Slovakia', 'country_Slovenia',
        'country_Sweden', 'country_Switzerland',
       'country_The Netherlands', 'country_Ukraine',
        'Male', 'Female']]
y = df['is_final']


X = sm.add_constant(X)
ur_model = sm.Logit(y, X)
ur_model = ur_model.fit()

print(ur_model.summary())

         Current function value: 0.540639
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:               is_final   No. Observations:                  483
Model:                          Logit   Df Residuals:                      416
Method:                           MLE   Df Model:                           66
Date:                Sat, 27 Apr 2024   Pseudo R-squ.:                  0.2062
Time:                        17:53:20   Log-Likelihood:                -261.13
converged:                      False   LL-Null:                       -328.94
Covariance Type:            nonrobust   LLR p-value:                 9.957e-07
                                     coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const                             36.8110   7.25e+06   5.07e-06      1.000   -1.42e+07    1.42e+07
dance



In [19]:
X = df[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'Pop', 'Rock', 'Ballad', 'Traditional', 'Dance',
        'main_singers', 'backing_dancers', 'backing_singers','backing_instruments',
        'eng_lan', 'inc_eng_lan',

        'Male', 'Female']]
y = df['is_final']


X = sm.add_constant(X)
r_model = sm.Logit(y, X)
r_model = r_model.fit()

print(r_model.summary())

         Current function value: 0.649603
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:               is_final   No. Observations:                  483
Model:                          Logit   Df Residuals:                      458
Method:                           MLE   Df Model:                           24
Date:                Sat, 27 Apr 2024   Pseudo R-squ.:                 0.04616
Time:                        17:53:56   Log-Likelihood:                -313.76
converged:                      False   LL-Null:                       -328.94
Covariance Type:            nonrobust   LLR p-value:                    0.1728
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  24.2172   1.11e+05      0.000      1.000   -2.18e+05    2.18e+05
danceability            0.4219      0.



In [21]:
LR = 2*(-261.13 + 313.76)
LR

105.25999999999999

In [26]:
degrees_freedom = len(ur_model.params) - len(r_model.params)
chi2_stat = chi2.ppf(0.95, degrees_freedom)

In [28]:
if LR > chi2_stat:
  print("Peject null hypothesis")
else:
  print("Do not reject null hypothesis")

Peject null hypothesis


Countries are significant.

## Check significance of years

In [29]:
dummy_years = pd.get_dummies(df['year'])
# Convert boolean values to integers (0/1)
dummy_years = dummy_years.astype(int)
df = pd.concat([df, dummy_years], axis=1)

In [30]:
X = df[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'main_singers', 'backing_dancers', 'backing_singers','backing_instruments',
        'eng_lan', 'inc_eng_lan',
        'Male', 'Female',  2009,  2010, 2011,2012,
                          2013,                    2014,
                          2015,                    2016,
                          2017,                    2018,
                          2019,                    2021,
                          2022,
        'Pop', 'Rock', 'Ballad', 'Traditional', 
        ]]
y = df['is_final']


X = sm.add_constant(X)
ur_model = sm.Logit(y, X)
ur_model = ur_model.fit()

print(ur_model.summary())

         Current function value: 0.646131
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:               is_final   No. Observations:                  483
Model:                          Logit   Df Residuals:                      446
Method:                           MLE   Df Model:                           36
Date:                Sat, 27 Apr 2024   Pseudo R-squ.:                 0.05126
Time:                        17:59:51   Log-Likelihood:                -312.08
converged:                      False   LL-Null:                       -328.94
Covariance Type:            nonrobust   LLR p-value:                    0.5773
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  20.2315   1.14e+04      0.002      0.999   -2.23e+04    2.23e+04
danceability            0.4134      0.



In [31]:
X = df[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'main_singers', 'backing_dancers', 'backing_singers','backing_instruments',
        'eng_lan', 'inc_eng_lan',
        'Male', 'Female',
        'Pop', 'Rock', 'Ballad', 'Traditional'
        ]]
y = df['is_final']


X = sm.add_constant(X)
r_model = sm.Logit(y, X)
r_model = r_model.fit()

print(r_model.summary())

         Current function value: 0.649703
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:               is_final   No. Observations:                  483
Model:                          Logit   Df Residuals:                      459
Method:                           MLE   Df Model:                           23
Date:                Sat, 27 Apr 2024   Pseudo R-squ.:                 0.04602
Time:                        18:00:27   Log-Likelihood:                -313.81
converged:                      False   LL-Null:                       -328.94
Covariance Type:            nonrobust   LLR p-value:                    0.1417
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  24.2140   1.04e+05      0.000      1.000   -2.03e+05    2.03e+05
danceability            0.4120      0.



In [32]:
LR_ur = -312.08
LR_r =  -313.81

LR = 2 * (LR_ur - LR_r)
LR

3.4600000000000364

In [33]:
q = len(ur_model.params) - len(r_model.params)
chi2_stat = chi2.ppf(0.95, q)

In [34]:
if LR > chi2_stat:
  print("Peject null hypothesis")
else:
  print("Do not reject null hypothesis")

Do not reject null hypothesis


Years are not significant.

## Testing genders

In [36]:
X = df[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'main_singers', 'backing_dancers', 'backing_singers','backing_instruments',
        'eng_lan', 'inc_eng_lan',
        'Male',  'Female',
        'Pop', 'Rock', 'Ballad', 'Traditional'
        ]]
y = df['is_final']


X = sm.add_constant(X)
ur_model = sm.Logit(y, X)
ur_model = ur_model.fit()

print(ur_model.summary())

Optimization terminated successfully.
         Current function value: 0.652287
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:               is_final   No. Observations:                  483
Model:                          Logit   Df Residuals:                      459
Method:                           MLE   Df Model:                           23
Date:                Sat, 27 Apr 2024   Pseudo R-squ.:                 0.04222
Time:                        18:06:43   Log-Likelihood:                -315.05
converged:                       True   LL-Null:                       -328.94
Covariance Type:            nonrobust   LLR p-value:                    0.2243
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   0.5520      1.219      0.453      0.651      -1.837       2.942
da

In [37]:
X = df[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'main_singers', 'backing_dancers', 'backing_singers','backing_instruments',
        'eng_lan', 'inc_eng_lan',
       
        'Pop', 'Rock', 'Ballad', 'Traditional'
        ]]
y = df['is_final']


X = sm.add_constant(X)
r_model = sm.Logit(y, X)
r_model = r_model.fit()

print(r_model.summary())

Optimization terminated successfully.
         Current function value: 0.654727
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:               is_final   No. Observations:                  483
Model:                          Logit   Df Residuals:                      461
Method:                           MLE   Df Model:                           21
Date:                Sat, 27 Apr 2024   Pseudo R-squ.:                 0.03864
Time:                        18:06:46   Log-Likelihood:                -316.23
converged:                       True   LL-Null:                       -328.94
Covariance Type:            nonrobust   LLR p-value:                    0.2294
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   0.8687      1.136      0.765      0.444      -1.358       3.095
da

In [38]:
LR_ur = -315.05
LR_r =   -316.23

LR = 2 * (LR_ur - LR_r)

q = len(ur_model.params) - len(r_model.params)
chi2_stat = chi2.ppf(0.95, q)

if LR > chi2_stat:
  print("Peject null hypothesis")
else:
  print("Do not reject null hypothesis")

Do not reject null hypothesis


Years are not significant

## Testing styles

In [39]:
X = df[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'main_singers', 'backing_dancers', 'backing_singers','backing_instruments',
        'eng_lan', 'inc_eng_lan',
        'Male',  'Female',
        'Pop', 'Rock', 'Ballad', 'Traditional'
        ]]
y = df['is_final']


X = sm.add_constant(X)
ur_model = sm.Logit(y, X)
ur_model = ur_model.fit()

print(ur_model.summary())

Optimization terminated successfully.
         Current function value: 0.652287
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:               is_final   No. Observations:                  483
Model:                          Logit   Df Residuals:                      459
Method:                           MLE   Df Model:                           23
Date:                Sat, 27 Apr 2024   Pseudo R-squ.:                 0.04222
Time:                        18:08:16   Log-Likelihood:                -315.05
converged:                       True   LL-Null:                       -328.94
Covariance Type:            nonrobust   LLR p-value:                    0.2243
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   0.5520      1.219      0.453      0.651      -1.837       2.942
da

In [40]:
X = df[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'main_singers', 'backing_dancers', 'backing_singers','backing_instruments',
        'eng_lan', 'inc_eng_lan',
        'Male',  'Female',
        ]]
y = df['is_final']


X = sm.add_constant(X)
r_model = sm.Logit(y, X)
r_model = r_model.fit()

print(r_model.summary())

Optimization terminated successfully.
         Current function value: 0.662336
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:               is_final   No. Observations:                  483
Model:                          Logit   Df Residuals:                      463
Method:                           MLE   Df Model:                           19
Date:                Sat, 27 Apr 2024   Pseudo R-squ.:                 0.02747
Time:                        18:08:28   Log-Likelihood:                -319.91
converged:                       True   LL-Null:                       -328.94
Covariance Type:            nonrobust   LLR p-value:                    0.5178
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   0.0130      1.098      0.012      0.991      -2.139       2.165
da

In [41]:
LR_ur =  -315.05
LR_r =    -319.91

LR = 2 * (LR_ur - LR_r)

q = len(ur_model.params) - len(r_model.params)
chi2_stat = chi2.ppf(0.95, q)

if LR > chi2_stat:
  print("Peject null hypothesis")
else:
  print("Do not reject null hypothesis")

Peject null hypothesis


Styles are significant