In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import warnings
from numpy.linalg import matrix_rank, inv
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [None]:
%matplotlib inline
np.set_printoptions(precision=4)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
warnings.filterwarnings('ignore')

In [None]:
def pretty_print_linear(coefs, names=None, sort=False):
    """Format coefficients as a readable linear expression.

    Parameters
    ----------
    coefs : sequence of numbers
        Coefficients; each one is rounded to three decimals in the output.
    names : sequence of str, optional
        Term labels paired positionally with ``coefs``. When omitted, the
        labels ``X0``, ``X1``, ... are generated.
    sort : bool, default False
        When True, terms are ordered by decreasing absolute coefficient.

    Returns
    -------
    str
        Terms of the form ``"<coef> * <name>"`` joined by ``" + "``.
    """
    if names is None:
        names = ["X%s" % position for position in range(len(coefs))]

    terms = list(zip(coefs, names))
    if sort:
        # Negating the magnitude gives a descending order while keeping ties
        # in their original relative order.
        terms.sort(key=lambda term: -np.abs(term[0]))

    rendered = []
    for coefficient, label in terms:
        rendered.append("%s * %s" % (round(coefficient, 3), label))
    return " + ".join(rendered)

In [None]:
def plot_correlation(df, numerical_feature_columns, target, k, figsize=(10, 6)):
    """Draw an annotated heatmap of the columns most correlated with ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    numerical_feature_columns : list of str
        Columns to correlate; must include ``target``.
    target : str
        Column whose correlations rank the other columns.
    k : int
        Number of columns to keep, taken from the largest correlations with
        ``target`` (signed, not absolute).
    figsize : tuple of (float, float), default (10, 6)
        Size of the new figure in inches. The default matches the previously
        hard-coded size; pass a larger value when ``k`` is big enough that the
        cell annotations would overlap.

    Returns
    -------
    The object returned by ``sns.heatmap``.
    """
    feature_corr = df[numerical_feature_columns].corr()
    # nlargest ranks rows by their correlation with the target column.
    top_columns = feature_corr.nlargest(k, target)[target].index
    top_corr = df[top_columns].corr()

    # A fresh figure becomes the current one, so the heatmap is drawn onto it.
    plt.figure(figsize=figsize)
    return sns.heatmap(top_corr, annot=True, cmap='viridis')

In [None]:
def fit_ols(x, y, add_intercept=False):
    """Fit an ordinary least squares model and return its summary.

    Parameters
    ----------
    x : array-like or pandas.DataFrame
        Explanatory variables, passed to ``sm.OLS`` as the design matrix.
    y : array-like or pandas.Series
        Response variable.
    add_intercept : bool, default False
        ``sm.OLS`` fits exactly the columns it is given and does not add an
        intercept itself. Set this to True to add a constant column with
        ``sm.add_constant`` before fitting. The default of False keeps the
        previous behaviour, where ``x`` is used as-is.

    Returns
    -------
    The summary object produced by the fitted results' ``summary()``.
    """
    design = sm.add_constant(x) if add_intercept else x
    fitted = sm.OLS(y, design).fit()
    return fitted.summary()

In [None]:
def generate_vif(df, target):
    """Compute variance inflation factors for every column except ``target``.

    The non-target columns are passed through ``add_constant`` first, and a
    VIF is computed for each column of the resulting design matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the predictors and the target column.
    target : str
        Name of the column to leave out of the design matrix.

    Returns
    -------
    pandas.Series
        One VIF per design-matrix column, indexed by column name.
    """
    predictors = df.loc[:, df.columns != target]
    design = add_constant(predictors)
    design_values = design.values

    factors = []
    for position in range(design.shape[1]):
        factors.append(variance_inflation_factor(design_values, position))
    return pd.Series(factors, index=design.columns)

In [None]:
def generate_x_y_numeric_target(df, target):
    """Split a frame into feature and target arrays and list its numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.
    target : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(numeric_columns, X, Y)`` where ``numeric_columns`` is the list of
        numeric column names in ``df`` (the target included when it is
        numeric), ``X`` holds the values of every column other than ``target``
        and ``Y`` holds the values of the ``target`` column as a 2-D array.
        Note that ``X`` is built from all non-target columns, not only the
        numeric ones.
    """
    is_target = df.columns == target
    numeric_names = list(df._get_numeric_data().columns)
    feature_values = df.loc[:, ~is_target].values
    target_values = df.loc[:, is_target].values
    return numeric_names, feature_values, target_values

In [None]:
# Load the dataset from the working directory; the file is decoded with the
# 'unicode_escape' codec.
df = pd.read_csv(
    'DataNoDupXY.csv',
    encoding='unicode_escape',
)

In [None]:
# Response column used for the correlation overview.
target = 'AEU'

# Numeric column names plus raw feature/target arrays for the whole frame.
numeric_cols, X, Y = generate_x_y_numeric_target(df=df, target=target)

# Heatmap of the (up to) 29 numeric columns most correlated with the target.
plot_correlation(
    df=df,
    numerical_feature_columns=numeric_cols,
    target=target,
    k=29,
)

In [None]:
# Build the correlation matrix from the numeric columns only. Calling
# df.corr() on the whole frame raises in pandas >= 2.0 if any column is
# non-numeric, and in older versions the matrix could cover a different set of
# columns than the tick labels taken from select_dtypes.
numeric_df = df.select_dtypes(['number'])
corr_matrix = numeric_df.corr()

# Tick positions and labels both come from the matrix itself, so they always
# line up with what is drawn.
tick_positions = range(len(corr_matrix.columns))

f = plt.figure(figsize=(19, 15))
plt.matshow(corr_matrix, fignum=f.number)
plt.xticks(tick_positions, corr_matrix.columns, fontsize=15, rotation=45)
plt.yticks(tick_positions, corr_matrix.columns, fontsize=15)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)

In [None]:
# The independent variables set for the first model, and its response.
# (variance_inflation_factor is already imported in the first cell.)
X = df[[
    'AlbumCount', 'SpotifyPopularity', 'Duration', 'ChartWeeks',
    'SpotifyFollowers', 'StreamsPercentAEU', 'StreamsToSalesAEU',
    'ChartReleaseDiff', 'Valence', 'Danceability', 'Key', 'Loudness', 'Mode',
    'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Tempo',
]]
y = df['AEU']

# VIF dataframe.
# Use the notebook's generate_vif helper instead of a hand-rolled loop: the
# helper adds a constant column before computing the factors, whereas calling
# variance_inflation_factor on X alone runs each auxiliary regression without
# an intercept. The helper's entry for the constant itself is dropped so the
# table keeps one row per predictor.
vif_data = (
    generate_vif(df[list(X.columns) + ['AEU']], 'AEU')
    .drop('const', errors='ignore')
    .rename_axis("Independent Variable")
    .reset_index(name="VIF")
)

print(vif_data)

In [None]:
# Fit the first model on the predictors X and the response y defined in the
# previous cell. `y` is the named AEU Series; the upper-case `Y` used here
# before was the unnamed 2-D array built much earlier for the correlation
# overview, which tied this cell to that earlier state and dropped the
# response name from the summary.
print(fit_ols(X, y))

In [None]:
# The independent variables set for the second (reduced) model, and its
# response. (variance_inflation_factor is already imported in the first cell.)
X2 = df[[
    'Energy', 'AlbumCount', 'Duration', 'ChartWeeks', 'WeightedFollowers',
    'StreamsPercentAEU', 'ChartReleaseDiff', 'Speechiness', 'Acousticness',
    'Instrumentalness',
]]
y2 = df['AEU']

# VIF dataframe.
# Use the notebook's generate_vif helper instead of a hand-rolled loop: the
# helper adds a constant column before computing the factors, whereas calling
# variance_inflation_factor on X2 alone runs each auxiliary regression without
# an intercept. The helper's entry for the constant itself is dropped so the
# table keeps one row per predictor.
vif_data = (
    generate_vif(df[list(X2.columns) + ['AEU']], 'AEU')
    .drop('const', errors='ignore')
    .rename_axis("Independent Variable")
    .reset_index(name="VIF")
)

print(vif_data)

In [None]:
# Second model: regress y2 on the reduced predictor set X2 and print the
# fitted summary.
print(fit_ols(x=X2, y=y2))

In [None]:
# Third model: the same predictor columns as the second model, with AvgAEU as
# the response instead of AEU.
model_3_features = [
    'Energy', 'AlbumCount', 'Duration', 'ChartWeeks', 'WeightedFollowers',
    'StreamsPercentAEU', 'ChartReleaseDiff', 'Speechiness', 'Acousticness',
    'Instrumentalness',
]
X3 = df[model_3_features]
y3 = df['AvgAEU']

print(fit_ols(x=X3, y=y3))