In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error , r2_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn import linear_model
import pickle

df = pd.read_csv("SongPopularity_test_reg.csv")

In [2]:
# Split the data into train and test sets
X_test = df.drop(['Popularity'],axis=1)
y_test = df['Popularity']

In [3]:
# Preprocessing the data
def preprocess_data_test(d):
    # Drop unnecessary columns
    d.drop(['Song', 'Album', 'Album Release Date', 'Artist Names', 'Spotify Link', 'Song Image', 'Spotify URI'], axis=1, inplace=True)
    
    categorical_columns = d.select_dtypes(include=['object']).columns.tolist()
    with open('encoder.pkl', 'rb') as f:
            encoder = pickle.load(f)
    
    # Apply one-hot encoding to the categorical columns
    one_hot_encoded = encoder.fit_transform(d[categorical_columns])

    one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

    # Concatenate the one-hot encoded dataframe with the original dataframe
    df_encoded = pd.concat([d, one_hot_df], axis=1)

    # Drop the original categorical columns
    df_encoded = df_encoded.drop(categorical_columns, axis=1)

    return df_encoded

# Preprocess the test data
X_test = preprocess_data_test(X_test)

X_test

Unnamed: 0,Hot100 Ranking Year,Hot100 Rank,Song Length(ms),Acousticness,Danceability,Energy,Instrumentalness,Liveness,Loudness,Speechiness,...,"Artist(s) Genres_['viral pop', 'post-teen pop']","Artist(s) Genres_['vocal harmony group', 'deep adult standards']","Artist(s) Genres_['vocal harmony group', 'easy listening']","Artist(s) Genres_['vocal harmony group', 'swing', 'adult standards', 'torch song']",Artist(s) Genres_['vocal harmony group'],"Artist(s) Genres_['western swing', 'bluegrass gospel', 'country boogie', 'yodeling', 'country gospel']",Artist(s) Genres_['western swing'],"Artist(s) Genres_['yacht rock', 'soft rock', 'mellow gold', 'singer-songwriter', 'classic rock']",Artist(s) Genres_['yacht rock'],Artist(s) Genres_[]
0,2011,48,215693,0.01910,0.668,0.857,0.000007,0.0385,-2.944,0.0535,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2007,62,198207,0.00112,0.764,0.439,0.074800,0.2170,-9.475,0.0459,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1983,22,231346,0.01620,0.985,0.303,0.000024,0.0562,-15.505,0.3630,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1986,73,378946,0.31800,0.684,0.573,0.028000,0.3060,-12.300,0.0261,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2000,23,227706,0.48600,0.813,0.949,0.000010,0.1120,-2.563,0.0336,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
683,2007,64,239906,0.03740,0.162,0.786,0.000000,0.1430,-2.353,0.0380,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
684,1969,59,161066,0.20200,0.526,0.520,0.000329,0.1560,-8.918,0.0296,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
685,2008,37,338853,0.07100,0.375,0.862,0.000000,0.2110,-3.363,0.2550,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
686,2007,73,263920,0.03680,0.697,0.860,0.000000,0.0721,-5.181,0.4300,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
def feature_scaling_test(d):
    
    with open('scaler.pkl', 'rb') as f:
        scal = pickle.load(f)
    scaled_features = scal.fit_transform(d)
    df_scaled = pd.DataFrame(scaled_features, columns=d.columns)   
    d = df_scaled
    
    return d

# Apply feature scaling to the preprocessed data
X_test = feature_scaling_test(X_test)

# Display the first few rows of the scaled data
X_test

Unnamed: 0,Hot100 Ranking Year,Hot100 Rank,Song Length(ms),Acousticness,Danceability,Energy,Instrumentalness,Liveness,Loudness,Speechiness,...,"Artist(s) Genres_['viral pop', 'post-teen pop']","Artist(s) Genres_['vocal harmony group', 'deep adult standards']","Artist(s) Genres_['vocal harmony group', 'easy listening']","Artist(s) Genres_['vocal harmony group', 'swing', 'adult standards', 'torch song']",Artist(s) Genres_['vocal harmony group'],"Artist(s) Genres_['western swing', 'bluegrass gospel', 'country boogie', 'yodeling', 'country gospel']",Artist(s) Genres_['western swing'],"Artist(s) Genres_['yacht rock', 'soft rock', 'mellow gold', 'singer-songwriter', 'classic rock']",Artist(s) Genres_['yacht rock'],Artist(s) Genres_[]
0,1.169859,-0.009245,-0.148549,-0.954258,0.332556,1.233696,-0.266851,-0.907622,1.577283,-0.229736,...,-0.038152,-0.038152,-0.038152,-0.053995,-0.053995,-0.038152,-0.038152,-0.038152,-0.066178,-0.241788
1,0.973527,0.482756,-0.420390,-1.016388,0.939331,-0.751143,0.202420,0.222397,-0.164984,-0.329822,...,-0.038152,-0.038152,-0.038152,-0.053995,-0.053995,-0.038152,-0.038152,-0.038152,-0.066178,-0.241788
2,-0.204465,-0.922963,0.094796,-0.964279,2.336178,-1.396928,-0.266742,-0.795570,-1.773600,3.846133,...,-0.038152,-0.038152,-0.038152,-0.053995,-0.053995,-0.038152,-0.038152,-0.038152,-0.066178,-0.241788
3,-0.057216,0.869329,2.389421,0.078583,0.433685,-0.114855,-0.091214,0.785824,-0.918606,-0.590572,...,-0.038152,-0.038152,-0.038152,-0.053995,-0.053995,-0.038152,-0.038152,-0.038152,-0.066178,-0.241788
4,0.629946,-0.887820,0.038208,0.659102,1.249039,1.670550,-0.266832,-0.442320,1.678921,-0.491803,...,-0.038152,-0.038152,-0.038152,-0.053995,-0.053995,-0.038152,-0.038152,-0.038152,-0.066178,-0.241788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
683,0.973527,0.553042,0.227872,-0.891023,-2.865655,0.896558,-0.266893,-0.246071,1.734943,-0.433858,...,-0.038152,-0.038152,-0.038152,-0.053995,-0.053995,-0.038152,-0.038152,-0.038152,-0.066178,-0.241788
684,-0.891627,0.377327,-0.997793,-0.322252,-0.564966,-0.366521,-0.264829,-0.163772,-0.016394,-0.544480,...,-0.038152,-0.038152,-0.038152,-0.053995,-0.053995,-0.038152,-0.038152,-0.038152,-0.066178,-0.241788
685,1.022610,-0.395818,1.766125,-0.774919,-1.519373,1.257438,-0.266893,0.184413,1.465506,2.423859,...,-0.038152,-0.038152,-0.038152,-0.053995,-0.053995,-0.038152,-0.038152,-0.038152,-0.066178,-0.241788
686,0.973527,0.869329,0.601200,-0.893096,0.515853,1.247941,-0.266893,-0.694913,0.980521,4.728470,...,-0.038152,-0.038152,-0.038152,-0.053995,-0.053995,-0.038152,-0.038152,-0.038152,-0.066178,-0.241788


In [5]:
#It is the selected features from feature selection while in traning phase
selected_features =['Hot100 Ranking Year', 'Hot100 Rank', 'Song Length(ms)', 'Acousticness',
       'Danceability', 'Energy', 'Instrumentalness', 'Liveness', 'Loudness',
       'Speechiness', 'Mode', 'Time Signature',
       """Artist(s) Genres_["man's orchestra"]""",
       """Artist(s) Genres_['adult standards', 'easy listening']""",
       """Artist(s) Genres_['deep adult standards']""",
       """Artist(s) Genres_['karaoke']""", """Artist(s) Genres_['pop', 'dance pop']""",
       """Artist(s) Genres_['pop']""",
       """Artist(s) Genres_['swing', 'vaudeville', 'deep adult standards', 'british dance band']""",
       'Artist(s) Genres_[]']
X_test = X_test[selected_features]
X_test

KeyError: '["Artist(s) Genres_[\'swing\', \'vaudeville\', \'deep adult standards\', \'british dance band\']"] not in index'

In [None]:
# Load the trained Linear Regression model
with open('linear_regression_model.pkl', 'rb') as f:
    model_lr = pickle.load(f)

# Load the trained Random Forest Regressor model
with open('random_forest_regressor_model.pkl', 'rb') as f:
    rf_regressor = pickle.load(f)


In [None]:

# Predict using the Linear Regression model
y_pred_lr = model_lr.predict(X_test)

# Predict using the Random Forest Regressor model
y_pred_rf = rf_regressor.predict(X_test)

# Calculate MSE and R2 score for Linear Regression model
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Calculate MSE and R2 score for Random Forest Regressor model
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Linear Regression Model:")
print("MSE:", mse_lr)
print("R2 Score:", r2_lr)

print("Random Forest Regressor Model:")
print("MSE:", mse_rf)
print("R2 Score:", r2_rf)