In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line in walk order.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#Importing Dependencies

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import PoissonRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import StackingRegressor
from xgboost.sklearn import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import LinearSVR
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler

#Loading the datasets

We use the pandas library to load the train and test files into DataFrames. We will primarily work with the training dataset, stored as `train_df`; the test dataset, stored as `test_df`, is used to create the final popularity predictions.

In [None]:
# Locations of the competition's training and test CSV files.
TRAIN_CSV = "../input/cs9856-spotify-regression-problem-2024/CS98XRegressionTrain.csv"
TEST_CSV = "../input/cs9856-spotify-regression-problem-2024/CS98XRegressionTest.csv"

# Read both splits into DataFrames.
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

#Gathering Information about the data

We now begin `Exploratory Data Analysis (EDA)`, where we look for trends in the data that will help us choose an effective machine learning model.

`train_df.info()` shows how many non-null values each column contains and what its data type is, which tells us which columns are categorical and which are numerical. In this particular case we can observe some `Null` values in the `top genre` column, and that `title`, `artist` and `top genre` are categorical columns (their data type is `object`).

In [None]:
# Print the column summary, then display the row count for each
# distinct 'top genre' value as the cell's output.
train_df.info()
genre_counts = train_df['top genre'].value_counts()
genre_counts

#Visualizing the Data

Checking the skewed data and variance

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Draw 50-bin histograms of the training DataFrame on a 12x7 figure
# and render them.
train_df.hist(figsize=(12, 7), bins=50)
plt.show()

#Handling Null Values

To handle the null values we use `dropna()`, which removes every row that contains at least one null value.

In [None]:
# Keep only the rows that have no missing value in any column.
train_df = train_df.dropna(axis=0, how='any')

#Separate out the features and prediction column

We now put our data into a format that does not contain the columns we do not need for our model. We drop the `Id`, `title` and `pop` (popularity) columns and store the remaining feature columns in a separate variable called `X`. We also create a separate variable called `y`, which contains only the `pop` column of our DataFrame, i.e. the value we want to predict.

In [None]:
# Target: the popularity score.
y = train_df['pop']
# Features: every column except the identifier, the song title and the target.
X = train_df.drop(columns=['Id', 'title', 'pop'])

#Creating Column Transformers (PreProcessor)

For the next part of our project we use `ColumnTransformer` to create a pre-processor. We use `ColumnTransformer` because we want to apply different transformations to the numerical columns and to the categorical columns.

To explain the code a bit: at a high level it is divided into a numerical transformer and a categorical transformer. The categorical transformer uses `OneHotEncoder`, which converts each categorical column into one separate column per unique value; a row is assigned a 1 in the column that matches its value and a 0 in all the others. In other words, every unique category becomes its own column whose value is either 1 (present) or 0 (absent).

The numerical transformer performs the scaling of our numerical data, which brings every feature into a common range. It does not remove the outliers we have; rather, it brings all values into a workable range. We use `MinMaxScaler` for this operation. We chose it to improve accuracy, as `StandardScaler` did not meet our expectations during our EDA.



In [None]:
# Numeric columns: rescaled with MinMaxScaler.
numeric_features = [
    'year', 'bpm', 'nrgy', 'dnce', 'dB',
    'live', 'val', 'dur', 'acous', 'spch',
]
numeric_transformer = Pipeline([('scaler', MinMaxScaler())])

# Categorical columns: one-hot encoded. Categories that were not seen
# during fitting are ignored at transform time instead of raising an error.
categorical_features = ['top genre', 'artist']
categorical_transformer = Pipeline(
    [('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

#Creating the Model using Pipeline with the Preprocessor and Regressor

As we reach the final part of our project, we create our ML model and evaluate it by computing the Root Mean Squared Error (`RMSE`).

We have chosen the `GradientBoostingRegressor` to maximize our accuracy. It is an ensemble method that builds its predictors sequentially, each one fitted to correct the errors of the previous ones, to generate the predicted popularity scores for us.

We have calculated the RMSE value using the `mean_squared_error` function from the `sklearn.metrics` library.

In [None]:
# Full pipeline: the preprocessor (scaling + one-hot encoding) followed by
# the gradient boosting regressor. random_state is fixed so that the fitted
# model, the validation RMSE and the submission are reproducible across runs.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

# Hold out 20% of the training rows for validation (seeded split).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_val)

# RMSE = sqrt(MSE). Computed explicitly because the `squared=False` argument
# of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it fails on current versions.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

rmse

#Predicting Popularity In Test Data

Now we use the test data: we select the same feature columns from the test dataset and pass them to our trained model to obtain predictions. The predictions are saved in `predicted_pop_df`, which contains two columns: `Id`, and `pop`, which is the popularity predicted by our model.

In [None]:
# Test features: the same columns the model was trained on
# (identifier and song title removed).
X_test = test_df.drop(columns=['Id', 'title'])
predicted_pop = model.predict(X_test)

# Pair every test Id with its predicted popularity.
predicted_pop_df = pd.DataFrame({'Id': test_df['Id'], 'pop': predicted_pop})

# Show the first rows together with the frame's shape.
predicted_pop_df.head(), predicted_pop_df.shape

#Generating Submission CSV

A simple piece of code to generate a `.csv` file for the external submission.

In [None]:
# Write the predictions to the submission file, leaving out the index column.
predicted_pop_df.to_csv(path_or_buf="submission.csv", index=False)