In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for path in (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the competition's training CSV from the Kaggle input directory into a DataFrame.
train_data = pd.read_csv('/kaggle/input/cs9856-spotify-classification-problem-2024/CS98XClassificationTrain.csv')

In [None]:
# Display summary statistics for the training data.
train_data.describe()

In [None]:
# Print the column overview (dtypes and non-null counts) to spot missing values.
train_data.info()

In [None]:
# Fill missing genres with the placeholder "adult standards".
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: the in-place form works on an
# intermediate Series, emits a FutureWarning in pandas 2.x, and leaves
# train_data unchanged once Copy-on-Write is enabled.
train_data['top genre'] = train_data['top genre'].fillna("adult standards")

In [None]:
# Re-check the column overview after filling the missing genres.
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
train_data.info()

In [None]:
# Drop the 'title', 'Id' and 'artist' columns (axis=1 removes columns, not rows).
# NOTE: this reassigns train_data, so re-running the cell without reloading the
# data raises a KeyError because the columns are already gone.
train_data = train_data.drop(['title','Id','artist'],axis=1)


In [None]:
# correlation_matrix = train_data.corr()
# plt.figure(figsize=(10, 8))
# sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
# plt.title("Correlation Heatmap of Numeric Features")
# plt.show()

Plot the correlation matrix of the dataset's numeric columns and visualize it as a heatmap.

In [None]:
# Discard every row that still contains a missing value, then print the
# per-column count of remaining nulls as a sanity check.
train_data = train_data.dropna(axis=0, how='any')
print(train_data.isna().sum())

# Label Encoder
Label Encoder is used to convert non-integer values to integer values. Label encoding involves assigning a unique integer to each category in the variable.
Each unique genre in the 'top genre' column is assigned a unique integer label.

In [None]:
# Encode the genre names as integer class ids. The fitted encoder is kept in
# `label_encoder` so predicted ids can be mapped back to genre names.
label_encoder = LabelEncoder()
label_encoder.fit(train_data["top genre"])
train_data["top genre"] = label_encoder.transform(train_data["top genre"])

# Separate the target column from the feature columns.
y = train_data["top genre"]
X = train_data.drop("top genre", axis=1)

In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardization
StandardScaler is used for standardization: it rescales each feature to a mean of 0 and a standard deviation of 1, making features comparable and preventing those with larger scales from dominating the model's learning process.
The scaler is first fitted on the training data (X_train) only and then applied to both the training and testing data to avoid data leakage.

In [None]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transformation to both the training and the held-out split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Hyperparameter tuning
Hyperparameter tuning is a crucial step in the machine learning workflow in which you search for the optimal hyperparameters for your model. Hyperparameters are parameters that are set prior to the training process and are not learned from the data.

To perform hyperparameter tuning, the code can be enhanced with techniques such as **GridSearchCV**, which exhaustively searches over a specified parameter grid, evaluating the model's performance using cross-validation.


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score




# Fit a 200-tree random forest (max depth 5) on the scaled training split.
rf_classifier = RandomForestClassifier(
    n_estimators=200,
    max_depth=5,
    random_state=42,
).fit(X_train_scaled, y_train)

# Score the forest on the held-out split.
y_pred = rf_classifier.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)


In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier


# Bagging ensemble: 30 decision trees combined by a BaggingClassifier,
# trained on the scaled training split.
base_estimator = DecisionTreeClassifier(random_state=42)
bagging_classifier = BaggingClassifier(
    estimator=base_estimator,
    n_estimators=30,
    random_state=42,
).fit(X_train_scaled, y_train)

# Score the ensemble on the held-out split.
y_pred_bagging = bagging_classifier.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_bagging)
print("Accuracy:", accuracy)


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# AdaBoost over depth-1 decision trees (decision stumps), 100 boosting rounds,
# trained on the scaled training split.
base_estimator = DecisionTreeClassifier(max_depth=1, random_state=42)
adaboost_classifier = AdaBoostClassifier(
    estimator=base_estimator,
    n_estimators=100,
    random_state=42,
).fit(X_train_scaled, y_train)

# Score the boosted model on the held-out split.
y_pred_adaboost = adaboost_classifier.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_adaboost)
print("Accuracy:", accuracy)


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# RBF-kernel support vector classifier, trained on the scaled training split.
svm_classifier = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    random_state=42,
).fit(X_train_scaled, y_train)

# Score the SVM on the held-out split.
y_pred_svm = svm_classifier.predict(X_test_scaled)
accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
print("Accuracy:", accuracy_svm)

In [None]:
# best_params = grid_search.best_params_
# print("Best Parameters:", best_params)

In [None]:
from sklearn.metrics import accuracy_score


# Re-run the random forest on the held-out split and report its accuracy
# (this restores `y_pred` / `accuracy` to the random forest's values).
y_pred = rf_classifier.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
# Load the competition's test CSV from the Kaggle input directory and preview the first rows.
test_data =  pd.read_csv('/kaggle/input/cs9856-spotify-classification-problem-2024/CS98XClassificationTest.csv')
test_data.head()

In [None]:
# Drop the same 'title', 'Id' and 'artist' columns that were removed from the training data.
test_data = test_data.drop(['title','Id','artist'],axis=1)

In [None]:
# Print the column overview of the remaining test features.
test_data.info()

In [None]:
# Scale the test features with the scaler that was already fitted on X_train.
# Creating and fitting a new StandardScaler on the test set would standardize
# it with different means/variances than the ones the random forest was
# trained on, so the model would receive inconsistently scaled inputs.
test_data_scaled = scaler.transform(test_data)

# Predict the encoded genre id for every test row.
y_pred_sub = rf_classifier.predict(test_data_scaled)

# Iterative Refinement
Iterative refinement is a fundamental aspect of the machine learning workflow, especially in classification problems, where the goal is to classify data points into predefined categories or classes. It is a dynamic process that involves continuous improvement and adjustment of various components within the machine learning pipeline.

In [None]:
# Map the predicted integer class ids back to genre names using the fitted label encoder.
genre_labels = label_encoder.inverse_transform(y_pred_sub)

# Print the decoded genre predictions for a quick visual check.
print(genre_labels)

In [None]:
# Reload the raw test file: the 'Id' column was dropped from test_data earlier
# but is needed to build the submission.
test_data =  pd.read_csv('/kaggle/input/cs9856-spotify-classification-problem-2024/CS98XClassificationTest.csv')

In [None]:
# Pair every test 'Id' with its predicted genre name and write the result to
# submission.csv without the DataFrame index.
submission_df = pd.DataFrame({'Id': test_data['Id']}).assign(**{'top genre': genre_labels})
submission_df.to_csv('submission.csv', index=False)