# Setting the Baseline 🧵

> A baseline is the result of a very basic model/solution. You usually create a baseline and then try to do more complex solutions to get a better result. If you can get a better score than the baseline, that's good.

I started by dividing the data into training and validation (X - predictor variables, y - objective variable) and then used `StratifiedKFold`, which separates the data for cross validation while preserving the percentage of samples in each class.

Before that, we need to restore the DataFrame from the Dataset Division point.

In [1]:
# Install Python libraries
!pip install pandas numpy seaborn matplotlib scikit-learn



In [2]:
# Import Python libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.patches as mpatches
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

# Import the dataset into a DataFrame
df = pd.read_csv("asset/dataset.csv")
# Remove duplicates
df = df.drop_duplicates()
# Remove null values
df = df.dropna()


# Function to strip whitespaces from object columns
def strip_whitespaces(x):
    if isinstance(x, str):
        return x.strip()
    else:
        return x


# Apply the function to all object columns
df = df.applymap(strip_whitespaces)
# Create popularity classs
# Selecting rows where 'popularity' is greater than or equal to 80
df[df["popularity"] >= 80]

# Defining conditions for the 'pop_class' column
conditionlist = [(df["popularity"] >= 80), (df["popularity"] < 80)]

# Assigning values based on conditions
choicelist = [1, 0]
df["pop_class"] = np.select(conditionlist, choicelist, default="Not Specified")

# Converting the 'pop_class' column to integer type
df["pop_class"] = df["pop_class"].astype(int)

# Remove useless columns
df = df.drop(columns=["popularity", "explicit"])

# Keep only quantitative columns that are important for the model

df_quantitative = df
cols_to_drop = []

for column in df:
    if df[column].dtype == "object":
        cols_to_drop.append(column)

df_quantitative = df.drop(columns=cols_to_drop)

df_quantitative.info()

# Normalizing the data, bringing it to the same scale
df_quantitative_nm = (df_quantitative - df_quantitative.min()) / (
    df_quantitative.max() - df_quantitative.min()
)

## Split the dataset for Training 🏋️‍♂️ and Testing ✔️
df_train, df_test = train_test_split(
    df_quantitative_nm, test_size=0.2, random_state=42, shuffle=True
)

# Visualizing the proportions of the target variable from the Training table
df_train["pop_class"].value_counts(normalize=True)

# Visualizing the proportions of the target variable from the Testing table
df_test["pop_class"].value_counts(normalize=True)

## Divide in tables `x` and `y`
X = df_train.drop("pop_class", axis=1)
y = df_train.pop_class

## Separate the data by maintaining the percentage of samples in each class 🔃
StratifKfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

## Loop to split in Training 🏋️‍♂️ and Validation ✔️ tables
for train_index, val_index in StratifKfold.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
## Checking the proportions of class 1 in the division 👍
print(f"Dimensions: {X_train.shape, X_val.shape, y_train.shape, y_val.shape}\n")
print(
    f"Proportion of df_train for class=1: {round(len(df_train[df_train.pop_class==1]) / df_train.shape[0], 4)}\n"
)
print(
    f"Proportion of X_train for class=1: {round(len(y_train[y_train==1]) / X_train.shape[0], 4)}"
)
print(
    f"Proportion of X_val for class=1: {round(len(y_val[y_val==1]) / X_val.shape[0], 4)}"
)

  df = df.applymap(strip_whitespaces)


<class 'pandas.core.frame.DataFrame'>
Index: 113999 entries, 0 to 113999
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        113999 non-null  int64  
 1   duration_ms       113999 non-null  int64  
 2   danceability      113999 non-null  float64
 3   energy            113999 non-null  float64
 4   key               113999 non-null  int64  
 5   loudness          113999 non-null  float64
 6   mode              113999 non-null  int64  
 7   speechiness       113999 non-null  float64
 8   acousticness      113999 non-null  float64
 9   instrumentalness  113999 non-null  float64
 10  liveness          113999 non-null  float64
 11  valence           113999 non-null  float64
 12  tempo             113999 non-null  float64
 13  time_signature    113999 non-null  int64  
 14  pop_class         113999 non-null  int32  
dtypes: float64(9), int32(1), int64(5)
memory usage: 13.5 MB
Dimensions: ((729

In [3]:
# Instantiating the model
logReg = LogisticRegression()

# Training the model
logReg.fit(X_train, y_train)

# Predicting on training data
y_pred_base_train = logReg.predict(X_train)

# Predicting on validation data
y_pred_base_val = logReg.predict(X_val)

NameError: name 'LogisticRegression' is not defined