# **Install and import libraries**

In [None]:
# Install AutoGluon.
# AutoGluon is used in AutoML.

!pip install -U pip
!pip install -U setuptools wheel

!pip install torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cpu

!pip install autogluon

Collecting pip
  Using cached pip-24.2-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-24.2-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.2
Collecting setuptools
  Using cached setuptools-72.1.0-py3-none-any.whl.metadata (6.6 kB)
Collecting wheel
  Using cached wheel-0.44.0-py3-none-any.whl.metadata (2.3 kB)
Using cached setuptools-72.1.0-py3-none-any.whl (2.3 MB)
Using cached wheel-0.44.0-py3-none-any.whl (67 kB)
Installing collected packages: wheel, setuptools
  Attempting uninstall: wheel
    Found existing installation: wheel 0.43.0
    Uninstalling wheel-0.43.0:
      Successfully uninstalled wheel-0.43.0
  Attempting uninstall: setuptools
    Found existing installation: setuptools 71.0.4
    Uninstalling setuptools-71.0.4:
      Successfully uninstalled setuptools-71.0.4
[31mERROR: pip's d

Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch==2.1.2
  Downloading https://download.pytorch.org/whl/cpu/torch-2.1.2%2Bcpu-cp310-cp310-linux_x86_64.whl (184.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.9/184.9 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.16.2
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.16.2%2Bcpu-cp310-cp310-linux_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch, torchvision
  Attempting uninstall: torch
    Found existing installation: torch 2.3.1+cu121
    Uninstalling torch-2.3.1+cu121:
      Successfully uninstalled torch-2.3.1+cu121
  Attempting uninstall: torchvision
    Found existing installation: torchvision 0.18.1+cu121
    Uninstalling torchvision-0.18.1+cu121:
      Successfully uninstalled torchvision-0.18.1+cu121
[31mERR

In [None]:
# import langdetect
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993222 sha256=43c762ff9f3589227a6f16e704ed3187b0f71c2a24932d4b285f59eb63159701
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [1]:
# Data manipulation libraries
import pandas as pd  # Library for data manipulation and analysis
import numpy as np   # Library for numerical operations

# Utility libraries
from collections import Counter                 # Library for counting hashable objects
from tqdm import tqdm                           # Library for progress bars
from langdetect import detect, DetectorFactory  # Libraries for language detection
import re                                       # Library for regular expressions

# Set seed for language detection to ensure reproducibility
DetectorFactory.seed = 0

# Scikit-learn libraries for model evaluation and data splitting
from sklearn.model_selection import StratifiedShuffleSplit # Class for stratified splitting of data into training and test sets
from sklearn.model_selection import train_test_split       # Function for splitting data into training and test sets
from sklearn.metrics import f1_score                       # Function for calculating the F1 score

# AutoGluon library for automated machine learning
from autogluon.tabular import TabularPredictor             # TabularPredictor class for tabular data predictions

# **Reading data**

In [2]:
# Read the raw training data and discard the 'Id' column, which is not used for training
df = pd.read_csv('train.csv')
df = df.drop(columns='Id')

# The label is 'Class'; every remaining column is a feature
y = df['Class']
X = df.drop(columns='Class')

# A single stratified shuffle split (20% held out) with a fixed seed
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(sss.split(X, y))

# Select the rows belonging to each side of the split
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]

# Re-attach the label so each split is one self-contained DataFrame
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

# **Data processing and feature engineering**

In [3]:
def preprocess_duration(df, minutes_threshold=100):
    """
    Normalise the mixed-unit 'duration_in min/ms' column and flag converted rows.

    Values less than or equal to ``minutes_threshold`` are kept unchanged (they are
    treated as already being in minutes). Every other value is treated as
    milliseconds and divided by 60000.

    Args:
    df (pd.DataFrame): The input dataframe containing the 'duration_in min/ms' column.
        It is modified in place.
    minutes_threshold (float): Largest value that is kept as-is. Defaults to 100.

    Returns:
    pd.DataFrame: The same dataframe object, with 'duration_in min/ms' rewritten and a
        new integer column 'new1' (0 = value kept, 1 = value divided by 60000).
    """
    ms_per_minute = 60000
    duration = df['duration_in min/ms']

    # A missing value compares False here, so it falls into the "converted" branch
    # (stays NaN, flagged 1) - the same outcome a per-row `<=` test produces.
    kept_as_is = duration <= minutes_threshold

    # Keep the value where the mask holds, otherwise convert ms -> minutes
    df['duration_in min/ms'] = duration.where(kept_as_is, duration / ms_per_minute)
    df['new1'] = (~kept_as_is).astype('int64')

    return df

# Normalise the duration column of both splits (training first, then test)
train, test = (preprocess_duration(split) for split in (train, test))

In [4]:
# Feature engineering functions

def count_special_character(string):
    """
    Count the characters of a string that are not digits.

    Note that letters, whitespace and punctuation all count as "special" here;
    only characters for which ``str.isdigit()`` is true are excluded.

    Args:
    string (str): The input string.

    Returns:
    int: How many characters in the string are non-digits.
    """
    return sum(1 for ch in string if not ch.isdigit())

def feature_engineering(df):
    """
    Add character- and word-count features derived from the name columns.

    Columns added (Artist Name / Track Name):
        new2  / new3  : total length in characters
        new4  / new5  : number of whitespace-separated words
        new6  / new9  : number of ASCII uppercase letters (A-Z)
        new7  / new10 : number of ASCII lowercase letters (a-z)
        new8  / new11 : number of ASCII digits (0-9)
        new12 / new13 : number of characters that are not ASCII letters

    All counts go through the pandas ``.str`` accessor, so a missing name yields
    NaN in the derived columns instead of raising.

    Args:
    df (pd.DataFrame): The input DataFrame with 'Artist Name' and 'Track Name' columns.

    Returns:
    None: The DataFrame is modified in place.
    """
    print("Adding new features")

    artist = df['Artist Name']
    track = df['Track Name']

    # Total length of each name
    df['new2'] = artist.str.len()
    df['new3'] = track.str.len()

    # Number of whitespace-separated words in each name
    df['new4'] = artist.str.split().str.len()
    df['new5'] = track.str.split().str.len()

    # Uppercase / lowercase / digit counts for 'Artist Name'
    df['new6'] = artist.str.findall(r'[A-Z]').str.len()
    df['new7'] = artist.str.findall(r'[a-z]').str.len()
    df['new8'] = artist.str.findall(r'[0-9]').str.len()

    # Uppercase / lowercase / digit counts for 'Track Name'
    df['new9'] = track.str.findall(r'[A-Z]').str.len()
    df['new10'] = track.str.findall(r'[a-z]').str.len()
    df['new11'] = track.str.findall(r'[0-9]').str.len()

    # Characters that are not ASCII letters. Digits are NOT subtracted, so they are
    # included here along with spaces, punctuation and non-ASCII characters.
    df['new12'] = df['new2'] - (df['new6'] + df['new7'])
    df['new13'] = df['new3'] - (df['new9'] + df['new10'])



# Add the engineered name features to both splits (training first, then test)
for split in (train, test):
    feature_engineering(split)

Adding new features
Adding new features


In [5]:
# Text normalisation helper (lowercasing only)
def clean_text(text):
    """
    Return a lowercase copy of the given text.

    Args:
    text (str): The input text.

    Returns:
    str: The text converted to lowercase.
    """
    return text.lower()

# Lowercase 'Artist Name' and 'Track Name' in the training frame, then in the test frame
for frame in (train, test):
    for name_column in ('Artist Name', 'Track Name'):
        frame[name_column] = frame[name_column].apply(clean_text)

In [6]:
def _language_text(df):
    """
    Build the text passed to the language detector for every row.

    The artist and track names are joined (without a separator) and the pair is
    repeated three times.
    NOTE(review): the repetition presumably gives the detector more text for short
    names - confirm before changing it, since it affects the 'lang' feature.
    """
    pair = df['Artist Name'] + df['Track Name']
    return pair + pair + pair


def _detect_languages(texts, desc):
    """
    Run langdetect on each text and return the detected codes as a list.

    Detection is best-effort: any error raised for a row is recorded as 'err'.
    Only ``Exception`` is caught, so KeyboardInterrupt / SystemExit still stop
    this long-running loop.
    """
    codes = []
    for text in tqdm(texts, desc=desc):
        try:
            codes.append(detect(text))
        except Exception:
            codes.append('err')
    return codes


# Detect the language of each row; the helper text is never stored as a column
train_lang = _detect_languages(_language_text(train), "Detecting language for train dataset")
test_lang = _detect_languages(_language_text(test), "Detecting language for test dataset")

# Add detected language as a new column
train['lang'] = train_lang
test['lang'] = test_lang

Detecting language for train dataset: 100%|██████████| 11516/11516 [02:44<00:00, 69.93it/s]
Detecting language for test dataset: 100%|██████████| 2880/2880 [00:41<00:00, 68.90it/s]


# **Train model**

In [7]:
# Seed NumPy's global random generator before fitting
np.random.seed(42)

# Configure the predictor: 'Class' is the label, macro-averaged F1 is the metric to optimise
predictor = TabularPredictor(label='Class', eval_metric='f1_macro')

# Fit on the training split with the 'best_quality' preset and automatic stacking
predictor = predictor.fit(
    train_data=train,
    presets='best_quality',
    auto_stack=True,
)

No path specified. Models will be saved in: "AutogluonModels/ag-20240807_111547"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          2
Memory Avail:       10.66 GB / 12.67 GB (84.1%)
Disk Space Avail:   68.59 GB / 107.72 GB (63.7%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation

# **Evaluate Model**

In [8]:
# Name of the label column
target = 'Class'

# Predict labels for the held-out test split
test_predictions = predictor.predict(test)

# Macro-averaged F1 between the true test labels and the predictions
f1_macro_score = f1_score(
    y_true=test[target],
    y_pred=test_predictions,
    average='macro',
)

# Report the score
print(f"F1 Macro Score: {f1_macro_score}")

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



F1 Macro Score: 0.7296209643471957
