# **Install and import libraries**

In [1]:
# Install AutoGluon.
# AutoGluon is used in AutoML.

!pip install -U pip
!pip install -U setuptools wheel

!pip install torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cpu

!pip install autogluon

Collecting pip
  Using cached pip-24.2-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-24.2-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.2
Collecting setuptools
  Using cached setuptools-72.1.0-py3-none-any.whl.metadata (6.6 kB)
Collecting wheel
  Using cached wheel-0.44.0-py3-none-any.whl.metadata (2.3 kB)
Using cached setuptools-72.1.0-py3-none-any.whl (2.3 MB)
Using cached wheel-0.44.0-py3-none-any.whl (67 kB)
Installing collected packages: wheel, setuptools
  Attempting uninstall: wheel
    Found existing installation: wheel 0.43.0
    Uninstalling wheel-0.43.0:
      Successfully uninstalled wheel-0.43.0
  Attempting uninstall: setuptools
    Found existing installation: setuptools 71.0.4
    Uninstalling setuptools-71.0.4:
      Successfully uninstalled setuptools-71.0.4
[31mERROR: pip's d

Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch==2.1.2
  Downloading https://download.pytorch.org/whl/cpu/torch-2.1.2%2Bcpu-cp310-cp310-linux_x86_64.whl (184.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.9/184.9 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.16.2
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.16.2%2Bcpu-cp310-cp310-linux_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch, torchvision
  Attempting uninstall: torch
    Found existing installation: torch 2.3.1+cu121
    Uninstalling torch-2.3.1+cu121:
      Successfully uninstalled torch-2.3.1+cu121
  Attempting uninstall: torchvision
    Found existing installation: torchvision 0.18.1+cu121
    Uninstalling torchvision-0.18.1+cu121:
      Successfully uninstalled torchvision-0.18.1+cu121
[31mERR

In [1]:
# import langdetect
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993222 sha256=6e213c068d275511554cde792f52809334dd3dc4f83493b7145766b82d6bfdc7
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [1]:
# Data manipulation libraries
import pandas as pd  # Library for data manipulation and analysis
import numpy as np   # Library for numerical operations

# Utility libraries
from collections import Counter                 # Library for counting hashable objects
from tqdm import tqdm                           # Library for progress bars
from langdetect import detect, DetectorFactory  # Libraries for language detection
import re                                       # Library for regular expressions

# Set seed for language detection to ensure reproducibility: with a fixed
# seed, repeated detect() calls on the same text are expected to agree
DetectorFactory.seed = 0

# AutoGluon library for automated machine learning
from autogluon.tabular import TabularPredictor  # TabularPredictor class for tabular data predictions

# **Reading Data**

In [2]:
# Read the training and test sets; both CSV files are looked up relative to
# the current working directory
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# **Data Processing and feature engineering**

In [3]:
# Tag every test row with the sentinel label -1 so train and test rows can be
# told apart after they are combined
test['Class'] = -1

# Stack train on top of test. ignore_index=True already gives the result a
# fresh 0..n-1 index, so no separate index reset is needed
full = pd.concat([train, test], ignore_index=True)

# Show (rows, columns) of the combined frame
print(full.shape)

(17996, 18)


In [4]:
# Normalise 'duration_in min/ms' to minutes and record which rows were converted.
# Values up to MINUTES_CUTOFF are treated as already being in minutes; anything
# larger is treated as milliseconds and divided by MS_PER_MINUTE.
MINUTES_CUTOFF = 100    # largest value still interpreted as minutes
MS_PER_MINUTE = 60000   # milliseconds in one minute

dur = full['duration_in min/ms']

# True where the raw value is in milliseconds. NaN compares False here, so a
# missing duration stays NaN and is NOT flagged as a converted value
is_ms = dur > MINUTES_CUTOFF

# Vectorised conversion (replaces the per-row Python loop)
full['duration_in min/ms'] = np.where(is_ms, dur / MS_PER_MINUTE, dur)

# 'new1': 1 if the original value was given in milliseconds, 0 otherwise
full['new1'] = is_ms.astype(int)

In [5]:
# Feature engineering functions

def count_special_character(string):
    """
    Return how many characters of a string are not digits.

    Despite the name, every character for which ``str.isdigit()`` is False is
    counted, so letters and whitespace are included, not only punctuation.

    Args:
    string (str): The input string.

    Returns:
    int: Number of non-digit characters in the string.
    """
    return sum(1 for ch in string if not ch.isdigit())

def feature_engineering(df):
    """
    Add character- and word-count features derived from the two name columns.

    Columns written (all computed with pandas ``.str`` methods, so a missing
    name yields NaN instead of raising):
        new2 / new3    length of 'Artist Name' / 'Track Name'
        new4 / new5    whitespace-separated word count of artist / track
        new6..new8     ASCII uppercase, lowercase and digit counts of artist
        new9..new11    ASCII uppercase, lowercase and digit counts of track
        new12 / new13  remaining characters of artist / track, i.e. everything
                       that is not an ASCII letter or digit (spaces,
                       punctuation, non-ASCII characters)

    Args:
    df (pd.DataFrame): Frame with string columns 'Artist Name' and 'Track Name'.

    Returns:
    None: The DataFrame is modified in place.
    """
    # New features
    print("Adding new features")

    artist = df['Artist Name']
    track = df['Track Name']

    # Length of each name in characters
    df['new2'] = artist.str.len()
    df['new3'] = track.str.len()

    # Number of whitespace-separated words in each name
    df['new4'] = artist.str.split().str.len()
    df['new5'] = track.str.split().str.len()

    # Uppercase, lowercase and digit counts in 'Artist Name' (ASCII ranges only)
    df['new6'] = artist.str.findall(r'[A-Z]').str.len()
    df['new7'] = artist.str.findall(r'[a-z]').str.len()
    df['new8'] = artist.str.findall(r'[0-9]').str.len()

    # Uppercase, lowercase and digit counts in 'Track Name' (ASCII ranges only)
    df['new9'] = track.str.findall(r'[A-Z]').str.len()
    df['new10'] = track.str.findall(r'[a-z]').str.len()
    df['new11'] = track.str.findall(r'[0-9]').str.len()

    # Count of other characters (not uppercase, lowercase, or digits).
    # The digit counts must be subtracted as well, otherwise digits would be
    # counted as "other" characters.
    df['new12'] = df['new2'] - (df['new6'] + df['new7'] + df['new8'])
    df['new13'] = df['new3'] - (df['new9'] + df['new10'] + df['new11'])

# Apply feature engineering to the full DataFrame; the new columns are added
# to `full` in place, so nothing is assigned from the call
feature_engineering(full)

Adding new features


In [6]:
# Import the regular expressions library
import re

def clean_text(text):
    """
    Normalise a piece of text by lowercasing it.

    Args:
    text (str): The input text.

    Returns:
    str: The input with all cased characters converted to lowercase.
    """
    return text.lower()

# Lowercase both name columns by running clean_text over every value
for name_col in ('Artist Name', 'Track Name'):
    full[name_col] = full[name_col].apply(clean_text)

In [7]:
# Text handed to the language detector: artist name followed by track name,
# repeated three times with no separator between the parts. It is kept as a
# local Series, so no temporary column has to be added to and removed from `full`.
# NOTE(review): without a separator the last word of the artist name is glued
# to the first word of the track name; confirm this is intended.
name_pair = full['Artist Name'] + full['Track Name']
txt = name_pair + name_pair + name_pair

# Detected language per row, in row order
lang = []

# Detect language for each concatenated text
for i in tqdm(txt):
    try:
        lang.append(detect(i))  # Detect language
    except Exception:
        # Best effort: any detection failure is recorded as 'err'. Catching
        # Exception instead of using a bare `except` lets KeyboardInterrupt
        # and SystemExit propagate, so this long loop can still be interrupted.
        lang.append('err')

# Add the detected language as a new column
full['lang'] = lang

100%|██████████| 17996/17996 [04:30<00:00, 66.46it/s]


In [8]:
# Split the combined frame back into training and test rows.
# Rows carrying the placeholder label -1 are the test rows; every other row
# belongs to the training set.
is_test_row = full['Class'] == -1

train2 = full[~is_test_row]
test2 = full[is_test_row]

In [9]:
# Name of the label column
target = 'Class'

# 'Id' is removed from both frames before modelling; the test ids are set
# aside first so they can be written to the submission file
test2_ids = test2['Id']
train2 = train2.drop(columns=['Id'])
test2 = test2.drop(columns=['Id'])

# Seed NumPy's global random generator before training
np.random.seed(42)

# Predictor settings: which column to predict and which metric to optimise
predictor_options = dict(
    label=target,
    eval_metric='f1_macro',
)

# Fit settings: training frame, quality preset, and automatic stacking
fit_options = dict(
    train_data=train2,
    presets='best_quality',
    auto_stack=True,
)

predictor = TabularPredictor(**predictor_options).fit(**fit_options)

No path specified. Models will be saved in: "AutogluonModels/ag-20240807_135117"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          2
Memory Avail:       11.19 GB / 12.67 GB (88.3%)
Disk Space Avail:   67.07 GB / 107.72 GB (62.3%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation

# **Prepare "submission.csv" file**

In [10]:
# Predict a class for every row of the test set
test_predictions = predictor.predict(test2)

# Two-column submission frame: the saved ids next to their predicted class
submission_columns = {'Id': test2_ids, target: test_predictions}
submission = pd.DataFrame(submission_columns)

# Write the frame to disk, leaving out the DataFrame index
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Submission file created successfully!
