### Import libraries and set display options

In [None]:
import pandas as pd
import numpy as np
import random

# Show up to 50 columns and 50 rows when a DataFrame is displayed.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 50)

In [None]:
# Load the raw chart export. "," is read_csv's default separator, so it is
# not passed explicitly.
df = pd.read_csv("universal_top_spotify_songs.csv")

# Preview the first five rows.
df.head()

### Cleaning Data

In [None]:
# Print column dtypes and non-null counts for the freshly loaded data.
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
df.describe()

In [None]:
# Keep 'instrumentalness' to four decimal places.
df["instrumentalness"] = df["instrumentalness"].round(decimals=4)

# Preview the result.
df.head()

In [None]:
# Count the distinct values in the 'name' and 'artists' columns. Each
# 'artists' value is the full credit string of a track, so collaborations
# count as their own entry.
num_name = df['name'].nunique()
num_artist = df['artists'].nunique()

# Draw up to five random rows; capping at len(df) keeps sample() from raising
# ValueError when the frame holds fewer than five rows.
random_rows = df.sample(n=min(5, len(df)))

# Report the counts as a sentence.
output = f"The total unique number of song names is {num_name}, and the total unique number of artists is {num_artist}."
print(output)

print("\nSongs:")
# Walk the two needed columns directly instead of building a Series per row
# with iterrows().
for song, artist in zip(random_rows['name'], random_rows['artists']):
    print(f"'{song}' by {artist}")

In [None]:
# List the distinct country codes; a missing code shows up as NaN.
df["country"].unique()

In [None]:
# Rows without a country code are labelled "GBL" (mapped to 'Global' later on).
# NOTE(review): read_csv's default NA parsing also reads the literal string
# "NA" as missing, so a Namibia ("NA") row would be relabelled as global here —
# confirm the data contains no such rows.
country_codes = df["country"]
df["country"] = country_codes.fillna("GBL")

# Discard every row that still has a missing value in any column.
df = df.dropna(axis=0, how="any")

In [None]:
# Re-check the country codes after filling the missing ones with "GBL".
df["country"].unique()

In [None]:
# Confirm that no missing values remain after dropna().
df.isnull().sum()

In [None]:
# Remove rows that exactly duplicate an earlier row (the first one is kept).
df = df.drop_duplicates()

In [None]:
# Preview the first rows after de-duplication.
df.head()

In [None]:
# Inspect the distinct 'instrumentalness' values (rounded to 4 decimals earlier).
df["instrumentalness"].unique()

In [None]:
# Parse both date columns into datetime64 values.
# NOTE(review): errors="coerce" turns any unparseable value into NaT, so
# missing values can reappear here even though dropna() ran earlier.
for date_column in ("snapshot_date", "album_release_date"):
    df[date_column] = pd.to_datetime(df[date_column], errors="coerce")

# Preview the converted columns.
df.head()

In [None]:
# Verify the dtypes after the date conversion.
df.info()

In [None]:
# Split the 'artists' string on ", " into a Python list per row.
# NOTE(review): an artist whose own name contains ", " would be split into
# two entries — confirm this is acceptable for the data.
# NOTE: to_csv() will write these lists out as their string representation.
df['artists_list'] = df['artists'].str.split(', ')

In [None]:
# Country code -> display name, sorted by code. "GBL" is the placeholder
# assigned earlier to rows that had no country code.
country_mapping = {
    "AE": "United Arab Emirates",
    "AR": "Argentina",
    "AT": "Austria",
    "AU": "Australia",
    "BE": "Belgium",
    "BG": "Bulgaria",
    "BO": "Bolivia",
    "BR": "Brazil",
    "BY": "Belarus",
    "CA": "Canada",
    "CH": "Switzerland",
    "CL": "Chile",
    "CO": "Colombia",
    "CR": "Costa Rica",
    "CZ": "Czech Republic",
    "DE": "Germany",
    "DK": "Denmark",
    "DO": "Dominican Republic",
    "EC": "Ecuador",
    "EE": "Estonia",
    "EG": "Egypt",
    "ES": "Spain",
    "FI": "Finland",
    "FR": "France",
    "GB": "United Kingdom",
    "GBL": "Global",
    "GR": "Greece",
    "GT": "Guatemala",
    "HK": "Hong Kong",
    "HN": "Honduras",
    "HU": "Hungary",
    "ID": "Indonesia",
    "IE": "Ireland",
    "IL": "Israel",
    "IN": "India",
    "IS": "Iceland",
    "IT": "Italy",
    "JP": "Japan",
    "KR": "South Korea",
    "KZ": "Kazakhstan",
    "LT": "Lithuania",
    "LU": "Luxembourg",
    "LV": "Latvia",
    "MA": "Morocco",
    "MX": "Mexico",
    "MY": "Malaysia",
    "NG": "Nigeria",
    "NI": "Nicaragua",
    "NL": "Netherlands",
    "NO": "Norway",
    "NZ": "New Zealand",
    "PA": "Panama",
    "PE": "Peru",
    "PH": "Philippines",
    "PK": "Pakistan",
    "PL": "Poland",
    "PT": "Portugal",
    "PY": "Paraguay",
    "RO": "Romania",
    "SA": "Saudi Arabia",
    "SE": "Sweden",
    "SG": "Singapore",
    "SK": "Slovakia",
    "SV": "El Salvador",
    "TH": "Thailand",
    "TR": "Turkey",
    "TW": "Taiwan",
    "UA": "Ukraine",
    "US": "United States",
    "UY": "Uruguay",
    "VE": "Venezuela",
    "VN": "Vietnam",
    "ZA": "South Africa",
}

# Swap each code for its name; codes missing from the mapping stay unchanged.
df["country"] = df["country"].replace(to_replace=country_mapping)

# Show the resulting set of country names.
df["country"].unique()

In [None]:
# List all column names, including the derived 'artists_list' column.
df.columns

In [None]:
# Preview the data with country names in place of codes.
df.head()

In [None]:
# Relabel the boolean flag as text. Only True/False are mapped and any other
# value is kept as-is, which makes the cell safe to re-run: np.where() treats
# the already-converted non-empty strings as truthy, so a second run would
# turn every row into 'explicit'.
explicit_labels = {True: 'explicit', False: 'non-explicit'}
df['is_explicit'] = df['is_explicit'].map(explicit_labels).fillna(df['is_explicit'])

In [None]:
# Tally the rows per 'explicit' / 'non-explicit' label.
df["is_explicit"].value_counts()

In [None]:
# Write the cleaned data to disk without the DataFrame index.
df.to_csv("cleaned_top_spotify_song.csv", index=False)