In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [None]:
# Read the songs CSV (expected in the working directory) into a DataFrame.
songs = pd.read_csv(filepath_or_buffer=r"ml-03-data-processing-songs-dataset.csv")

In [None]:
# Print the column names, non-null counts and dtypes of the freshly loaded frame.
songs.info()

In [None]:
# Display the first rows of the freshly loaded frame.
songs.head()

In [None]:
# Draw one histogram per numeric column on a 15x10 inch figure.
songs.hist(figsize = (15,10))

In [None]:
# "Length (Duration)" is a string column at this point; strip the commas from
# its values and parse the result as numbers in a single step.
songs["Length (Duration)"] = pd.to_numeric(
    songs["Length (Duration)"].str.replace(",", "")
)

In [None]:
# fix string months values
def replace_month(month):
    """Map a three-letter English month abbreviation to its month number.

    Parameters
    ----------
    month : object
        A single value from the "Month" column, e.g. ``"Jan"`` or ``3``.

    Returns
    -------
    object
        The month number (1-12) when ``month`` is one of the twelve
        abbreviations below; any other value is returned unchanged.
    """
    # All twelve months are listed so that no abbreviation is left behind as
    # text, which would make a later integer conversion of the column fail.
    month_dict = {
        "Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5, "Jun": 6,
        "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12,
    }
    # Fall back to the input itself for values that are not abbreviations.
    return month_dict.get(month, month)

# Translate month abbreviations to numbers, then change the data type of the
# Month column to int. astype(int) raises if a value is missing or is still
# non-numeric text after the translation.
songs["Month"] = songs["Month"].apply(replace_month).astype(int)

# show the column dtypes after the conversion
songs.info()

In [None]:
# Repair the Year outliers: rows holding the two-digit value 92.0 become 1992.
# Missing years never compare equal to 92.0, so they are left as they are.
songs["Year"] = songs["Year"].mask(songs["Year"] == 92.0, songs["Year"] + 1900)
# sanity check: list any rows that still carry a year before 1900
songs[songs["Year"] < 1900]

In [None]:
# count na values in each column
#songs.isna().sum()

In [None]:
# Remove every column in which more than half of the rows are missing.
length = songs.shape[0]
drop_cols = [col for col in songs.columns if songs[col].isna().sum() > length/2]
songs.drop(columns=drop_cols, inplace=True)

In [None]:
# Drop rows in which more than half of the columns are missing.
# The missing-value counts are computed for all rows at once, and the rows to
# remove are identified by their actual index labels. This keeps the cell
# working when the index is not 0..n-1, e.g. when it is re-run after rows
# have already been removed.
width = songs.shape[1]
drop_rows = songs.index[songs.isna().sum(axis=1) > width/2].tolist()
songs.drop(index=drop_rows, inplace=True)

In [None]:
# for column in songs.columns:
#     if(pd.api.types.is_numeric_dtype(songs[column])):
#         mean = songs[column].mean()
#         std = songs[column].std()
#         new_df = songs[(songs[column] < (mean - 3 * std)) | (songs[column] > (mean + 3 * std))]
#         print("===========================================")
#         print(f"Column: {column}")
#         print(f"Mean: {mean}")
#         print(f"Lower Bound: {mean - 3 * std}, Upper bound: {mean + 3 * std}")
#         print(f"Outlier count: {new_df.shape[0]}")
#         plt.boxplot(new_df[column])
#         plt.show()
#         print("===========================================")

In [None]:
# Mean-impute the numeric columns and rebuild them as a DataFrame.

# numeric columns to impute
int_cols = [
    'Index',
    'Year',
    'Month',
    'Beats Per Minute (BPM)',
    'Energy',
    'Loudness (dB)',
    'Liveness',
    'Length (Duration)',
    'Acousticness',
    'Speechiness',
    'Popularity',
]
int_df = songs.loc[:, int_cols]

# replace each missing value with the mean of its column
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
int_array = imp.fit_transform(int_df)

# wrap the imputed values in a DataFrame that carries the original column names
int_df_imp = pd.DataFrame(int_array, columns=int_cols)

In [None]:
# Impute the categorical columns with their most frequent value and rebuild
# them as a DataFrame.

# categorical columns to impute
cat_cols = [
    'Title',
    'Artist',
    'Top Genre',
]
cat_df = songs.loc[:, cat_cols]

# replace each missing value with the most frequent value of its column
imp = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
cat_array = imp.fit_transform(cat_df)

# wrap the imputed values in a DataFrame that carries the original column names
cat_df_imp = pd.DataFrame(cat_array, columns=cat_cols)

In [None]:
# Put the imputed categorical and numeric frames side by side to form the
# full imputed dataframe.
imputed_songs = pd.concat(objs=[cat_df_imp, int_df_imp], axis="columns")

In [None]:
# create date column (first day of the song's year and month)
# Year and Month are converted to whole numbers and the date is assembled from
# its components. This avoids slicing the text form of the floats, which only
# works while every month prints as "N.0" and every year has four digits.
date_parts = pd.DataFrame({
    "year": imputed_songs["Year"].astype(int),
    "month": imputed_songs["Month"].astype(int),
    "day": 1,
})
imputed_songs["Date"] = pd.to_datetime(date_parts)

# create age column: whole days between the date and the moment this cell runs
# (the values therefore change from one run date to the next)
imputed_songs["Age"] = (pd.to_datetime("today") - imputed_songs["Date"]).dt.days

# drop other date related columns
imputed_songs.drop(["Date", "Year", "Month"], axis=1, inplace=True)

In [None]:
# Print the column names, non-null counts and dtypes of the imputed frame.
imputed_songs.info()

In [None]:
# create dummy variables for categorical columns
# Each set of dummy columns is prefixed with the name of its source column
# ("Title_", "Artist_", "Top Genre_"). Without the prefix, a value shared by
# two of these columns, or a value equal to an existing column name such as
# "Popularity", would produce duplicate column labels in songs_clean.
title_dummies = pd.get_dummies(imputed_songs["Title"], prefix="Title", drop_first = True)
artist_dummies = pd.get_dummies(imputed_songs["Artist"], prefix="Artist", drop_first = True)
genre_dummies = pd.get_dummies(imputed_songs["Top Genre"], prefix="Top Genre", drop_first = True)

# concat dataframe back to imputed_songs
songs_clean = pd.concat([imputed_songs, title_dummies, artist_dummies, genre_dummies], axis =1)
# drop the original categorical columns and the Index column
songs_clean.drop(["Title", "Artist", "Top Genre", "Index"], axis = 1, inplace=True)

In [None]:
# Display the first rows of the encoded frame.
songs_clean.head()

In [None]:
# Split the cleaned data into training and test sets.

# target: the Popularity column
target_col = songs_clean["Popularity"]

# features: every column except the target
model_df = songs_clean.drop(columns=["Popularity"])

# hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    model_df,
    target_col,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Summary statistics of the training features.
X_train.describe()

In [None]:
# Summary statistics of the test features.
X_test.describe()

In [None]:
# Summary statistics of the training target (Popularity).
y_train.describe()

In [None]:
# Summary statistics of the test target (Popularity).
y_test.describe()