### Step 1: Applying CTGAN

In [4]:
import pandas as pd

In [9]:
%pip install ctgan

Note: you may need to restart the kernel to use updated packages.


In [10]:
# Import Data
df_data_before = pd.read_csv('../Data/data_cleaned.csv')

In [11]:
# from ctgan import CTGAN

# # Initialize the CTGAN model
# ctgan = CTGAN(epochs=200)

# # Fit the model on your data
# # Assuming 'data' is your DataFrame
# ctgan.fit(df_data_before) 

In [12]:
from ctgan import CTGAN

# Initialize the CTGAN model
ctgan = CTGAN(epochs=200)

# List the names of the columns that are categorical in your dataset
categorical_columns = [
    'Primary streaming service',  # Options like Spotify, Apple Music, etc.
    'Hours per day',
    'While working',              # Yes/No or similar binary option
    'Instrumentalist',            # Yes/No or similar binary option
    'Composer',                   # Yes/No or similar binary option
    'Fav genre',                  # Genres like Classical, Jazz, etc.
    'Exploratory',                # Yes/No or similar binary option
    'Foreign languages',          # Yes/No or similar binary option
    'Frequency [Classical]',      # Likely categorical frequencies like Often, Rarely, etc.
    'Frequency [Country]',        # -- do --
    'Frequency [EDM]',            # -- do --
    'Frequency [Folk]',           # -- do --
    'Frequency [Gospel]',         # -- do --
    'Frequency [Hip hop]',        # -- do --
    'Frequency [Jazz]',           # -- do --
    'Frequency [K pop]',          # -- do --
    'Frequency [Latin]',          # -- do --
    'Frequency [Lofi]',           # -- do --
    'Frequency [Metal]',          # -- do --
    'Frequency [Pop]',            # -- do --
    'Frequency [R&B]',            # -- do --
    'Frequency [Rap]',            # -- do --
    'Frequency [Rock]',           # -- do --
    'Frequency [Video game music]',# -- do --
    'Anxiety',
    'Depression',
    'Insomnia',
    'OCD',
    'Music effects'               # Effects like Improve, No effect, etc.
]


# Fit the model on your data
# Assuming 'data' is your DataFrame
ctgan.fit(df_data_before, categorical_columns) 

In [13]:
# Create synthetic data
df_data_after = ctgan.sample(500)

In [14]:
df_data_after['Age'] = df_data_after['Age'].round().astype(int)
# df_data_after['Hours per day'] = df_data_after['Hours per day'].round().astype(int)
df_data_after['BPM'] = df_data_after['BPM'].round().astype(int)

In [15]:
# Combine original data with synthetic data
expanded_data = pd.concat([df_data_before, df_data_after], ignore_index=True)

# # Optionally, save the expanded dataset to a CSV file
# expanded_data.to_csv('../Data/data_cleaned_expanded.csv', index=False)

### Step 2: Process Data

In [16]:
df_process = expanded_data.copy()

In [17]:
df_process.describe()

Unnamed: 0,Age,Hours per day,BPM,Anxiety,Depression,Insomnia,OCD
count,1111.0,1111.0,1111.0,1111.0,1111.0,1111.0,1111.0
mean,21.856886,3.526868,108.648065,5.742124,5.0018,3.776778,2.738974
std,11.042353,2.898416,36.725098,2.802889,2.991429,3.077913,2.87609
min,-4.0,0.0,4.0,0.0,0.0,0.0,0.0
25%,15.0,2.0,80.5,3.0,2.0,1.0,0.0
50%,19.0,3.0,107.0,6.0,5.0,3.0,2.0
75%,25.0,5.0,132.0,8.0,7.0,6.0,5.0
max,89.0,24.0,225.0,10.0,10.0,10.0,10.0


In [18]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [19]:
# Normalize numerical data through Standard Scaling
stanard_scaler = StandardScaler()

numeric_columns = ['Age', 'Hours per day', 'BPM']
df_process[numeric_columns] = stanard_scaler.fit_transform(df_process[numeric_columns])

In [20]:
# Normalize numerical data through Min-Max Scaling
min_max_scaler = MinMaxScaler()

mental_columns = ['Anxiety', 'Depression', 'Insomnia', 'OCD']
df_process[mental_columns] = min_max_scaler.fit_transform(df_process[mental_columns])

In [21]:
# Encode categories into numerical value from 0-3
mapping = {'Never': 0, 'Rarely': 1, 'Sometimes': 2, 'Very frequently': 3}
genre_freq_columns = ['Frequency [Classical]', 'Frequency [Country]', 'Frequency [EDM]', 'Frequency [Folk]', 'Frequency [Gospel]', 'Frequency [Hip hop]', 'Frequency [Jazz]', 'Frequency [K pop]', 'Frequency [Latin]', 'Frequency [Lofi]', 'Frequency [Metal]', 'Frequency [Pop]', 'Frequency [R&B]', 'Frequency [Rap]', 'Frequency [Rock]', 'Frequency [Video game music]']

for column in genre_freq_columns:
    df_process[column] = df_process[column].replace(mapping)

In [22]:
# Encode binary categorical data
mapping = {'No': 0, 'Yes': 1}
binary_categorical_columns = ['While working', 'Instrumentalist', 'Composer', 'Exploratory', 'Foreign languages']

for column in binary_categorical_columns:
    df_process[column] = df_process[column].replace(mapping)

In [23]:
# One-hot encoding Categorical data
df_encodedCTGAN = pd.get_dummies(df_process, drop_first=False, dtype='int64')

In [24]:
# # Saved the pre-processed code
# df_encodedCTGAN.to_pickle('df_encodedCTGAN.pkl')

In [92]:
# df_encodedCTGAN.to_csv('../Data/encodedCTGAN.csv', index=False)

In [25]:
print("Number of rows:", df_encodedCTGAN.shape[0])

Number of rows: 1111
