# Dependencies

In [42]:
# Import Dependencies
# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# Matplotlib visualization
import matplotlib.pyplot as plt
%matplotlib inline

# Splitting data into training and testing
from sklearn.model_selection import train_test_split

# Preprocessing library to encode data
from sklearn import preprocessing

# Machine Learning Model
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

In [3]:
# Import Train and Test Data
train_data = pd.read_csv("train_final_clean_df.csv")
test_data = pd.read_csv("test_final_clean_df.csv")

In [4]:
train_data.head(2)

Unnamed: 0,source_system_tab,source_screen_name,source_type,song_length,genre_ids,artist_name,composer,language,city,age,gender,registered_via,target,age_group
0,explore,Explore,online-playlist,206471.0,359,Bastille,Dan Smith| Mark Crew,52.0,1,0,Unknown,7,1,0
1,my library,Local playlist more,local-playlist,284584.0,1259,Various Artists,Unknown,52.0,13,24,female,9,1,1


In [5]:
test_data.head(2)

Unnamed: 0,source_system_tab,source_screen_name,source_type,song_length,genre_ids,artist_name,composer,language,city,age,gender,registered_via,age_group
0,my library,Local playlist more,local-library,224130.0,458,梁文音 (Rachel Liang),Qi Zheng Zhang,3.0,1,0,Unknown,7,0
1,my library,Local playlist more,local-library,320470.0,465,林俊傑 (JJ Lin),林俊傑,3.0,1,0,Unknown,7,0


# Encode label columns

In [20]:
train_encoded_data = train_data.copy()
test_encoded_data = test_data.copy()

# Replace Unseen items in test data
test_encoded_data['source_screen_name'] = test_encoded_data['source_screen_name'].replace('People local','Unknown')
test_encoded_data['source_screen_name'] = test_encoded_data['source_screen_name'].replace('People global','Unknown')
test_encoded_data['genre_ids'] = test_encoded_data['genre_ids'].replace(1089,0)
test_encoded_data['genre_ids'] = test_encoded_data['genre_ids'].replace(166,0)
test_encoded_data['genre_ids'] = test_encoded_data['genre_ids'].replace(765,0)
test_encoded_data['genre_ids'] = test_encoded_data['genre_ids'].replace(1061,0)
test_encoded_data['genre_ids'] = test_encoded_data['genre_ids'].replace(303,0)
test_encoded_data['genre_ids'] = test_encoded_data['genre_ids'].replace(2045,0)


# Select the categorical columns
categorical_columns = ['source_system_tab', 'source_screen_name', 'source_type', 'genre_ids', 'gender']

i = 0
le = []

for column in categorical_columns:
    le.append(preprocessing.LabelEncoder())
    fit = le[i].fit(train_encoded_data[column])
    transform = le[i].transform(train_encoded_data[column])
    train_encoded_data[column] = transform
    i += 1
    
i = 0

for column in categorical_columns:
    transform = le[i].transform(test_encoded_data[column])
    test_encoded_data[column] = transform
    i += 1

In [24]:
train_encoded_data = train_encoded_data[["source_system_tab", "source_screen_name", "source_type",
                                         "song_length", "genre_ids", "language", "city", "age_group",
                                         "gender", "registered_via", "target"]]
train_encoded_data.head(2)

Unnamed: 0,source_system_tab,source_screen_name,source_type,song_length,genre_ids,language,city,age_group,gender,registered_via,target
0,2,7,7,206471.0,21,52.0,1,0,0,7,1
1,4,8,5,284584.0,105,52.0,13,1,1,9,1


In [25]:
test_encoded_data = test_encoded_data[["source_system_tab", "source_screen_name", "source_type",
                                         "song_length", "genre_ids", "language", "city", "age_group",
                                         "gender", "registered_via"]]
test_encoded_data.head(2)

Unnamed: 0,source_system_tab,source_screen_name,source_type,song_length,genre_ids,language,city,age_group,gender,registered_via
0,4,8,4,224130.0,33,3.0,1,0,0,7
1,4,8,4,320470.0,34,3.0,1,0,0,7


In [26]:
# Create features and targets datasets
features = train_encoded_data.drop("target", axis = 1)
targets = pd.DataFrame(train_encoded_data['target'])

In [27]:
features.head(2)

Unnamed: 0,source_system_tab,source_screen_name,source_type,song_length,genre_ids,language,city,age_group,gender,registered_via
0,2,7,7,206471.0,21,52.0,1,0,0,7
1,4,8,5,284584.0,105,52.0,13,1,1,9


In [28]:
targets.head(2)

Unnamed: 0,target
0,1
1,1


# Split Data in Trainning and Testing datasets

In [29]:
# Split into 70% training and 30% testing set
X, X_test, y, y_test = train_test_split(features, targets, test_size = 0.3, random_state = 42)

print(X.shape)
print(X_test.shape)
print(y.shape)
print(y_test.shape)

(5164192, 10)
(2213226, 10)
(5164192, 1)
(2213226, 1)


# Scale data

In [31]:
# Create the scaler object with a range of 0-1
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

# Fit on the training data
scaler.fit(X)

# Transform both the training and testing data
X = scaler.transform(X)
X_test = scaler.transform(X_test)

In [43]:
# Convert labels to one-hot-encoding
y = to_categorical(y)
y_test = to_categorical(y_test)

# Apply Deep Learning Model

In [44]:
# Create model and add layers
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=10))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=2, activation='softmax'))

In [45]:
# Compile and fit the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [46]:
model.fit(
    X,
    y,
    epochs=10,
    shuffle=True,
    verbose=2
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/10
 - 238s - loss: 0.6546 - acc: 0.6242
Epoch 2/10
 - 234s - loss: 0.6523 - acc: 0.6260
Epoch 3/10
 - 238s - loss: 0.6516 - acc: 0.6266
Epoch 4/10
 - 238s - loss: 0.6511 - acc: 0.6269
Epoch 5/10
 - 229s - loss: 0.6507 - acc: 0.6273
Epoch 6/10
 - 223s - loss: 0.6505 - acc: 0.6273
Epoch 7/10
 - 222s - loss: 0.6503 - acc: 0.6275
Epoch 8/10
 - 224s - loss: 0.6500 - acc: 0.6277
Epoch 9/10
 - 223s - loss: 0.6499 - acc: 0.6278
Epoch 10/10
 - 223s - loss: 0.6498 - acc: 0.6279


<tensorflow.python.keras.callbacks.History at 0x1a3048e780>

## Quantify our Trained Model

In [47]:
model_loss, model_accuracy = model.evaluate(
    X_test, y_test, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

 - 45s - loss: 0.6490 - acc: 0.6289
Normal Neural Network - Loss: 0.6490448119184432, Accuracy: 0.628908634185791


## Make Predictions

In [49]:
encoded_predictions = model.predict_classes(X_test[:5])

In [50]:
print(f"Predicted classes: {encoded_predictions}")
print(f"Actual Labels: {list(y_test[:5])}")

Predicted classes: [0 0 0 1 0]
Actual Labels: [array([1., 0.], dtype=float32), array([0., 1.], dtype=float32), array([1., 0.], dtype=float32), array([1., 0.], dtype=float32), array([0., 1.], dtype=float32)]
