In [None]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from keras import models
from keras import layers
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB


# Locations of the two input CSV files (lyrics table and artists table).
pathLyrics, pathArtists = 'lyrics-data.csv', 'artists-data.csv'

# Read both files into DataFrames: lyrics first, then artists.
dataframe, dataframeArtists = (
    pd.read_csv(csv_path) for csv_path in (pathLyrics, pathArtists)
)

# Data preparation

In [None]:
# Keep only English lyrics. Copy the filtered frame so the column assignment
# below writes to an independent object instead of a view of the original
# (avoids pandas' SettingWithCopyWarning).
dataframe = dataframe[dataframe['Idiom'] == 'ENGLISH'].copy()

# Rows without any lyric text cannot be used for classification; dropping them
# also keeps the string operations below free of NaN values.
dataframe = dataframe.dropna(subset=['Lyric'])

# Normalise lyrics to lowercase (vectorised, instead of a per-row lambda).
dataframe['Lyric'] = dataframe['Lyric'].str.lower()

# Join the lyrics with the artist table to attach the genre information.
mergeData = pd.merge(dataframe, dataframeArtists, how='inner', left_on='ALink', right_on='Link')

# Only the lyric text and its genre are needed for the dataset.
rawData = mergeData[['Lyric', 'Genre']]

# Drop rows whose lyric contains '----'; per the original author's note these
# entries hold musical notation rather than text. The pattern is matched
# literally, and missing values count as "no match".
has_notation = rawData['Lyric'].str.contains('----', regex=False, na=False)
rawData = rawData[~has_notation].copy()

# Replace every character that is not a letter or digit with a space.
# regex=True is required: since pandas 2.0 `str.replace` treats the pattern as
# a literal string by default, which would silently leave the text unchanged.
rawData['Lyric'] = rawData['Lyric'].str.replace('[^a-zA-Z0-9]', ' ', regex=True)

# Identical lyric texts would otherwise appear several times.
rawData = rawData.drop_duplicates(subset='Lyric')
#train, test = train_test_split(rawData, test_size=0.2)

In [None]:
from sklearn.utils import shuffle

# Keep only English lyrics; copy so the assignment below does not target a view
# of the original frame (avoids pandas' SettingWithCopyWarning).
dataframe = dataframe[dataframe['Idiom'] == 'ENGLISH'].copy()

# Rows without lyric text are unusable and would break the string operations.
dataframe = dataframe.dropna(subset=['Lyric'])

# Normalise lyrics to lowercase.
dataframe['Lyric'] = dataframe['Lyric'].str.lower()

# Remove every lyric that contains one of these marker substrings.
# The markers are matched literally (regex=False) and a missing value counts as
# "does not contain". The mask gets its own name so the builtin `filter` is not
# shadowed for the rest of the notebook.
patternDel = ["---", "instrumental", "==="]
for patt in patternDel:
    has_marker = dataframe['Lyric'].str.contains(patt, regex=False, na=False)
    dataframe = dataframe[~has_marker]

# Join the lyrics with the artist table to attach the genre columns.
mergeData = pd.merge(dataframe, dataframeArtists, how='inner', left_on='ALink', right_on='Link')

# Keep the lyric text plus both genre columns, one row per distinct lyric.
rawData = mergeData[['Lyric', 'Genre', 'Genres']]
rawData = rawData.drop_duplicates(subset='Lyric')

# Persist the cleaned data set for inspection / reuse.
rawData.to_csv('rawData.csv')

# Hold out 20 % of the rows for testing. A fixed random_state makes the split
# (and therefore every result computed from it) reproducible across runs; the
# value matches the one used for the baseline split.
train, test = train_test_split(rawData, test_size=0.2, random_state=0)


#pathLyrics = 'lyrics-data.csv'
#test = pd.read_csv('test.csv', sep=';')
#rawData= test[['Lyric', 'Genre']]
#rawData = shuffle(rawData)
#train, test = train_test_split(rawData, test_size=0.2)

# Baseline

In [None]:
# Encode the genre names as integer class ids.
label_encoder = LabelEncoder()
y_clean = label_encoder.fit_transform(rawData['Genre'])

# Split the raw texts BEFORE vectorising, so that the vocabulary is learned
# from the training lyrics only and nothing about the test set leaks into the
# features. With the same test_size and random_state the split selects the same
# rows as splitting an already-vectorised matrix would.
lyrics_train, lyrics_test, y_train, y_test = train_test_split(
    rawData['Lyric'], y_clean, test_size=0.2, random_state=0
)

# Bag-of-words counts: fit the vocabulary on train, reuse it for test.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(lyrics_train)
X_test = vectorizer.transform(lyrics_test)

# Train a multinomial Naive Bayes classifier with additive (Laplace) smoothing.
nb = MultinomialNB(alpha=1.0)
nb.fit(X_train, y_train)

# Predict the genre of the held-out lyrics.
y_predicted = nb.predict(X_test)

# Print the confusion matrix.
print("Kofusionsmatrix:\n", confusion_matrix(y_true=y_test, y_pred=y_predicted))

# Print the quality metrics. Note: for single-label multi-class predictions the
# micro-averaged precision, recall and F1 are all equal to the accuracy.
print("Korrektklassifizierungsrate:\n", accuracy_score(y_true=y_test, y_pred=y_predicted))
print("Präzision (mikro):\n", precision_score(y_true=y_test, y_pred=y_predicted, average='micro'))
print("Ausbeute (mikro):\n", recall_score(y_true=y_test, y_pred=y_predicted, average='micro'))
print("F1 (mikro):\n", f1_score(y_true=y_test, y_pred=y_predicted, average='micro'))

In [None]:
def vectorize_sequences(dataframe, dimensions=10000, tokenizer=None):
    """Encode texts as a binary bag-of-words matrix.

    Parameters
    ----------
    dataframe : iterable of str
        The texts to encode (the notebook passes a pandas Series of lyrics).
    dimensions : int, default 10000
        Passed to the Keras ``Tokenizer`` as ``num_words`` when a new tokenizer
        is created. Ignored when ``tokenizer`` is supplied.
    tokenizer : Tokenizer, optional
        A tokenizer to use instead of creating a new one. If it has not been
        fitted yet (empty ``word_index``) it is fitted on ``dataframe`` in
        place; if it is already fitted it is reused unchanged. Passing the same
        instance for the training and the test texts guarantees that both are
        encoded with the same word-to-column mapping.

    Returns
    -------
    The matrix produced by ``tokenizer.texts_to_matrix(dataframe, mode='binary')``:
    one row per text, with a 1 in every column whose word occurs in that text.
    """
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=dimensions)

    # Only learn the vocabulary when the tokenizer has none yet; an already
    # fitted tokenizer must not be changed by data it is merely encoding.
    if not tokenizer.word_index:
        tokenizer.fit_on_texts(dataframe)

    one_hot_results = tokenizer.texts_to_matrix(dataframe, mode='binary')
    print('Found %s unique tokens.' % len(tokenizer.word_index))

    return one_hot_results
    
def vectorize_labels(labels, encoder=None):
    """One-hot encode class labels.

    Parameters
    ----------
    labels : array-like
        The class labels to encode (the notebook passes a Series of genres).
    encoder : LabelEncoder, optional
        An encoder to use instead of creating a new one. If it has not been
        fitted yet it is fitted on ``labels`` in place; if it is already fitted
        it is reused unchanged, so training and test labels can share one
        label-to-index mapping.

    Returns
    -------
    The ``to_categorical`` one-hot matrix with one row per label and one column
    per class known to the encoder.

    Raises
    ------
    ValueError
        Raised by the encoder when a reused, already fitted ``encoder`` meets a
        label it has not seen during fitting.
    """
    le = preprocessing.LabelEncoder() if encoder is None else encoder

    # scikit-learn sets `classes_` during fit, so its absence means "unfitted".
    if not hasattr(le, 'classes_'):
        le.fit(labels)

    class_ids = le.transform(labels)

    # Fix the width to the number of known classes so that the result has the
    # same number of columns even when some class is absent from `labels`.
    return to_categorical(class_ids, num_classes=len(le.classes_))


# Feed Forward Network

In [None]:
# Encode the training split for the feed-forward network:
# lyrics become a binary bag-of-words matrix, genres become one-hot vectors.
train_lyrics, train_genres = train['Lyric'], train['Genre']
X_train = vectorize_sequences(train_lyrics)
y_train = vectorize_labels(train_genres)

In [None]:
# Take the layer sizes from the encoded training data instead of hard-coding
# them: the input width is the number of feature columns, the output width the
# number of one-hot genre columns. A hard-coded value that disagrees with the
# data would only surface as a shape error during fit.
n_features = X_train.shape[1]
n_classes = y_train.shape[1]

# Fully connected network: three ReLU hidden layers of decreasing width,
# followed by a softmax layer that outputs one probability per genre.
model = models.Sequential()
model.add(layers.Dense(128, activation='relu', input_shape=(n_features,)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(n_classes, activation='softmax'))

# categorical_crossentropy is the loss that matches a softmax output with
# one-hot encoded targets.
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Train the feed-forward network for 50 epochs in mini-batches of 100 samples,
# holding out 20 % of the training rows for validation.
fit_options = dict(epochs=50, batch_size=100, validation_split=0.2)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Pull the per-epoch loss curves out of the training history.
history_dict = history.history
train_loss, val_loss = history_dict['loss'], history_dict['val_loss']

# One x position per recorded epoch, counting from 1.
epochs = range(1, len(train_loss) + 1)

# Draw training and validation loss on a shared axis.
fig, ax = plt.subplots()
ax.plot(epochs, train_loss, label='Training loss')
ax.plot(epochs, val_loss, label='Validation loss')
ax.set(title='Training and validation loss', xlabel='Epoch', ylabel='Loss')
ax.legend()
ax.grid()
plt.show()

In [None]:
# Encode the test split with the vocabulary and the label mapping learned from
# the TRAINING split. Fitting a fresh tokenizer / label encoder on the test data
# would assign different columns to the same words (and possibly different ids
# to the same genres), so the model would be scored on inputs it cannot
# interpret.
#
# The tokenizer is rebuilt from the training lyrics here; fitting is
# deterministic, so this reproduces the mapping used for training (at the cost
# of one more pass over the training texts). Its width is taken from the model
# so that the encoded test matrix always matches the network's input layer.
train_tokenizer = Tokenizer(num_words=model.input_shape[-1])
train_tokenizer.fit_on_texts(train['Lyric'])
X_test = train_tokenizer.texts_to_matrix(test['Lyric'], mode='binary')

# Same idea for the labels: learn the genre -> index mapping from the training
# genres and keep one column per training class. `transform` raises a
# ValueError if the test split contains a genre that never occurs in training.
genre_encoder = LabelEncoder()
genre_encoder.fit(train['Genre'])
Y_test = to_categorical(
    genre_encoder.transform(test['Genre']),
    num_classes=len(genre_encoder.classes_),
)

# Compute loss and accuracy on the test data.
score = model.evaluate(X_test, Y_test, verbose=1)
print("Test score:", score[0])
print("Test accuracy:", score[1])

In [None]:
# Pull the per-epoch accuracy curves out of the training history.
history_dict = history.history
train_acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']

# One x position per recorded epoch, counting from 1. The length is taken from
# the accuracy curve plotted here, so this cell does not rely on a variable
# that only exists after the loss-plot cell has been run.
epochs = range(1, len(train_acc) + 1)

plt.title('Training and validation accuracy')
plt.plot(epochs, train_acc, label='Training accuracy')
plt.plot(epochs, val_acc, label='Validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid()
plt.show()

# Enhanced FFN

In [None]:

# TF-IDF features limited to at most 10000 terms. `stop_words` is left at its
# default, i.e. no stop-word removal is applied.
tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_features=10000)

# Learn vocabulary and IDF weights from the training lyrics and encode them in
# one step. (`fit` alone returns the vectorizer itself, not a feature matrix.)
X_train = tfidf_vectorizer.fit_transform(train['Lyric'])

# One-hot encode the training genres.
y_train = vectorize_labels(train['Genre'])



In [None]:
# Take the layer sizes from the encoded training data instead of hard-coding
# them. `max_features` is only an upper bound on the TF-IDF vocabulary, so the
# real number of feature columns has to be read from the matrix; the output
# width is the number of one-hot genre columns.
n_features = X_train.shape[1]
n_classes = y_train.shape[1]

# Small fully connected network: two ReLU hidden layers with 16 units each,
# followed by a softmax layer that outputs one probability per genre.
model_tfidf = models.Sequential()
model_tfidf.add(layers.Dense(16, activation='relu', input_shape=(n_features,)))
model_tfidf.add(layers.Dense(16, activation='relu'))
model_tfidf.add(layers.Dense(n_classes, activation='softmax'))

# categorical_crossentropy is the loss that matches a softmax output with
# one-hot encoded targets.
model_tfidf.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
model_tfidf.summary()

In [None]:
# Train the TF-IDF network for 50 epochs in mini-batches of 100 samples,
# holding out 20 % of the training rows for validation.
fit_options = dict(epochs=50, batch_size=100, validation_split=0.2)
history_tfidf = model_tfidf.fit(X_train, y_train, **fit_options)

In [None]:
# Pull the per-epoch loss curves of the TF-IDF model out of its history.
history_dict = history_tfidf.history
train_loss, val_loss = history_dict['loss'], history_dict['val_loss']

# One x position per recorded epoch, counting from 1.
epochs = range(1, len(train_loss) + 1)

# Draw training and validation loss on a shared axis.
fig, ax = plt.subplots()
ax.plot(epochs, train_loss, label='Training loss')
ax.plot(epochs, val_loss, label='Validation loss')
ax.set(title='Training and validation loss', xlabel='Epoch', ylabel='Loss')
ax.legend()
ax.grid()
plt.show()

In [None]:
# Encode the test lyrics with the vectorizer that was fitted on the TRAINING
# lyrics. It must only be applied here, not fitted again: refitting on the test
# split would replace the vocabulary and IDF weights the model was trained on
# (and may even change the number of feature columns).
X_test = tfidf_vectorizer.transform(test['Lyric'])

# Use the genre -> index mapping of the training split for the test labels and
# keep one column per training class. `transform` raises a ValueError if the
# test split contains a genre that never occurs in training.
genre_encoder = LabelEncoder()
genre_encoder.fit(train['Genre'])
Y_test = to_categorical(
    genre_encoder.transform(test['Genre']),
    num_classes=len(genre_encoder.classes_),
)

# Compute loss and accuracy on the test data.
score = model_tfidf.evaluate(X_test, Y_test, verbose=1)
print("Test score:", score[0])
print("Test accuracy:", score[1])

In [None]:
# Pull the per-epoch accuracy curves of the TF-IDF model out of its history.
history_dict = history_tfidf.history
train_acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']

# One x position per recorded epoch, counting from 1. The length is taken from
# the accuracy curve plotted here, so this cell does not rely on a variable
# that only exists after the loss-plot cell has been run.
epochs = range(1, len(train_acc) + 1)

plt.title('Training and validation accuracy')
plt.plot(epochs, train_acc, label='Training accuracy')
plt.plot(epochs, val_acc, label='Validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid()
plt.show()