In [1]:
import nltk
import pandas as pd
import numpy as np

# NLP
import string
import re

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [2]:
def length(text):
    """
    Measure ``text`` with the built-in ``len``.

    For a string this is the number of characters (not the number of words).

    :param text: The text to be measured
    :return: ``len(text)``
    """
    size = len(text)
    return size

In [3]:
def characterSubstitute(text):
    """
    Blank out noise tokens in a lyric string and squeeze repeated spaces.

    Every newline, carriage return, digit, and every occurrence of the exact (case-sensitive) strings
    "Hook 1" and "chorus" is replaced by a space; afterwards each run of spaces is collapsed to one space.

    :param text: the text to be processed
    :return: the processed text
    """
    noise_pattern = re.compile(r'\n|\r|Hook 1|[0-9]|chorus')
    blanked = noise_pattern.sub(" ", text)
    return re.sub(' +', ' ', blanked)

In [4]:
def removePunctuation(text):
    """
    Delete every character listed in ``string.punctuation`` from the text.

    Example: ``"Let's try, Mike."`` becomes ``"Lets try Mike"``.

    :param text: The text whose punctuation marks are to be removed
    :return: The text without punctuation marks
    """
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = {ord(mark): None for mark in string.punctuation}
    return text.translate(deletion_table)

In [5]:
def applyStopWords(text):
    """
    Lowercase the words of a text and drop the English stop words.

    The text is split on whitespace; each word is lowercased and kept only if it is not in the
    list returned by ``stopwords.words('english')``.

    :param text: The text to filter
    :return: The remaining lowercased words joined by single spaces
    """
    # A set gives constant-time membership tests; the list returned by NLTK would be scanned for every word.
    stop_words = set(stopwords.words('english'))

    kept_words = []
    for word in text.split():
        lowered = word.lower()  # lowercase once, reuse for both the test and the output
        if lowered not in stop_words:
            kept_words.append(lowered)

    # joining the list of words with space separator
    return " ".join(kept_words)

In [6]:
def stemming(text):
    """
    Stem every whitespace-separated word of a text with the English Snowball stemmer.

    :param text: The text that you want to stem
    :return: The stemmed words joined by single spaces
    """
    english_stemmer = SnowballStemmer("english")
    return " ".join(map(english_stemmer.stem, text.split()))

In [7]:
def createDF1(file_name):
    """
    Load the raw CSV, rename its columns and add two lyric-length columns.

    The first CSV column is used as the index. Columns "seq", "song" and "label" are renamed to
    "lyrics", "song name" and "valence". 'length' holds ``length(lyrics)`` for each row and
    'length_log' its natural logarithm.

    :param file_name: the name of the file you want to import
    :return: The dataframe with the renamed columns plus 'length' and 'length_log'
    """
    column_names = {"seq": "lyrics", "song": "song name", "label": "valence"}

    # Import CSV file and apply the new column names
    raw = pd.read_csv(file_name, sep=',', index_col=[0])
    df = raw.rename(columns=column_names)

    # 'length' comes from the length() helper; 'length_log' is its natural log
    df['length'] = df['lyrics'].apply(length)
    df['length_log'] = np.log(df['length'])

    return df

In [8]:
def dataCleansing(df):
    """
    Deduplicate, filter and normalise the lyrics of a song dataframe.

    Steps:
      1. drop rows whose 'lyrics' duplicate an earlier row;
      2. keep rows with 500 < 'length' < 2000 and with 'valence' above 0.85 or below 0.15;
      3. add a binary 'Mood' column (1 when valence > 0.85, otherwise 0);
      4. clean 'lyrics': characterSubstitute, removePunctuation, lowercase, applyStopWords, stemming.

    The row filters only read 'length' and 'valence', which the text clean-up never touches, so they are
    applied first: the result is the same, but the clean-up runs only on the rows that are kept.

    :param df: the dataframe that contains the 'lyrics', 'length' and 'valence' columns
    :return: A new dataframe with the filtered rows, the cleaned 'lyrics' column and the 'Mood' column.
    """
    # Drop Duplicates
    df = df.drop_duplicates(subset=['lyrics'])

    # Keep song with lyrics length between 500 and 2000 and a clearly high or low valence
    length_in_range = (df['length'] < 2000) & (df['length'] > 500)
    extreme_valence = (df['valence'] > 0.85) | (df['valence'] < 0.15)

    # .copy() makes the filtered frame independent, so the column assignments below
    # are not made on a slice of another frame (SettingWithCopyWarning).
    df = df[length_in_range & extreme_valence].copy()

    # Create binary column: 1 represent "happy" mood while 0 represent "sad column"
    df['Mood'] = np.where(df['valence'] > 0.85, 1, 0)

    # NOTE(review): characterSubstitute matches "chorus" case-sensitively and runs before lowercasing, and
    # punctuation is stripped before stop words are removed (stop words containing an apostrophe presumably
    # no longer match). The original step order is kept here — confirm it is intended.
    df['lyrics'] = (
        df['lyrics']
        .apply(characterSubstitute)  # Substitute special regex/characters
        .apply(removePunctuation)    # Remove punctuation
        .apply(str.lower)            # Lowercase all words
        .apply(applyStopWords)       # Remove StopWords
        .apply(stemming)             # Stemming
    )

    return df

In [9]:
def downSampling(df, n_per_class=8000, random_state=None):
    """
    Build a shuffled, class-balanced sample of the dataframe.

    The same number of rows is drawn from the rows with Mood == 0 and from the rows with Mood == 1.
    That number is ``n_per_class`` capped at the size of the smaller class, so the function no longer
    raises when a class has fewer than ``n_per_class`` rows.

    :param df: The dataframe you want to down sample; must contain a 'Mood' column
    :param n_per_class: Maximum number of rows to draw per class; None means "as many as the smaller class has"
    :param random_state: Optional seed passed to ``DataFrame.sample`` for reproducible draws
    :return: A dataframe with the same number of negative and positive moods, in random order.
    """
    class_counts = df['Mood'].value_counts()
    smallest_class = int(min(class_counts.get(0, 0), class_counts.get(1, 0)))
    print(smallest_class)

    if n_per_class is None:
        requires_n = smallest_class
    else:
        requires_n = min(n_per_class, smallest_class)

    # Nothing can be balanced when one of the classes is absent: return an empty frame with the same columns.
    if requires_n == 0:
        return df.iloc[0:0]

    negative_mood = df[df['Mood'] == 0].sample(n=requires_n, random_state=random_state)
    positive_mood = df[df['Mood'] == 1].sample(n=requires_n, random_state=random_state)

    down_sampling_data = pd.concat([negative_mood, positive_mood])

    # The frac keyword argument specifies the fraction of rows to return to the random sample, so frac=1 means to
    # return all rows (in random order).
    return down_sampling_data.sample(frac=1, random_state=random_state)

In [10]:
def handlingData(file_name='1_First Data.csv', output_file='2_ML Data.csv'):
    """
    It takes the raw data, cleans it, down samples it and saves the result as CSV.

    The default output name is '2_ML Data.csv', the exact name basicModelPipeline() reads; the previous
    spelling "2_ML data.csv" only matched it on case-insensitive file systems.

    :param file_name: CSV file with the raw data
    :param output_file: CSV file the down-sampled data is written to
    :return: ml_data
    """
    data = createDF1(file_name)
    cleaned_data = dataCleansing(data)
    ml_data = downSampling(cleaned_data)
    ml_data.to_csv(output_file)

    return ml_data

In [11]:
# It imports the pandas library and renames it to pd.
import pandas as pd

# Importing the train_test_split, TfidfVectorizer, and Pipeline functions from the sklearn library.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Importing the models that we are going to use.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB

# Importing the functions that we are going to use to evaluate the models.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import plot_confusion_matrix

# It imports the pyplot module from the matplotlib library and renames it to plt.
from matplotlib import pyplot as plt

In [12]:
def createDF2(file_name):
    """
    Load only the 'lyrics' and 'Mood' columns of a CSV file.

    :param file_name: the name of the file you want to import
    :return: A dataframe with the columns 'lyrics' and 'Mood'
    """
    wanted_columns = ['lyrics', 'Mood']
    return pd.read_csv(file_name, usecols=wanted_columns)

In [13]:
def TFIDF(df):
    """
    Replace the 'lyrics' column of a dataframe with its TF-IDF features.

    A ``TfidfVectorizer`` limited to 1000 features is fitted on the lyrics; the result has every other
    column of ``df`` followed by one column per TF-IDF feature. The input dataframe is not modified.

    :param df: the dataframe that contains the lyrics' column
    :return: A new dataframe with the non-lyrics columns of ``df`` and the vectorized lyrics.
    """
    vectorizer = TfidfVectorizer(max_features=1000)

    # Missing lyrics are treated as empty documents so that fitting does not fail on NaN values.
    x = vectorizer.fit_transform(df['lyrics'].fillna(''))

    # get_feature_names() was removed from scikit-learn; get_feature_names_out() is used for both purposes.
    feature_names = vectorizer.get_feature_names_out()
    print(feature_names)

    # Reusing df's index keeps the feature rows aligned with their source rows in the concat below,
    # whatever index the input frame carries.
    vectorizer_df = pd.DataFrame(x.toarray(), columns=feature_names, index=df.index)
    remaining_columns = df.drop(columns='lyrics')

    return pd.concat([remaining_columns, vectorizer_df], axis=1)

In [14]:
def trainTestSplit(df):
    """
    Separate the 'Mood' target from the features, split 70/30 and save the four parts as CSV.

    The split uses ``test_size=0.3`` and ``random_state=42``. The parts are written to "x_train.csv",
    "x_test.csv", "y_train.csv" and "y_test.csv" in the working directory.

    :param df: the dataframe
    :return: X_train, X_test, y_train, y_test
    """
    features = df.drop(columns='Mood')
    target = df['Mood']

    parts = train_test_split(features, target, test_size=0.3, random_state=42)

    # train_test_split returns the parts in exactly this order
    csv_names = ("x_train.csv", "x_test.csv", "y_train.csv", "y_test.csv")
    for part, csv_name in zip(parts, csv_names):
        part.to_csv(csv_name)

    x_train, x_test, y_train, y_test = parts
    return x_train, x_test, y_train, y_test

In [15]:
def trainModel(model, x_train, y_train):
    """
    Wrap a model in a single-step Pipeline (step name 'clf') and fit it on the training data.

    :param model: the model to be trained
    :param x_train: the training data
    :param y_train: the labels of the training data
    :return: The fitted Pipeline.
    """
    steps = [('clf', model)]
    # Pipeline.fit returns the pipeline itself
    return Pipeline(steps).fit(x_train, y_train)

In [16]:
def basicModelPipeline():
    """
    Train a set of default classifiers on the TF-IDF features and report their test metrics.

    It reads '2_ML Data.csv', vectorizes the lyrics (saved as "3_Optimization Data.csv"), splits the data
    into train and test sets, fits every model on the train set, then prints the accuracy, precision,
    recall and macro-averaged f1 of each model on the test set and saves the table as
    "4_Models Results Before Fine Tuning.csv".
    """
    ml_data = createDF2('2_ML Data.csv')
    df_after_TFIDF = TFIDF(ml_data)
    df_after_TFIDF.to_csv("3_Optimization Data.csv")
    x_train, x_test, y_train, y_test = trainTestSplit(df_after_TFIDF)

    ml_models = {
        'LogReg': LogisticRegression(),
        'LinearSVC': LinearSVC(),
        'DecisionTree': DecisionTreeClassifier(),
        'RandomForest': RandomForestClassifier(),
        'SVM': SVC(),
        'KNN': KNeighborsClassifier(),
        'AdaBoost': AdaBoostClassifier(),
        'Naive Bayes': MultinomialNB()
    }

    # One entry per model; each entry maps a metric name to its score on the test set.
    scores_by_model = {}
    for model_name, estimator in ml_models.items():
        classifier = trainModel(estimator, x_train, y_train)
        y_predict = classifier.predict(x_test)

        scores_by_model[model_name] = {
            'accuracy': accuracy_score(y_test, y_predict),
            'precision': precision_score(y_test, y_predict),
            'recall': recall_score(y_test, y_predict),
            'f1': f1_score(y_test, y_predict, average='macro'),
        }

    # Rows are the metrics, columns are the models.
    df_metrics = pd.DataFrame(scores_by_model, index=['accuracy', 'precision', 'recall', 'f1'])

    print(df_metrics)
    df_metrics.to_csv("4_Models Results Before Fine Tuning.csv")
In [1]:
# Importing the json module, matplotlib, and the pandas library (with the alias pd).
import json

import matplotlib
import pandas as pd

# Importing the numpy library and giving it the alias np.
import numpy as np
# Importing the mean function from the numpy library.
from numpy import mean

# Importing the necessary libraries for the model.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Importing the standard error of the mean function from the scipy library.
from scipy.stats import sem

# Importing the pyplot module from the matplotlib library.
from matplotlib import pyplot, pyplot as plt
# %matplotlib inline

import seaborn as sns

In [2]:
def createDF(file_name):
    """
    Load a CSV file into a dataframe using the default ``pd.read_csv`` settings.

    :param file_name: the name of the file you want to import
    :return: The dataframe read from the file
    """
    # Import CSV file
    return pd.read_csv(file_name)

In [3]:
# evaluate a random forest with a given number of cross-validation repeats
def evaluate_model(x, y, repeats):
    """
    Score a default RandomForestClassifier with repeated 10-fold cross-validation.

    :param x: The input data
    :param y: The target variable
    :param repeats: the number of times to repeat the cross-validation procedure
    :return: The accuracy scores returned by ``cross_val_score``
    """
    # 10 folds, repeated `repeats` times, with a fixed seed for the fold assignment
    splitter = RepeatedKFold(n_splits=10, n_repeats=repeats, random_state=1)
    forest = RandomForestClassifier()
    return cross_val_score(forest, x, y, scoring='accuracy', cv=splitter, n_jobs=-1)

In [4]:
def crossValidation(x, y):
    """
    Evaluate the model with 1 to 5 cross-validation repeats and summarize the results.

    For every repeat count the mean accuracy and its standard error are printed; afterwards a box plot
    of the scores per repeat count is shown.

    :param x: The input data
    :param y: The target variable
    """
    # configurations to test
    repeats = range(1, 6)
    results = []

    # The second figure printed per line is scipy's sem(), i.e. the standard error of the mean,
    # so the header names it as such (it used to say "Standard Deviation").
    print("Cross Validation scores (Mean & Standard Error)")
    for r in repeats:
        # evaluate using a given number of repeats
        scores = evaluate_model(x, y, r)
        # summarize
        print('>%d mean=%.4f se=%.3f' % (r, mean(scores), sem(scores)))
        # store
        results.append(scores)

    # plot the results
    pyplot.boxplot(results, labels=[str(r) for r in repeats], showmeans=True)
    pyplot.show()



In [5]:
def HyperRF(x_train, x_test, y_train, y_test, random_state=None):
    """
    Tune a random forest with a 10-fold grid search and plot the confusion matrix of the best model.

    The grid covers ``max_depth`` in [32, 64] and ``n_estimators`` in [128, 256]. The full grid-search
    table is written to 'Grid Search Results.csv' and the best parameters, as JSON, to
    'Random forest Best Parameters.txt'.

    :param x_train: training features
    :param x_test: test features
    :param y_train: training labels
    :param y_test: test labels
    :param random_state: optional seed for the random forest, for reproducible results
    :return: The best estimator found by the grid search (refitted by GridSearchCV).
    """
    param_grid = dict(max_depth=[32, 64], n_estimators=[128, 256])

    # Build the grid search. The base estimator is created with default hyper-parameters: the candidate
    # values belong in param_grid only, not in the estimator's constructor.
    dfrst = RandomForestClassifier(random_state=random_state)
    grid = GridSearchCV(estimator=dfrst, param_grid=param_grid, cv=10)
    grid_results = grid.fit(x_train, y_train)

    # Summarize the results in a readable format (best_score_ is the mean CV score of the best candidate)
    print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))
    results_df = pd.DataFrame(grid_results.cv_results_)
    print(results_df)
    results_df.to_csv('Grid Search Results.csv')

    # Writing Json data into a file
    with open('Random forest Best Parameters.txt', 'w') as outfile:
        json.dump(grid_results.best_params_, outfile)

    # Extract the best decision forest
    best_clf = grid_results.best_estimator_
    y_pred = best_clf.predict(x_test)

    # Create a confusion matrix
    cnf_matrix = confusion_matrix(y_test, y_pred)

    # Create heatmap from the confusion matrix, drawn explicitly on the axes created here
    class_names = [False, True]  # name  of classes
    tick_marks = [0.5, 1.5]  # centres of the two heatmap cells
    fig, ax = plt.subplots(figsize=(7, 6))
    sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g', ax=ax)
    ax.xaxis.set_label_position("top")
    ax.set_title('Confusion matrix')
    ax.set_ylabel('Actual label')
    ax.set_xlabel('Predicted label')
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(class_names)
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(class_names)
    # Lay out last so the title and labels are taken into account
    fig.tight_layout()

    return best_clf

In [8]:
def fineTuning(run_cross_validation=False):
    """
    Tune the random forest on the train/test split saved as CSV files.

    Reads "x_train.csv", "x_test.csv", "y_train.csv" and "y_test.csv" and passes them to HyperRF().
    The files are read with ``index_col=0`` so that a saved row index in the first column is restored
    as the index instead of becoming an extra feature or label column.

    :param run_cross_validation: when True, also load '3_Optimization Data.csv' and run crossValidation()
        on it before tuning; off by default because the call was disabled in the original code
    :return: The best estimator returned by HyperRF()
    """
    if run_cross_validation:
        RF_data = createDF('3_Optimization Data.csv')
        # 'Unnamed: 0' is the name pandas gives to a header-less first column (a saved index); drop it if present.
        x = RF_data.drop(columns=['Mood', 'Unnamed: 0'], errors='ignore')
        y = RF_data['Mood']
        crossValidation(x, y)

    print("step 1")

    x_train = pd.read_csv("x_train.csv", index_col=0)
    x_test = pd.read_csv("x_test.csv", index_col=0)
    # After index_col=0 the label files have a single column; take it as a 1-D Series.
    y_train = pd.read_csv("y_train.csv", index_col=0).iloc[:, 0]
    y_test = pd.read_csv("y_test.csv", index_col=0).iloc[:, 0]

    model = HyperRF(x_train, x_test, y_train, y_test)
    return model

In [None]:
# Pipeline entry points, listed in execution order. The first two stages are commented out,
# so this cell relies on the CSV files that fineTuning() reads already being on disk.
# handlingData()
# basicModelPipeline()
fineTuning()

print("OK!")

step 1


In [None]:
# Download the NLTK 'stopwords' resource; applyStopWords() loads stopwords.words('english').
# NOTE(review): this cell sits after the pipeline cells — presumably it has to run once before handlingData(); confirm.
nltk.download('stopwords')

In [None]:
# Manual grid search: fit one random forest per parameter combination and keep the combination
# with the highest accuracy on the test split.
# NOTE(review): x_train, x_test, y_train and y_test are not assigned at notebook top level in the
# visible cells — this cell presumably relies on state from an earlier interactive session; confirm.
max_depth = [None, 32]
n_estimators = [100, 256]
bootstrap = [True, False]

best_accuracy = 0
best_params = {}

# Same visiting order as three nested loops: depth outermost, bootstrap innermost.
candidates = [
    (depth, estimator, use_bootstrap)
    for depth in max_depth
    for estimator in n_estimators
    for use_bootstrap in bootstrap
]

for depth, estimator, use_bootstrap in candidates:
    forest = RandomForestClassifier(bootstrap=use_bootstrap, max_depth=depth, n_estimators=estimator)
    classifier = trainModel(forest, x_train, y_train)
    y_predict = classifier.predict(x_test)
    accuracy = accuracy_score(y_test, y_predict)

    # Strict comparison: on a tie the earlier combination is kept.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = {'bootstrap': use_bootstrap, 'max_depth': depth, 'n_estimators': estimator}