In [9]:
import csv
import os
import nltk
import pickle
from langdetect import detect_langs, LangDetectException
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.corpora import Dictionary

# Clean the raw test-set lyrics and write them, with the song metadata, to a new CSV.
# Rows whose 'Lyrics' field is empty are skipped; every other row is written with a
# 'Cleaned_Lyrics' column holding the space-joined surviving tokens.

# Specify the path to your CSV file
input_path = 'test.csv'
output_path = 'clean_test_dataset.csv'

# Define the columns to keep
columns_to_keep = ['Artist', 'Song', 'Genre']

# The text-processing tools and word lists below do not depend on the row being
# processed, so they are built once here instead of once per row.
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()

# NLTK's English stop words extended with interjections and fragments
stop_words = stopwords.words('english')
new_stop_words = ['ooh', 'yeah', 'hey', 'whoa', 'woah', 'ohh', 'was', 'mmm', 'oooh', 'yah', 'yeh', 'mmm', 'hmm', 'deh', 'doh', 'jah', 'wa']
stop_words.extend(new_stop_words)

# Additional hand-picked tokens that are dropped after stop-word removal
non_meaningful_words = ['aaaa', 'aaah', 'aaaah', 'oooooh', 'ooooooooh', 'oooooooooh', 'aaalways', 'oooooooooh', 'aah', 'aachoo', 'accident', 'alalalalchemist', 'aback', 'aboard', 'ghost', 'abacus', 'absurd', 'across', 'log', 'abominal', 'aaaaaaaalllllright', 'aaaaaaaaaaaaaa', 'aaaaaaaaaaaaaah', 'aaaaaaaah', 'baldheaded', 'aaaaaaaaaaaaaaa', 'aaaaaaaaggghhhhhhhhhhhhhh', 'aaaaaaaaah', 'afrikaa', 'aan', 'aback', 'abnoxious']

# Both exclusion lists are applied to the lemmatized tokens, so a single set gives
# the same result as filtering by one list and then the other, with O(1) lookups.
excluded_tokens = set(stop_words) | set(non_meaningful_words)

# Open the input file and read it
with open(input_path, 'r', encoding='utf-8') as input_file:
    data_reader = csv.DictReader(input_file)

    # Open the output file for writing
    with open(output_path, 'w', newline='', encoding='utf-8') as output_file:
        fieldnames = columns_to_keep + ['Cleaned_Lyrics']  # Add a new field for the cleaned lyrics
        data_writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        data_writer.writeheader()  # Write the header

        # Iterate over each row in the input data
        for row in data_reader:
            # Keep only the metadata columns (the raw 'Lyrics' column is not copied)
            filtered_row = {key: row[key] for key in columns_to_keep}

            # Skip rows whose 'Lyrics' column is empty
            if not row['Lyrics']:
                continue

            # Tokenization: lower-case the text and split it into \w+ runs
            tokenized_lyrics = tokenizer.tokenize(row['Lyrics'].lower())

            # Drop tokens of one or two characters and purely numeric tokens
            filtered_tokens = [token for token in tokenized_lyrics if len(token) > 2 and not token.isnumeric()]

            # Lemmatization
            lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

            # Remove stop words and non-meaningful words (checked after lemmatization)
            cleaned_lyrics = ' '.join(token for token in lemmatized_tokens if token not in excluded_tokens)

            # Update the row with the cleaned lyrics
            filtered_row['Cleaned_Lyrics'] = cleaned_lyrics

            # Write the row to the output file
            data_writer.writerow(filtered_row)

print("Clean test dataset saved successfully as CSV.")


Clean test dataset saved successfully as CSV.


In [10]:
  from collections import Counter

# Tally the rows of the cleaned CSV by the value of their 'Genre' column.
# `output_path` is whatever an earlier cell last assigned to it.
genre_counts = Counter()
with open(output_path, 'r', encoding='utf-8') as cleaned_file:
    for record in csv.DictReader(cleaned_file):
        genre_counts[record['Genre']] += 1

# Report one "<genre>: <count>" line per genre, in first-seen order
print("Count of each genre in the cleaned dataset:")
for genre_name, n_rows in genre_counts.items():
    print(f"{genre_name}: {n_rows}")


Count of each genre in the cleaned dataset:
Hip-Hop: 960
Indie: 510
Metal: 810
Pop: 1110
Country: 810
Jazz: 660
Rock: 1410
R&B: 510
Electronic: 660
Folk: 495


In [15]:
import csv
import os
import nltk
import pickle
from langdetect import detect_langs, LangDetectException
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.corpora import Dictionary

# Clean the training-set lyrics and write them, with the song metadata, to a new CSV.
# The lyrics are read from the input's 'Cleaned_Lyrics' column; rows where that field
# is empty are skipped. The processed text is written back under the same column name.

# Specify the path to your CSV file
input_path = '12000_lyrics_dataset.csv'
output_path = 'clean_train_dataset.csv'

# Define the columns to keep
columns_to_keep = ['Artist', 'Title', 'Genre']

# The text-processing tools and word lists below do not depend on the row being
# processed, so they are built once here instead of once per row.
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()

# NLTK's English stop words extended with interjections and fragments
stop_words = stopwords.words('english')
new_stop_words = ['ooh', 'yeah', 'hey', 'whoa', 'woah', 'ohh', 'was', 'mmm', 'oooh', 'yah', 'yeh', 'mmm', 'hmm', 'deh', 'doh', 'jah', 'wa']
stop_words.extend(new_stop_words)

# Additional hand-picked tokens that are dropped after stop-word removal
non_meaningful_words = ['aaaa', 'aaah', 'aaaah', 'oooooh', 'ooooooooh', 'oooooooooh', 'aaalways', 'oooooooooh', 'aah', 'aachoo', 'accident', 'alalalalchemist', 'aback', 'aboard', 'ghost', 'abacus', 'absurd', 'across', 'log', 'abominal', 'aaaaaaaalllllright', 'aaaaaaaaaaaaaa', 'aaaaaaaaaaaaaah', 'aaaaaaaah', 'baldheaded', 'aaaaaaaaaaaaaaa', 'aaaaaaaaggghhhhhhhhhhhhhh', 'aaaaaaaaah', 'afrikaa', 'aan', 'aback', 'abnoxious']

# Both exclusion lists are applied to the lemmatized tokens, so a single set gives
# the same result as filtering by one list and then the other, with O(1) lookups.
excluded_tokens = set(stop_words) | set(non_meaningful_words)

# Open the input file and read it
with open(input_path, 'r', encoding='utf-8') as input_file:
    data_reader = csv.DictReader(input_file)

    # Open the output file for writing
    with open(output_path, 'w', newline='', encoding='utf-8') as output_file:
        fieldnames = columns_to_keep + ['Cleaned_Lyrics']  # Add a new field for the cleaned lyrics
        data_writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        data_writer.writeheader()  # Write the header

        # Iterate over each row in the input data
        for row in data_reader:
            # Keep only the metadata columns; the lyrics column is added back below
            filtered_row = {key: row[key] for key in columns_to_keep}

            # Skip rows whose 'Cleaned_Lyrics' column is empty
            if not row['Cleaned_Lyrics']:
                continue

            # Tokenization: lower-case the text and split it into \w+ runs
            tokenized_lyrics = tokenizer.tokenize(row['Cleaned_Lyrics'].lower())

            # Drop tokens of one or two characters and purely numeric tokens
            filtered_tokens = [token for token in tokenized_lyrics if len(token) > 2 and not token.isnumeric()]

            # Lemmatization
            lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

            # Remove stop words and non-meaningful words (checked after lemmatization)
            cleaned_lyrics = ' '.join(token for token in lemmatized_tokens if token not in excluded_tokens)

            # Update the row with the cleaned lyrics
            filtered_row['Cleaned_Lyrics'] = cleaned_lyrics

            # Write the row to the output file
            data_writer.writerow(filtered_row)

print("Clean train dataset saved successfully as CSV.")


Clean test dataset saved successfully as CSV.


In [16]:
import csv
from collections import defaultdict
import random

# Filter the cleaned test set down to a chosen set of genres, save it, then
# down-sample each genre to at most `num_samples_per_genre` rows and save that too.

input_path = 'clean_test_dataset.csv'
filtered_output_path = 'filtered_test_dataset.csv'
balanced_output_path = 'balanced_test_dataset.csv'
num_samples_per_genre = 660
# Fixed seed so the balanced sample is the same on every run of this cell
random_seed = 42

# Desired genres
desired_genres = {'Metal', 'Rap', 'Pop', 'Jazz'}

# Column layout of both output files
fieldnames = ['Artist', 'Title', 'Genre', 'Cleaned_Lyrics']

# Read the cleaned dataset and keep only the desired genres, replacing "Hip-Hop" with "Rap"
filtered_data = []
with open(input_path, 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        genre = row['Genre']
        if genre == 'Hip-Hop':
            genre = 'Rap'
        if genre in desired_genres:
            row['Genre'] = genre
            # The test-set cleaning cell in this notebook writes the song name under
            # 'Song', while the output schema here uses 'Title'. Without this rename
            # DictWriter raises ValueError for the field missing from `fieldnames`.
            if 'Song' in row and 'Title' not in row:
                row['Title'] = row.pop('Song')
            filtered_data.append(row)

# Write the filtered dataset to a CSV file
with open(filtered_output_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(filtered_data)

print(f"Filtered test dataset saved successfully as {filtered_output_path}.")

# Balance the dataset: group the rows by genre first
data_by_genre = defaultdict(list)
for row in filtered_data:
    data_by_genre[row['Genre']].append(row)

# A dedicated generator keeps the sampling reproducible without touching the
# global `random` state that other cells may rely on.
rng = random.Random(random_seed)

balanced_data = []
for genre, rows in data_by_genre.items():
    if len(rows) >= num_samples_per_genre:
        # Enough rows: draw exactly num_samples_per_genre without replacement
        balanced_data.extend(rng.sample(rows, num_samples_per_genre))
    else:
        # Too few rows: keep them all (this genre stays under-represented)
        balanced_data.extend(rows)

# Write the balanced dataset to a CSV file
with open(balanced_output_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(balanced_data)

print(f"Balanced test dataset saved successfully as {balanced_output_path}.")


Filtered test dataset saved successfully as filtered_test_dataset.csv.
Balanced test dataset saved successfully as balanced_test_dataset.csv.


In [18]:
import csv
from collections import defaultdict
import random

# Filter the cleaned test set down to a wider set of genres, save it, then
# down-sample each genre to at most `num_samples_per_genre` rows and save that too.

input_path = 'clean_test_dataset.csv'
filtered_output_path = 'filtered_test_dataset2.csv'
balanced_output_path = 'balanced_test_dataset2.csv'
num_samples_per_genre = 510
# Fixed seed so the balanced sample is the same on every run of this cell
random_seed = 42

# Desired genres
desired_genres = {'Metal', 'Rap', 'Pop', 'Jazz', 'Indie', 'Country', 'R&B', 'Electronic'}

# Column layout of both output files
fieldnames = ['Artist', 'Title', 'Genre', 'Cleaned_Lyrics']

# Read the cleaned dataset and keep only the desired genres, replacing "Hip-Hop" with "Rap"
filtered_data = []
with open(input_path, 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        genre = row['Genre']
        if genre == 'Hip-Hop':
            genre = 'Rap'
        if genre in desired_genres:
            row['Genre'] = genre
            # The test-set cleaning cell in this notebook writes the song name under
            # 'Song', while the output schema here uses 'Title'. Without this rename
            # DictWriter raises ValueError for the field missing from `fieldnames`.
            if 'Song' in row and 'Title' not in row:
                row['Title'] = row.pop('Song')
            filtered_data.append(row)

# Write the filtered dataset to a CSV file
with open(filtered_output_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(filtered_data)

print(f"Filtered test dataset saved successfully as {filtered_output_path}.")

# Balance the dataset: group the rows by genre first
data_by_genre = defaultdict(list)
for row in filtered_data:
    data_by_genre[row['Genre']].append(row)

# A dedicated generator keeps the sampling reproducible without touching the
# global `random` state that other cells may rely on.
rng = random.Random(random_seed)

balanced_data = []
for genre, rows in data_by_genre.items():
    if len(rows) >= num_samples_per_genre:
        # Enough rows: draw exactly num_samples_per_genre without replacement
        balanced_data.extend(rng.sample(rows, num_samples_per_genre))
    else:
        # Too few rows: keep them all (this genre stays under-represented)
        balanced_data.extend(rows)

# Write the balanced dataset to a CSV file
with open(balanced_output_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(balanced_data)

print(f"Balanced test dataset saved successfully as {balanced_output_path}.")

Filtered test dataset saved successfully as filtered_test_dataset2.csv.
Balanced test dataset saved successfully as balanced_test_dataset2.csv.
