In [34]:
# Install spaCy 
!pip install spacy

# Download spaCy English language model
!python -m spacy download en_core_web_sm

import os
import spacy
import pandas as pd

# Load spaCy English language model
nlp = spacy.load("en_core_web_sm")

# Define the path to the directory containing song lyrics files
data_path = r"C:/Users/dudud/collecting data/lyrics/Midnights_TheTillDawnEdition_"



Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.1/12.8 MB 3.2 MB/s eta 0:00:04
     ---------------------------------------- 0.1/12.8 MB 2.8 MB/s eta 0:00:05
      --------------------------------------- 0.3/12.8 MB 2.4 MB/s eta 0:00:06
     - -------------------------------------- 0.3/12.8 MB 2.3 MB/s eta 0:00:06
     - -------------------------------------- 0.4/12.8 MB 2.5 MB/s eta 0:00:05
     - -------------------------------------- 0.5/12.8 MB 2.5 MB/s eta 0:00:05
     - -------------------------------------- 0.6/12.8 MB 2.4 MB/s eta 0:00:06
     - -------------------------------------- 0.6/12.8 MB 2.4 MB/s eta 0:00:06
     -- ------------------------------------- 0.8/12.8 MB 2.4 MB/s eta 0:00:05
     -- ---------------------------------

In [35]:
# Column accumulators for the annotated corpus: one entry per processed song.
# Each name is bound to its own, independent empty list.
filenames, titles, documents = [], [], []
tokens_list, lemmas_list, pos_list = [], [], []

# Function to process and annotate a lyric file
def process_lyric_file(file_path):
    """Annotate one lyric file with spaCy and record the result.

    The file is read as UTF-8 text and passed through the module-level
    ``nlp`` pipeline. One entry is then appended to each of the module-level
    lists ``filenames``, ``titles``, ``documents``, ``tokens_list``,
    ``lemmas_list`` and ``pos_list``.

    Args:
        file_path: Path to the lyric text file.

    Returns:
        None. The results are stored in the module-level lists.
    """
    # The song title is the file name without its extension.
    name_with_ext = os.path.basename(file_path)
    stem, _extension = os.path.splitext(name_with_ext)

    with open(file_path, "r", encoding="utf-8") as handle:
        raw_lyrics = handle.read()

    # Run the spaCy pipeline over the full text of the song.
    parsed = nlp(raw_lyrics)

    # Collect the surface form, lemma and coarse POS tag of every token.
    surface_forms, base_forms, pos_tags = [], [], []
    for tok in parsed:
        surface_forms.append(tok.text)
        base_forms.append(tok.lemma_)
        pos_tags.append(tok.pos_)

    # Append one value to each column accumulator so the lists stay aligned.
    for column, value in (
        (filenames, name_with_ext),
        (titles, stem),
        (documents, raw_lyrics),
        (tokens_list, surface_forms),
        (lemmas_list, base_forms),
        (pos_list, pos_tags),
    ):
        column.append(value)

# Iterate through each lyric file in the dataset folder.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the row order of the DataFrame and the CSV reproducible across runs.
for file_name in sorted(os.listdir(data_path)):
    if file_name.endswith(".txt"):
        file_path = os.path.join(data_path, file_name)
        process_lyric_file(file_path)

# Create a DataFrame to store the data (one row per song, one column per list)
data = {
    "Filename": filenames,
    "Title": titles,
    "Document": documents,
    "Tokens": tokens_list,
    "Lemmas": lemmas_list,
    "Parts-of-speech": pos_list,
}

df = pd.DataFrame(data)

# Save the DataFrame to a CSV file.
# Note: the list-valued columns (Tokens, Lemmas, Parts-of-speech) are written
# as their string representation; parse them (e.g. with ast.literal_eval)
# when reading the file back.
csv_path = "C:/Users/dudud/collecting data/lyrics/annotated_corpus.csv"
df.to_csv(csv_path, index=False)

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,Filename,Title,Document,Tokens,Lemmas,Parts-of-speech
0,Anti_Hero.txt,Anti_Hero,Anti-Hero Lyrics[Verse 1]\nI have this thing w...,"[Anti, -, Hero, Lyrics[Verse, 1, ], \n, I, hav...","[Anti, -, Hero, lyrics[verse, 1, ], \n, I, hav...","[PROPN, PROPN, PROPN, NOUN, X, PUNCT, SPACE, P..."
1,Bejeweled.txt,Bejeweled,"Bejeweled Lyrics[Verse 1]\nBaby love, I think ...","[Bejeweled, Lyrics[Verse, 1, ], \n, Baby, love...","[bejewel, lyrics[verse, 1, ], \n, Baby, love, ...","[VERB, NOUN, X, PUNCT, SPACE, PROPN, NOUN, PUN..."
2,BiggerThanTheWholeSky.txt,BiggerThanTheWholeSky,Bigger Than The Whole Sky Lyrics[Verse 1]\nNo ...,"[Bigger, Than, The, Whole, Sky, Lyrics[Verse, ...","[big, than, the, Whole, Sky, lyrics[verse, 1, ...","[ADJ, ADP, DET, PROPN, PROPN, NOUN, AUX, PUNCT..."
3,DearReader.txt,DearReader,"Dear Reader Lyrics[Verse 1]\nDear reader, if i...","[Dear, Reader, Lyrics[Verse, 1, ], \n, Dear, r...","[Dear, Reader, lyrics[verse, 1, ], \n, dear, r...","[PROPN, PROPN, NOUN, X, PUNCT, SPACE, ADJ, NOU..."
4,Glitch.txt,Glitch,Glitch Lyrics[Verse 1]\nWe were supposed to be...,"[Glitch, Lyrics[Verse, 1, ], \n, We, were, sup...","[glitch, lyrics[verse, 1, ], \n, we, be, suppo...","[VERB, NOUN, X, PUNCT, SPACE, PRON, AUX, VERB,..."
