In [4]:
# Import spacy
import spacy

# Install the English language model, but only when it is missing.
# spacy.cli.download installs into the interpreter the kernel is running on
# (a bare `!spacy ...` uses whichever `spacy` is first on the shell PATH),
# and the guard avoids re-downloading the wheel on every "Run All".
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------- ------------------------------ 2.9/12.8 MB 16.7 MB/s eta 0:00:01
     ----------------------- ---------------- 7.6/12.8 MB 20.4 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 21.3 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 20.1 MB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [5]:
# Import os to read the documents and metadata from disk
import os

# Load spaCy visualizer
from spacy import displacy

# Import pandas DataFrame packages
import pandas as pd
# Turn off pandas' chained-assignment warning (the option's default is 'warn')
pd.set_option("mode.chained_assignment", None)

# Import graphing package
import plotly.express as px

In [6]:
# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Base directory: each sub-folder is treated as an album and each .txt file
# inside it as one song
base_dir = "One Direction.texts"

# Parallel lists, one entry per song; they are (re)initialised here so that
# re-running this cell never appends duplicates
filenames = []
titles = []
artists = []
albums = []
documents = []
texts = []  # not filled by this cell; kept in case later cells reference it
tokens = []
lemmas = []
pos = []

# Iterate through albums.
# os.listdir returns entries in arbitrary order, so sort (case-insensitively)
# to get the same row order on every machine and every run.
for album in sorted(os.listdir(base_dir), key=str.casefold):
    album_path = os.path.join(base_dir, album)
    if not os.path.isdir(album_path):
        # Skip stray files sitting next to the album folders
        continue

    # Iterate through songs in the album
    for song_file in sorted(os.listdir(album_path), key=str.casefold):
        if not song_file.endswith('.txt'):
            continue

        # Extract song title (remove .txt extension)
        title = os.path.splitext(song_file)[0]

        # Read lyrics from the file
        with open(os.path.join(album_path, song_file), 'r', encoding='utf-8') as inf:
            document = inf.read()

        # Process the text with spaCy
        doc = nlp(document)

        # Append data to lists
        filenames.append(song_file)
        titles.append(title)
        artists.append('One Direction')
        albums.append(album)
        documents.append(document)
        # Tokenization: the surface text of every token
        tokens.append([token.text for token in doc])
        # Lemmatization: the base form of every token
        lemmas.append([token.lemma_ for token in doc])
        # Text enrichment: spaCy's coarse part-of-speech tag (token.pos_)
        pos.append([token.pos_ for token in doc])


In [14]:
# Assemble the per-song lists into a DataFrame: one row per song,
# one column per list
paper_df = pd.DataFrame(
    dict(
        Filename=filenames,
        Title=titles,
        Artist=artists,
        Album=albums,
        Document=documents,
        Tokens=tokens,
        Lemmas=lemmas,
        Tags=pos,
    )
)


In [15]:
# Preview the first five rows of the DataFrame
paper_df.head(n=5)


Unnamed: 0,Filename,Title,Artist,Album,Document,Tokens,Lemmas,Tags
0,back for you.txt,back for you,One Direction,Take me Home,Back for you\n\nWhenever I close my eyes I pic...,"[Back, for, you, \n\n, Whenever, I, close, my,...","[back, for, you, \n\n, whenever, I, close, my,...","[ADV, ADP, PRON, SPACE, SCONJ, PRON, VERB, PRO..."
1,Change my mind.txt,Change my mind,One Direction,Take me Home,Change my mind\n\nThe end of the night\nWe sho...,"[Change, my, mind, \n\n, The, end, of, the, ni...","[change, my, mind, \n\n, the, end, of, the, ni...","[VERB, PRON, NOUN, SPACE, DET, NOUN, ADP, DET,..."
2,Cmon cmon.txt,Cmon cmon,One Direction,Take me Home,Cmon cmon\n\nThe one that I came with\nShe had...,"[Cmon, cmon, \n\n, The, one, that, I, came, wi...","[Cmon, cmon, \n\n, the, one, that, I, come, wi...","[PROPN, PROPN, SPACE, DET, NOUN, PRON, PRON, V..."
3,Heart attack.txt,Heart attack,One Direction,Take me Home,Heart attack\n\nBaby you got me sick\nI don’t ...,"[Heart, attack, \n\n, Baby, you, got, me, sick...","[heart, attack, \n\n, Baby, you, get, I, sick,...","[NOUN, NOUN, SPACE, PROPN, PRON, VERB, PRON, A..."
4,I would.txt,I would,One Direction,Take me Home,I would\n\nLately I found myself thinking\nBee...,"[I, would, \n\n, Lately, I, found, myself, thi...","[I, would, \n\n, lately, I, find, myself, thin...","[PRON, AUX, SPACE, ADV, PRON, VERB, PRON, VERB..."


In [16]:
# Collapse every run of whitespace (including the newlines between lyric
# lines) into a single space and trim both ends.
# The pattern is a raw string: in a plain string literal '\s' is an invalid
# escape sequence and Python emits a warning for it.
# Only the Document column is rewritten; Tokens, Lemmas and Tags were computed
# from the original text and still contain the whitespace tokens.
paper_df['Document'] = paper_df['Document'].str.replace(r'\s+', ' ', regex=True).str.strip()
paper_df.head()

  paper_df['Document'] = paper_df['Document'].str.replace('\s+', ' ', regex=True).str.strip()


Unnamed: 0,Filename,Title,Artist,Album,Document,Tokens,Lemmas,Tags
0,back for you.txt,back for you,One Direction,Take me Home,Back for you Whenever I close my eyes I pictur...,"[Back, for, you, \n\n, Whenever, I, close, my,...","[back, for, you, \n\n, whenever, I, close, my,...","[ADV, ADP, PRON, SPACE, SCONJ, PRON, VERB, PRO..."
1,Change my mind.txt,Change my mind,One Direction,Take me Home,Change my mind The end of the night We should ...,"[Change, my, mind, \n\n, The, end, of, the, ni...","[change, my, mind, \n\n, the, end, of, the, ni...","[VERB, PRON, NOUN, SPACE, DET, NOUN, ADP, DET,..."
2,Cmon cmon.txt,Cmon cmon,One Direction,Take me Home,Cmon cmon The one that I came with She had to ...,"[Cmon, cmon, \n\n, The, one, that, I, came, wi...","[Cmon, cmon, \n\n, the, one, that, I, come, wi...","[PROPN, PROPN, SPACE, DET, NOUN, PRON, PRON, V..."
3,Heart attack.txt,Heart attack,One Direction,Take me Home,Heart attack Baby you got me sick I don’t know...,"[Heart, attack, \n\n, Baby, you, got, me, sick...","[heart, attack, \n\n, Baby, you, get, I, sick,...","[NOUN, NOUN, SPACE, PROPN, PRON, VERB, PRON, A..."
4,I would.txt,I would,One Direction,Take me Home,I would Lately I found myself thinking Been dr...,"[I, would, \n\n, Lately, I, found, myself, thi...","[I, would, \n\n, lately, I, find, myself, thin...","[PRON, AUX, SPACE, ADV, PRON, VERB, PRON, VERB..."


In [17]:
# Save the DataFrame to a CSV file, leaving out the row index
paper_df.to_csv(path_or_buf='one_direction_songs_detailed.csv', index=False)

In [18]:
# Read the CSV written above back in as the metadata table.
# Note: CSV stores the list-valued columns (Tokens, Lemmas, Tags) as text,
# so they come back as strings rather than Python lists.
metadata_df = pd.read_csv(filepath_or_buffer='one_direction_songs_detailed.csv')
metadata_df.head(n=5)

Unnamed: 0,Filename,Title,Artist,Album,Document,Tokens,Lemmas,Tags
0,back for you.txt,back for you,One Direction,Take me Home,Back for you Whenever I close my eyes I pictur...,"['Back', 'for', 'you', '\n\n', 'Whenever', 'I'...","['back', 'for', 'you', '\n\n', 'whenever', 'I'...","['ADV', 'ADP', 'PRON', 'SPACE', 'SCONJ', 'PRON..."
1,Change my mind.txt,Change my mind,One Direction,Take me Home,Change my mind The end of the night We should ...,"['Change', 'my', 'mind', '\n\n', 'The', 'end',...","['change', 'my', 'mind', '\n\n', 'the', 'end',...","['VERB', 'PRON', 'NOUN', 'SPACE', 'DET', 'NOUN..."
2,Cmon cmon.txt,Cmon cmon,One Direction,Take me Home,Cmon cmon The one that I came with She had to ...,"['Cmon', 'cmon', '\n\n', 'The', 'one', 'that',...","['Cmon', 'cmon', '\n\n', 'the', 'one', 'that',...","['PROPN', 'PROPN', 'SPACE', 'DET', 'NOUN', 'PR..."
3,Heart attack.txt,Heart attack,One Direction,Take me Home,Heart attack Baby you got me sick I don’t know...,"['Heart', 'attack', '\n\n', 'Baby', 'you', 'go...","['heart', 'attack', '\n\n', 'Baby', 'you', 'ge...","['NOUN', 'NOUN', 'SPACE', 'PROPN', 'PRON', 'VE..."
4,I would.txt,I would,One Direction,Take me Home,I would Lately I found myself thinking Been dr...,"['I', 'would', '\n\n', 'Lately', 'I', 'found',...","['I', 'would', '\n\n', 'lately', 'I', 'find', ...","['PRON', 'AUX', 'SPACE', 'ADV', 'PRON', 'VERB'..."


In [19]:
# Remove the ".txt" extension from each song's filename.
# With regex=True an unescaped '.' matches any character and the pattern can
# match anywhere in the name, so escape the dot and anchor it to the end:
# only a literal trailing ".txt" is removed.
paper_df['Filename'] = paper_df['Filename'].str.replace(r'\.txt$', '', regex=True)