# WEB SCRAPING

In [None]:
import html
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup #tool that helps us navigate the file



# 0. Pitchfork: Best Songs of the 1980s

In [None]:
# Download the 1980s list and parse it into a BeautifulSoup tree (`soup`).
url80s = "https://pitchfork.com/features/lists-and-guides/9700-the-200-best-songs-of-the-1980s/"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url80s, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

In [None]:
# Extracting ranking number: every rank is the text of a
# <div class="heading-h3">; dots are removed from it.
ranking_divs = soup.find_all('div', class_='heading-h3')
rankings = [div.text.strip().replace('.', '') for div in ranking_divs]

# Extracting artist, song name, year, review and reviewer.
# Song headings are <h2> tags shaped like "Artist: Song (Year)"; any <h2>
# that does not fit that shape is skipped.
heading_pattern = re.compile(r'(.+?):\s*(.*?)\s*\((\d+)\)')

song_info = soup.find_all('h2')
song_data = []
for info in song_info:
    # Extracting text from the h2 tag
    text = info.text.strip()

    match = heading_pattern.search(text)
    if not match:
        continue

    artist = match.group(1).strip()
    song = match.group(2).strip()
    year = match.group(3).strip()

    # The review is the first <p> after the heading. Look it up once and
    # fall back to None when it is missing instead of crashing on
    # `None.text` (the 1990s cell uses None for missing values as well).
    review_tag = info.find_next('p')
    review = review_tag.text.strip() if review_tag is not None else None

    # The reviewer is the first <em> found after that paragraph.
    # NOTE(review): find_next is not limited to the current entry, so an
    # entry without an <em> may pick up a later entry's reviewer -- verify
    # against the page markup.
    reviewer_tag = review_tag.find_next('em') if review_tag is not None else None
    reviewer = reviewer_tag.text.strip() if reviewer_tag is not None else None

    # Appending all data to song_data
    song_data.append((artist, song, year, review, reviewer))

#Create the DF

columns = ['Artist', 'Song', 'Year', 'Review', 'Reviewer']
df80s = pd.DataFrame(song_data, columns=columns)

# Add the 'Ranking' column to the DataFrame.
# pandas raises ValueError here if the number of rank <div>s differs from the
# number of parsed headings, which signals that the page layout changed.
df80s['Ranking'] = rankings
df80s['Source'] = 'Pitchfork'
df80s['Source Title'] = 'The 200 Best Songs of the 1980s'

# Display the DataFrame
df80s.head(5)

In [None]:
#df80s.to_csv('df80s.csv', index=False)

# 1. Pitchfork: Best 250 songs of the 1990s

In [None]:
# Download the 1990s list and parse it into a BeautifulSoup tree (`soup`).
url90s = "https://pitchfork.com/features/lists-and-guides/the-best-songs-of-the-1990s/"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url90s, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

"In this case, the name of the reviewer is included at the end of the review"

In [None]:
# Rank labels come from every <div class="heading-h3">; dots are removed and
# divs whose text ends up empty are ignored.
ranking_divs_90s = soup.find_all('div', class_='heading-h3')
rankings_90s = [
    label
    for label in (div.text.strip().replace('.', '') for div in ranking_divs_90s)
    if label
]

In [None]:
# Extracting artist, song name, year, review and reviewer

# Song headings are <h2> tags shaped like "Artist: Song (Year)".
heading_pattern = re.compile(r'(.+?):\s*(.*?)\s*\((\d+)\)')

# On this page the reviewer signs at the very end of the review, after an
# en dash. Anchor on the LAST en dash: matching from the first one would cut
# the review short whenever the prose itself contains an en dash and would
# dump the rest of the review into the reviewer column.
signature_pattern = re.compile(r'–\s*([^–\n]+)$')

song_info = soup.find_all('h2')
song_data = []
for info in song_info:
    # Extracting text from the h2 tag
    text = info.text.strip()

    match = heading_pattern.search(text)
    if not match:
        # Not a song heading
        continue

    artist = match.group(1).strip()
    song = match.group(2).strip()
    year = match.group(3).strip()

    # The review is the first <p> after the heading; tolerate a missing tag.
    review_tag = info.find_next('p')
    review = review_tag.text.strip() if review_tag is not None else None

    # Extracting reviewer (in this case name is included in the review)
    reviewer = None
    if review is not None:
        reviewer_match = signature_pattern.search(review)
        if reviewer_match:
            reviewer = reviewer_match.group(1).strip()
            # Remove the reviewer's name from the review text
            review = review[:reviewer_match.start()].strip()

    # Appending all data to song_data
    song_data.append((artist, song, year, review, reviewer))

#Create the DF

columns = ['Artist', 'Song', 'Year', 'Review', 'Reviewer']
df90s = pd.DataFrame(song_data, columns=columns)

# Add the 'Ranking' column to the DataFrame.
# pandas raises ValueError here if the number of ranks differs from the
# number of parsed headings.
df90s['Ranking'] = rankings_90s
df90s['Source'] = 'Pitchfork'
df90s['Source Title'] = 'The 250 Best Songs of the 1990s'

# Display the DataFrame
df90s.head(5)

In [None]:
#df90s.to_csv('df90s.csv', index=False)

# 2. Pitchfork: The Top 100 Singles of 2000-04

In [None]:
# Download the 2000-04 list and parse it into a BeautifulSoup tree (`soup`).
url00s = "https://pitchfork.com/features/lists-and-guides/5949-the-top-100-singles-of-2000-04-part-one/"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url00s, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

In [None]:
# Collect every <strong> tag on the page and display the result for manual
# inspection (a later cell parses rank/artist/song/year from these tags).
strong_tag = soup.find_all('strong')
strong_tag

In [None]:
# Every <p> element on the page, in document order
p_tags = soup.find_all('p')

# Plain-text content of each of those paragraphs
p_texts = [paragraph.get_text() for paragraph in p_tags]

### Extracting the Reviews and Reviewers

In [None]:
p_texts = soup.find_all('p')

# Index of the first paragraph whose text starts with a digit.
# text[:1] is '' for an empty <p>, so empty paragraphs are skipped instead of
# raising IndexError the way text[0] would.
start_index = next((i for i, p_text in enumerate(p_texts) if p_text.text[:1].isdigit()), None)

# Drop everything before that paragraph (if none qualifies, start_index is
# None and the slice keeps the whole list)
p_texts = p_texts[start_index:]

# Remove last 12 elements
p_texts = p_texts[:-12]

# Paragraphs whose text starts with '0' or '1' are reduced to their first
# three characters (the rank); every other paragraph is kept as its HTML
# string, which is cleaned up in a later cell.
# NOTE(review): a review paragraph that itself starts with '0' or '1' would
# be truncated here too -- verify against the page.
p_texts = [
    p_tag.text[:3] if p_tag.text.startswith(('0', '1')) else str(p_tag)
    for p_tag in p_texts
]

# Add an all-digit sentinel at the end of the list so the loop below also
# flushes the final review
p_texts.append("999")

#Concatenate the reviews that have been split and create a review list

reviews00s = []
current_review = []

for item in p_texts:
    if item.isdigit():
        # A rank marker closes the review collected so far
        if current_review:
            reviews00s.append(' '.join(current_review))
            current_review = []  # Reset current review for the next one
    else:
        current_review.append(item.strip())

# Repeat the last review (guarded so an empty result does not raise IndexError)
if reviews00s:
    reviews00s.append(reviews00s[-1])

len(reviews00s) #We get 99 results because Outkast has one review for the first and second positions

# Extract the reviewers
#Reviewers have the following format:




In [None]:
# Pull the signature out of each review: sentence-ending punctuation, a
# space, "--", then a name starting with a capital letter. Reviews without
# such a signature are recorded as "Unknown".
reviewer_pattern = re.compile(r"[.?!] --([A-Z][\w\s,'‘’-]+)(?:[\"'“”<]|\!)")

reviewers00s = [
    found.group(1).strip() if found else "Unknown"
    for found in map(reviewer_pattern.search, reviews00s)
]

reviewers00s

In [None]:
# Parse rank, artist, song and year out of every <strong> tag.
song_info = soup.find_all('strong')
song_data = []
ranking = []
artists = []
songs = []
years = []


for info in song_info:
    # One list item per text fragment inside the strong tag
    text = info.get_text(separator='\n').split('\n')

    # The first fragment looks like "<rank>: <artist>"
    heading = text[0].split(':')
    # Remove leading zeros from the rank, keeping a single '0' if nothing is left
    rank = heading[0].strip().lstrip('0') or '0'
    artist = heading[1].strip()
    song = text[1].strip()

    # The year is the last fragment when the tag splits into 3 or 4 parts
    # (some songs are split in 4 parts); otherwise it is unknown.
    if len(text) in (3, 4):
        year = text[-1].strip().replace('[', '').replace(']', '')
    else:
        year = None

    # Extract the year digit. `year` may be None, and `';' in None` raises
    # TypeError, so that case is handled first.
    if year is None:
        year_digit = None
    elif ';' in year or ':' in year:
        # Keep only what follows the last ';' or ':'
        year_digit = re.split(r'[;:]', year)[-1].strip()
    else:
        year_digit = year

    ranking.append(rank)
    song_data.append(text)
    artists.append(artist)
    songs.append(song)
    years.append(year_digit)

    
    


# #Create the DF



# # # Add the 'Ranking' column to the DataFrame
# df00s['Ranking'] = ranking
# 

# # Display the DataFrame
# df00s.head(5)

In [None]:
# Assemble the 2000-04 DataFrame in one step: the per-song lists become
# columns, and the two constant strings are broadcast to every row.
df00s = pd.DataFrame({
    'Artist': artists,
    'Song': songs,
    'Year': years,
    'Review': reviews00s,
    'Reviewer': reviewers00s,
    'Ranking': ranking,
    'Source': 'Pitchfork',
    'Source Title': 'The Top 100 Singles of 2000-04',
})

In [None]:
# Some reviewers could not be extracted automatically; fill them in by hand.
unknown_mask = df00s['Reviewer'] == 'Unknown'

# Hand-collected replacement reviewers, in the same order as the "Unknown"
# rows appear in the DataFrame
replacement_reviewers = ['Joe Tangari', 'David Raposa', 'Stephen M. Deusner', 'Stephen M. Deusner', 'Michael Idov', 'Stephen M. Deusner', 'Julianne Shepherd', 'Dominique Leone', 'Stephen M. Deusner', 'Drew Daniel']

# The list is positional, so it is only valid when it has exactly one name
# per "Unknown" row.
if int(unknown_mask.sum()) != len(replacement_reviewers):
    raise ValueError(
        f"Expected {len(replacement_reviewers)} 'Unknown' reviewers, "
        f"found {int(unknown_mask.sum())}"
    )

# Assign one replacement per "Unknown" row. Looping with
# `df00s.loc[mask, 'Reviewer'] = reviewer` would overwrite every unknown row
# with the first name on the first iteration and never use the rest.
df00s.loc[unknown_mask, 'Reviewer'] = replacement_reviewers

# Verify the changes
df00s

In [None]:
#Clean the review column and the song one

# Define a function to clean the review text
def clean_review(text):
    """Return *text* with HTML tags removed and HTML entities decoded.

    The reviews were stored as raw paragraph HTML, so besides tags such as
    <p>...</p> they still contain escaped entities (e.g. ``&amp;``).

    Args:
        text: HTML fragment as a string.

    Returns:
        The plain-text content of the fragment.
    """
    # [^>]* (unlike .*?) also crosses line breaks inside a tag.
    without_tags = re.sub(r'<[^>]*>', '', text)
    # Decode entities only after the tags are gone, so an escaped "&lt;b&gt;"
    # is kept as literal text rather than being stripped as a tag.
    return html.unescape(without_tags)

# Apply the clean_review function to every value of the review column
# (the reviews were stored as paragraph HTML strings when they were collected)
df00s['Review'] = df00s['Review'].apply(clean_review)

# Define a function to remove the reviewer's name from the review text
def remove_reviewer(text):
    """Strip a trailing ``--Name`` signature from *text*.

    The signature is a "--" followed by hyphen-free text running to the end
    of the string. Text without such a signature is returned unchanged.
    """
    signature = re.search(r'--[^-]*$', text)
    if not signature:
        return text
    # Keep only what precedes the signature, without surrounding whitespace
    return text[:signature.start()].strip()

# Apply the remove_reviewer function to every value of the review column
df00s['Review'] = df00s['Review'].apply(remove_reviewer)

# Clean the song title: drop its first and last character

# (presumably the quotation marks around the title -- verify against the page)
df00s['Song'] = df00s['Song'].str[1:-1]


In [None]:
# df00s.to_csv('df00s.csv', index=False)


# 3. Pitchfork: Top 50 Singles of 2003

In [None]:
# Check and import pandas if not already imported
if 'pd' not in globals():
    import pandas as pd

# Check and import numpy if not already imported
if 'np' not in globals():
    import numpy as np

# Check and import re if not already imported
if 're' not in globals():
    import re

# Check and import requests if not already imported
if 'requests' not in globals():
    import requests

# Check and import BeautifulSoup from bs4 if not already imported
if 'BeautifulSoup' not in globals():
    from bs4 import BeautifulSoup

In [None]:
# Download the 2003 list and parse it into a BeautifulSoup tree (`soup`).
url03 = "https://pitchfork.com/features/lists-and-guides/5924-top-50-singles-of-2003/"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url03, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

In [None]:
all_p = soup.find_all('p')

# Stripped text of every <p> tag, in page order
reviews = [tag.text.strip() for tag in all_p]

# Print the extracted paragraphs
for extracted in reviews:
    print(extracted)

# Discard the final 12 paragraphs, then show what remains
reviews = reviews[:-12]
reviews

In [None]:
# Group the paragraphs into one sublist per entry: a paragraph starting with
# a digit opens a new sublist, the paragraphs after it are added to it.
sublists = []
current_sublist = []

for review in reviews:
    # Empty paragraphs carry no text; skipping them also avoids the
    # IndexError that review[0] raises on ''.
    if not review:
        continue
    if review[0].isdigit():
        # Close the group collected so far and start a new one
        if current_sublist:
            sublists.append(current_sublist)
        current_sublist = [review]
    else:
        current_sublist.append(review)

# Append the last sublist
if current_sublist:
    sublists.append(current_sublist)

# Drop the first group
# (presumably text that precedes the first real entry -- verify)
sublists = sublists[1:]

# Print the sublists
for sublist in sublists:
    print(sublist)

print(len(sublists))

In [None]:
# Split every group into its heading (first element) and its review (all
# remaining elements joined without a separator). Groups that consist of a
# single element are set aside in concatenated_sublists instead.
artist_songs2003 = []
review2003 = []
concatenated_sublists = []

for group in sublists:
    if len(group) < 2:
        concatenated_sublists.append(group)
        continue
    artist_songs2003.append(group[0])
    review2003.append(''.join(group[1:]))

# Show the single-element groups that were set aside
for leftover in concatenated_sublists:
    print(leftover)

type(concatenated_sublists)

## Extracting Artist and Songs

In [None]:
# Cut every heading at its double quotes
split_songs = [entry.split('"') for entry in artist_songs2003]

# Text before the first quote / text between the first and second quote
first_parts = [pieces[0] for pieces in split_songs]
second_parts = [pieces[1] for pieces in split_songs]

# Both lists should have the same length
for part_list in (first_parts, second_parts):
    print(len(part_list))

In [None]:
artist_cleaned2003 = []

for heading in first_parts:
    # Position of the first pair of adjacent digits
    start = next(pos for pos, ch in enumerate(heading) if ch.isdigit() and heading[pos + 1].isdigit())
    # From there on, take what follows the first ": " separator, trimmed,
    # and drop any trailing " character
    artist_name = heading[start:].split(": ")[1].strip()
    artist_cleaned2003.append(artist_name.rstrip('"'))

# Displaying the cleaned artist names
print("Cleaned artist names:")
print(artist_cleaned2003)

In [None]:
# The song titles are the quoted parts extracted above.
songs_cleaned2003 = second_parts
# Report the size of the list just assigned (the original referenced an
# undefined name, `songs_cleaned`, and raised NameError).
print(len(songs_cleaned2003))

## Extracting Reviewers

In [None]:
#Extract the reviewer:

# Initialize the lists for reviewer names and for reviews without signature
reviewer_full_names2003 = []
cleaned_reviews2003 = []

# Matches only the LAST "-" or "–" of a string (no further dash after it)
last_dash = re.compile(r'[-–](?=[^-–]*$)')

# Iterate through each review in the list
for name in review2003:
    parts = last_dash.split(name)

    if len(parts) == 2:
        # Text before the last dash is the review, text after it the reviewer
        cleaned_reviews2003.append(parts[0].strip())
        reviewer_full_names2003.append(parts[1].strip())
    else:
        # No dash at all: keep the row anyway so both lists stay aligned with
        # the artist/song lists, and mark the reviewer as "Unknown" (the
        # marker the 2000-04 section uses).
        cleaned_reviews2003.append(name.strip())
        reviewer_full_names2003.append("Unknown")

# Print the extracted reviewer full names
for full_name in reviewer_full_names2003:
    print("Reviewer's Full Name:", full_name)
    print()

# Print the cleaned reviews
print("Cleaned Reviews:")
for review in cleaned_reviews2003:
    print(review)

print(len(reviewer_full_names2003))
print(len(cleaned_reviews2003))

### Clean the reviewers

In [None]:
# Keep only the leading part of each signature: everything before the first
# "*", digit, "/" or " By". Signatures containing none of these are kept whole.
pattern = re.compile(r"^(.*?)(?=\*|\d|\/| By)")

reviewers_2003_final = []
for raw_name in reviewer_full_names2003:
    hit = pattern.search(raw_name)
    reviewers_2003_final.append(hit.group(1).strip() if hit else raw_name)

print(len(reviewers_2003_final))

# Print the cleaned strings
for cleaned_string in reviewers_2003_final:
    print(cleaned_string)

## Create Additional Columns

In [None]:
# Constant columns for the 2003 list (50 entries, ranked 50 down to 1)

rank_50 = [i for i in range(50, 0, -1)]

year_2003 = [2003] * 50

source_2003 = ['Pitchfork'] * 50

source_title_2003 = ['Top 50 Singles of 2003'] * 50

columns_list2003 = [artist_cleaned2003, songs_cleaned2003, year_2003, cleaned_reviews2003, reviewers_2003_final, rank_50, source_2003, source_title_2003]
# Sanity check: every column must have the same length. The loop variable is
# deliberately not called `list`, which would shadow the builtin for the rest
# of the notebook session.
for column_values in columns_list2003:
    print(len(column_values))

columns_name = ['Artist', 'Song', 'Year', 'Review', 'Reviewer','Ranking','Source','Source Title']

## Create DF 2003

In [None]:
# Pair each column name with its list of values
data_dict = {name: values for name, values in zip(columns_name, columns_list2003)}

# Build the 2003 DataFrame from that mapping
df2003 = pd.DataFrame(data_dict)

#df2003.to_csv('df2003.csv', index=False)

# 4. Pitchfork: Top 50 Singles of 2004

In [None]:
# Check and import pandas if not already imported
if 'pd' not in globals():
    import pandas as pd

# Check and import numpy if not already imported
if 'np' not in globals():
    import numpy as np

# Check and import re if not already imported
if 're' not in globals():
    import re

# Check and import requests if not already imported
if 'requests' not in globals():
    import requests

# Check and import BeautifulSoup from bs4 if not already imported
if 'BeautifulSoup' not in globals():
    from bs4 import BeautifulSoup

In [None]:
# Download the 2004 list and parse it into a BeautifulSoup tree (`soup`).
url04 = "https://pitchfork.com/features/lists-and-guides/5933-top-50-singles-of-2004/"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url04, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

In [None]:
all_p = soup.find_all('p')

# A leading run of two or more underscores / non-alphanumeric characters
leading_junk = re.compile(r'^[_\W]{2,}')

# Stripped text of every <p> tag with that leading run removed; paragraphs
# that do not start with such a run are kept as they are
cleaned_reviews = [leading_junk.sub('', tag.text.strip()) for tag in all_p]

# Display the cleaned reviews
for review in cleaned_reviews:
    print(review)

In [None]:
# Group the paragraphs into one sublist per entry: a paragraph starting with
# a digit opens a new sublist, the paragraphs after it are added to it.
sublists = []
current_sublist = []

for review in cleaned_reviews:
    # The cleaning step can leave empty strings behind (e.g. a paragraph made
    # only of underscores); skip them, since review[0] raises IndexError on ''.
    if not review:
        continue
    if review[0].isdigit():
        # Close the group collected so far and start a new one
        if current_sublist:
            sublists.append(current_sublist)
        current_sublist = [review]
    else:
        current_sublist.append(review)

# Append the last sublist
if current_sublist:
    sublists.append(current_sublist)

# Delete the first and last group
# (presumably page text around the actual list -- verify)
sublists = sublists[1:-1]

# Print the sublists
for sublist in sublists:
    print(sublist)

In [None]:
## Reduce every group to a [heading, review] pair: the first element is
## kept, all remaining elements are joined without a separator

resulting_sublists = [[group[0], "".join(group[1:])] for group in sublists]

resulting_sublists

## Extract artist_song and reviews

In [None]:
# Unzip the [heading, review] pairs into two parallel lists
artist_song2004 = [pair[0] for pair in resulting_sublists]
reviews2004 = [pair[1] for pair in resulting_sublists]

# Display the extracted lists and their sizes
print("Artist/Song 2004:")
print(artist_song2004)
print("\nReviews 2004:")
print(reviews2004)
for extracted in (artist_song2004, reviews2004):
    print(len(extracted))


### Extract Artist and Song

In [None]:
# Display the heading strings for manual inspection before they are split
# into artist and song
artist_song2004

In [None]:
#There is one case where the song does not have " (Belle and Sebastian)
artists2004 = []
songs2004 = []
# Zero-width position between a lowercase and an uppercase letter
pattern = re.compile(r'(?<=[a-z])(?=[A-Z])')

# Iterate through each heading in the list
for song in artist_song2004:
    # Start from a neutral fallback. Without this, a heading that cannot be
    # split would silently reuse the artist/title of the previous heading
    # (or raise NameError if it is the first one).
    artist = song
    song_title = None

    if '"' not in song:
        # No quotes: split where a lowercase letter is directly followed by
        # an uppercase one
        parts = re.split(pattern, song)
        if len(parts) >= 2:
            artist = parts[0]
            song_title = parts[1]
    else:
        # Quoted title: text before the first " is the artist part, text
        # between the first two " is the title
        parts = song.split('"')
        if len(parts) >= 2:
            artist = parts[0].strip()
            song_title = parts[1].strip()

    # Append the artist and song title to their respective lists
    artists2004.append(artist)
    songs2004.append(song_title)

# Display the extracted lists
print("Artists:")
print(artists2004)
print("\nSongs:")
print(songs2004)
print(len(artists2004))
print(len(songs2004))

### Extract Cleaned Artists

In [None]:
# Drop the first four characters of every artist string and any trailing
# underscores (presumably the four characters are a rank prefix -- verify)
artist_cleaned2004_ = [raw_artist[4:].rstrip('_') for raw_artist in artists2004]

for cleaned_name in artist_cleaned2004_:
    print(cleaned_name)

print(len(artist_cleaned2004_))
# Displaying the cleaned artist names
#print("Cleaned artist names:")
#print(artist_cleaned2004)

## Extract Reviews and Reviewers

In [None]:
# Initialize lists to store reviewer names and cleaned reviews
reviewer_full_names_2004 = []
cleaned_reviews_2004 = []

# Matches only the LAST "-" or "–" of a string (no further dash after it)
last_dash = re.compile(r'[-–](?=[^-–]*$)')

# Iterate through each review in the reviews2004 list
for name in reviews2004:
    parts = last_dash.split(name)

    if len(parts) == 2:
        # Text after the last dash is the reviewer, text before it the review
        reviewer_full_names_2004.append(parts[1].strip())
        cleaned_reviews_2004.append(parts[0].strip())
    else:
        # No dash at all: keep the row anyway so both lists stay aligned with
        # the artist/song lists, and mark the reviewer as "Unknown" (the
        # marker the 2000-04 section uses).
        reviewer_full_names_2004.append("Unknown")
        cleaned_reviews_2004.append(name.strip())

# Print the extracted reviewer full names
for full_name in reviewer_full_names_2004:
    print("Reviewer's Full Name:", full_name)
    print()

# Print the cleaned reviews
print("Cleaned Reviews:")
for review in cleaned_reviews_2004:
    print(review)

print(len(reviewer_full_names_2004))
print(len(cleaned_reviews_2004))

#Clean the reviewer_full_names_2004

In [None]:
# Initialize a list to store the cleaned strings
reviewers_2004_final = []

# A lowercase letter directly followed by an uppercase one marks the point
# where other text was glued onto the reviewer's name
pattern = re.compile(r'([a-z])([A-Z])')

# Iterate over each string in the list
for string in reviewer_full_names_2004:
    match = pattern.search(string)
    if match:
        # match.start() is the index of the lowercase letter, which still
        # belongs to the name, so cut one character later. Cutting at
        # match.start() would drop the last letter of the name.
        # NOTE(review): names with an inner capital (e.g. "LeMay") are still
        # cut at that capital -- verify the output.
        cleaned_string = string[:match.start() + 1]
    else:
        # If the pattern is not found, keep the original string
        cleaned_string = string
    reviewers_2004_final.append(cleaned_string)

# Print the cleaned strings
for cleaned_string in reviewers_2004_final:
    print(cleaned_string)

## Create Additional Columns

In [None]:
# Constant columns for the 2004 list (50 entries, ranked 50 down to 1)

rank_50 = [i for i in range(50, 0, -1)]

year_2004 = [2004] * 50

source_2004 = ['Pitchfork'] * 50

source_title_2004 = ['Top 50 Singles of 2004'] * 50

# The Review column uses cleaned_reviews_2004 (signature removed), matching
# the 2003 and 2005 sections, rather than the raw reviews2004.
columns_list2004 = [artist_cleaned2004_, songs2004, year_2004, cleaned_reviews_2004, reviewers_2004_final, rank_50, source_2004, source_title_2004]
# Sanity check: every column must have the same length. This iterates over
# columns_list2004 (the original referenced an undefined `columns_list`) and
# the loop variable is not called `list`, which would shadow the builtin.
for column_values in columns_list2004:
    print(len(column_values))

columns_name = ['Artist', 'Song', 'Year', 'Review', 'Reviewer','Ranking','Source','Source Title']

## Create DF 2004

In [None]:
# Column values and names for the 2004 DataFrame. The Review column uses
# cleaned_reviews_2004 (signature removed), matching the 2003 and 2005 sections.
columns_list2004 = [artist_cleaned2004_, songs2004, year_2004, cleaned_reviews_2004, reviewers_2004_final, rank_50, source_2004, source_title_2004]
columns_name = ['Artist', 'Song', 'Year', 'Review', 'Reviewer','Ranking','Source','Source Title']

# Create a dictionary from the lists (the original zipped an undefined
# `columns_list`, which raises NameError)
data_dict = dict(zip(columns_name, columns_list2004))

# Create a DataFrame from the dictionary
df2004 = pd.DataFrame(data_dict)

#df2004.to_csv('df2004.csv', index=False)

# 5. Pitchfork: Top 50 Singles of 2005

In [None]:
# Check and import pandas if not already imported
if 'pd' not in globals():
    import pandas as pd

# Check and import numpy if not already imported
if 'np' not in globals():
    import numpy as np

# Check and import re if not already imported
if 're' not in globals():
    import re

# Check and import requests if not already imported
if 'requests' not in globals():
    import requests

# Check and import BeautifulSoup from bs4 if not already imported
if 'BeautifulSoup' not in globals():
    from bs4 import BeautifulSoup

In [None]:
# Download the 2005 list and parse it into a BeautifulSoup tree (`soup`).
url05 = "https://pitchfork.com/features/lists-and-guides/6221-top-50-singles-of-2005/"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url05, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

In [None]:
all_p = soup.find_all('p')

# A leading run of two or more underscores / non-alphanumeric characters
leading_junk = re.compile(r'^[_\W]{2,}')

# Stripped text of every <p> tag with that leading run removed; paragraphs
# that do not start with such a run are kept as they are
cleaned_reviews = [leading_junk.sub('', tag.text.strip()) for tag in all_p]

# Display the cleaned reviews
for review in cleaned_reviews:
    print(review)

In [None]:
# Group the paragraphs into one sublist per entry: a paragraph starting with
# a digit opens a new sublist, the paragraphs after it are added to it.
sublists = []
current_sublist = []

for review in cleaned_reviews:
    # The cleaning step can leave empty strings behind (e.g. a paragraph made
    # only of underscores); skip them, since review[0] raises IndexError on ''.
    if not review:
        continue
    if review[0].isdigit():
        # Close the group collected so far and start a new one
        if current_sublist:
            sublists.append(current_sublist)
        current_sublist = [review]
    else:
        current_sublist.append(review)

# Append the last sublist
if current_sublist:
    sublists.append(current_sublist)

# Delete the first and last group
# (presumably page text around the actual list -- verify)
sublists = sublists[1:-1]

# Print the sublists
for sublist in sublists:
    print(sublist)

In [None]:
# Number of groups left after trimming the first and last one
print(len(sublists))

In [None]:
# Reduce every group to a [heading, review] pair: the first element is kept,
# all remaining elements are joined with a single space
resulting_sublists = [[group[0], " ".join(group[1:])] for group in sublists]

print(len(resulting_sublists))

In [None]:
# Display the [heading, review] pairs for manual inspection
resulting_sublists

In [None]:
# Unzip the [heading, review] pairs into two parallel lists
artist_song2005 = [pair[0] for pair in resulting_sublists]
reviews2005 = [pair[1] for pair in resulting_sublists]

# Display the extracted lists. The labels say 2005; the original ones were
# copied from the 2004 section and still read "2004".
print("Artist/Song 2005:")
print(artist_song2005)
print("\nReviews 2005:")
print(reviews2005)
print(len(artist_song2005))
print(len(reviews2005))

## Separate Artist and Song

In [None]:
# Split every heading into an artist part and a song title.
artists2005 = []
songs2005 = []
# Zero-width position between a lowercase and an uppercase letter
pattern = re.compile(r'(?<=[a-z])(?=[A-Z])')

# Iterate through each heading in the list
for song in artist_song2005:
    # Start from a neutral fallback. Without this, a heading that cannot be
    # split would silently reuse the artist/title of the previous heading
    # (or raise NameError if it is the first one).
    artist = song
    song_title = None

    if '"' not in song:
        # No quotes: split where a lowercase letter is directly followed by
        # an uppercase one
        parts = re.split(pattern, song)
        if len(parts) >= 2:
            artist = parts[0]
            song_title = parts[1]
    else:
        # Quoted title: text before the first " is the artist part, text
        # between the first two " is the title
        parts = song.split('"')
        if len(parts) >= 2:
            artist = parts[0].strip()
            song_title = parts[1].strip()

    # Append the artist and song title to their respective lists
    artists2005.append(artist)
    songs2005.append(song_title)

# Display the extracted lists
print("Artists:")
print(artists2005)
print("\nSongs:")
print(songs2005)
print(len(artists2005))
print(len(songs2005))

## Extract Artist

In [None]:
# Drop the first four characters of every artist string and any trailing
# underscores (presumably the four characters are a rank prefix -- verify)
artist_cleaned2005 = [raw_artist[4:].rstrip('_') for raw_artist in artists2005]

for cleaned_name in artist_cleaned2005:
    print(cleaned_name)

print(len(artist_cleaned2005))
# Displaying the cleaned artist names
#print("Cleaned artist names:")
#print(artist_cleaned2004)

## Extracting Reviewers and Reviews

In [None]:
# Initialize lists to store reviewer names and cleaned reviews
reviewer_full_names_2005 = []
cleaned_reviews_2005 = []

# Matches only the LAST "-" or "–" of a string (no further dash after it)
last_dash = re.compile(r'[-–](?=[^-–]*$)')

# Iterate through each review in the reviews2005 list
for name in reviews2005:
    parts = last_dash.split(name)

    if len(parts) == 2:
        # Text after the last dash is the reviewer, text before it the review
        reviewer_full_names_2005.append(parts[1].strip())
        cleaned_reviews_2005.append(parts[0].strip())
    else:
        # No dash at all: keep the row anyway so both lists stay aligned with
        # the artist/song lists, and mark the reviewer as "Unknown" (the
        # marker the 2000-04 section uses).
        reviewer_full_names_2005.append("Unknown")
        cleaned_reviews_2005.append(name.strip())

# Print the extracted reviewer full names
for full_name in reviewer_full_names_2005:
    print("Reviewer's Full Name:", full_name)
    print()

# Print the cleaned reviews
print("Cleaned Reviews:")
for review in cleaned_reviews_2005:
    print(review)

print(len(reviewer_full_names_2005))
print(len(cleaned_reviews_2005))

#Clean the reviewer_full_names_2005

### Clean the reviewers

In [None]:
# Keep only what precedes the first " By" in each signature; signatures
# without " By" are kept whole.
pattern = re.compile(r"^(.+?) By")

reviewers_2005_final = []
for raw_name in reviewer_full_names_2005:
    hit = pattern.search(raw_name)
    reviewers_2005_final.append(hit.group(1) if hit else raw_name)

print(len(reviewers_2005_final))

# Print the cleaned strings
for cleaned_string in reviewers_2005_final:
    print(cleaned_string)



## Create Additional Columns

In [None]:
# Constant columns for the 2005 list (50 entries, ranked 50 down to 1)

rank_50 = [i for i in range(50, 0, -1)]

year_2005 = [2005] * 50

source_2005 = ['Pitchfork'] * 50

source_title_2005 = ['Top 50 Singles of 2005'] * 50

columns_list2005 = [artist_cleaned2005, songs2005, year_2005, cleaned_reviews_2005, reviewers_2005_final, rank_50, source_2005, source_title_2005]
# Sanity check: every column must have the same length. The loop variable is
# deliberately not called `list`, which would shadow the builtin for the rest
# of the notebook session.
for column_values in columns_list2005:
    print(len(column_values))

columns_name = ['Artist', 'Song', 'Year', 'Review', 'Reviewer','Ranking','Source','Source Title']

## Create DF 2005

In [None]:
# Pair each column name with its list of values
data_dict = {name: values for name, values in zip(columns_name, columns_list2005)}

# Build the 2005 DataFrame from that mapping
df2005 = pd.DataFrame(data_dict)

#df2005.to_csv('df2005.csv', index=False)

# 6 Pitchfork: The Top 100 Tracks of 2006

In [None]:
# Check and import pandas if not already imported
if 'pd' not in globals():
    import pandas as pd

# Check and import numpy if not already imported
if 'np' not in globals():
    import numpy as np

# Check and import re if not already imported
if 're' not in globals():
    import re

# Check and import requests if not already imported
if 'requests' not in globals():
    import requests

# Check and import BeautifulSoup from bs4 if not already imported
if 'BeautifulSoup' not in globals():
    from bs4 import BeautifulSoup

In [None]:
# Download the 2006 list page and parse it with BeautifulSoup.
url06 = "https://pitchfork.com/features/lists-and-guides/6508-the-top-100-tracks-of-2006/"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url06, timeout=30)
# Fail loudly on an HTTP error (403/404/5xx) instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

In [None]:
# Grab every <p> tag on the page.
all_p = soup.find_all('p')

# Take each paragraph's stripped text and remove a leading run of two or more
# non-alphanumeric characters (underscores, punctuation, whitespace).
# re.sub leaves the text untouched when the pattern does not match.
cleaned_reviews = [re.sub(r'^[_\W]{2,}', '', p_tag.text.strip()) for p_tag in all_p]

# Show the cleaned paragraphs for visual inspection.
for review in cleaned_reviews:
    print(review)

In [None]:
# The scraped paragraphs include some entries that start with "MP3";
# keep everything else.
filtered_tags = [text for text in cleaned_reviews if not text.startswith("MP3")]

# Show what is left after filtering.
for tag in filtered_tags:
    print(tag)

In [None]:
# Group the paragraphs into one sublist per ranked entry: a paragraph that
# starts with a digit opens a new group, any other paragraph is appended to
# the group currently being built.
sublists = []
current_sublist = []

for review in filtered_tags:
    # review[:1] is "" for an empty paragraph, so an empty <p> is treated as
    # continuation text instead of raising IndexError as review[0] would.
    if review[:1].isdigit():
        # Close the previous group (if any) before starting a new one.
        if current_sublist:
            sublists.append(current_sublist)
        current_sublist = [review]
    else:
        current_sublist.append(review)

# Flush the group that was still open when the loop ended.
if current_sublist:
    sublists.append(current_sublist)

# Drop the first and the last group.
# NOTE(review): presumably these hold page text outside the ranked entries - confirm.
sublists = sublists[1:-1]

# Print the groups for visual inspection.
for sublist in sublists:
    print(sublist)

len(sublists)

In [None]:
# Display the group at index 33 for manual inspection.
sublists[33]

In [None]:
# There is one group too many: fold the group at index 34 back into the
# group at index 33 so they form a single entry again.
# NOTE: not idempotent - re-running this cell merges the next pair as well.
combined_sublist = sublists[33] + sublists[34]
sublists[33:35] = [combined_sublist]

# Print the groups after the merge.
for sublist in sublists:
    print(sublist)



In [None]:
# Reduce every group to a two-item list [first paragraph, rest], where the
# rest is all remaining paragraphs concatenated without a separator.
resulting_sublists = [[group[0], "".join(group[1:])] for group in sublists]

resulting_sublists

## Extract artist_song and Reviews

In [None]:
# Split the [artist/song, review] pairs into two parallel lists.
artist_song2006 = [pair[0] for pair in resulting_sublists]
reviews2006 = [pair[1] for pair in resulting_sublists]

# Show both lists and their lengths.
print("Artist/Song 2006:")
print(artist_song2006)
print("\nReviews 2006:")
print(reviews2006)
print(len(artist_song2006))
print(len(reviews2006))

## Separate Artist and Song 

In [None]:
# Split each "artist/song" string into an artist part and a song title.
# Strings containing a double quote are split on the quotes; strings without
# one (the original note cites Belle and Sebastian - possibly carried over
# from another year's list) are split where a lowercase letter is directly
# followed by an uppercase letter.
artists2006 = []
songs2006 = []
pattern = re.compile(r'(?<=[a-z])(?=[A-Z])')

for song in artist_song2006:
    if '"' in song:
        # Text before the first quote is the artist; text between the first
        # two quotes is the title. A '"' guarantees at least two parts.
        parts = song.split('"')
        artist = parts[0].strip()
        song_title = parts[1].strip()
    else:
        parts = pattern.split(song)
        if len(parts) >= 2:
            artist = parts[0]
            song_title = parts[1]
        else:
            # Nothing to split on: keep the whole string as the artist and
            # leave the title empty, instead of silently reusing the previous
            # entry's values (or raising NameError on the first entry).
            artist = song
            song_title = ''

    # Exactly one artist and one title per entry keeps the lists aligned.
    artists2006.append(artist)
    songs2006.append(song_title)

# Display the extracted lists
print("Artists:")
print(artists2006)
print("\nSongs:")
print(songs2006)
print(len(artists2006))
print(len(songs2006))

## Clean Artists

In [None]:
# Clean the 2006 artist strings: drop the first four characters, then any
# trailing underscores.
# NOTE(review): assumes the leading rank prefix is always four characters
# wide - confirm for one- and two-digit ranks.
artist_cleaned2006 = [raw_artist[4:].rstrip('_') for raw_artist in artists2006]

# Show the cleaned names, then how many there are.
for cleaned_artist in artist_cleaned2006:
    print(cleaned_artist)

print(len(artist_cleaned2006))
# Displaying the cleaned artist names
#print("Cleaned artist names:")
#print(artist_cleaned2004)

## Extract Review and Reviewers

In [None]:
# Split every 2006 review into its body and the reviewer credit that follows
# the last "-" or "–" in the text.
reviewer_full_names_2006 = []
cleaned_reviews_2006 = []

for review_text in reviews2006:
    # The lookahead only allows a dash with no further dash after it, so the
    # split yields at most two parts.
    parts = re.split(r'[-–](?=[^-–]*$)', review_text)

    if len(parts) == 2:
        body, reviewer = parts
    else:
        # No dash found: keep the whole text as the review with an empty
        # reviewer. Skipping the entry would shift every later review and
        # reviewer against the artist/song/ranking columns.
        body, reviewer = review_text, ''

    cleaned_reviews_2006.append(body.strip())
    reviewer_full_names_2006.append(reviewer.strip())

# Print the extracted reviewer full names
for full_name in reviewer_full_names_2006:
    print("Reviewer's Full Name:", full_name)
    print()

# Print the cleaned reviews
print("Cleaned Reviews:")
for review in cleaned_reviews_2006:
    print(review)

print(len(reviewer_full_names_2006))
print(len(cleaned_reviews_2006))

#Clean the reviewer_full_names_2006

### Clean the reviewers

In [None]:
# Keep only the reviewer's name: the text before "By" (and any whitespace
# directly in front of it). Strings without "By" are kept unchanged.
pattern = re.compile(r"^(.+?)(?=\s*By)")

reviewers_2006_final = []
for raw_name in reviewer_full_names_2006:
    found = pattern.search(raw_name)
    reviewers_2006_final.append(found.group(1) if found else raw_name)

# Report how many names were produced.
print(len(reviewers_2006_final))

# Show the cleaned names for visual inspection.
for reviewer in reviewers_2006_final:
    print(reviewer)

## Create Additional Column

In [None]:
# Build the constant columns for the 2006 list and gather all columns in the
# same order as columns_name.

# Rankings count down from 100 to 1.
rank_100 = [i for i in range(100, 0, -1)]

year_2006 = [2006] * 100

source_2006 = ['Pitchfork'] * 100

source_title_2006 = ['The Top 100 Tracks of 2006'] * 100

columns_list2006 = [artist_cleaned2006, songs2006, year_2006, cleaned_reviews_2006, reviewers_2006_final, rank_100, source_2006, source_title_2006]

# Print every column's length so mismatches are visible before the DataFrame
# is built. The loop variable must not be called `list`: that would rebind
# the builtin for the rest of the notebook session.
for column in columns_list2006:
    print(len(column))

columns_name = ['Artist', 'Song', 'Year', 'Review', 'Reviewer', 'Ranking', 'Source', 'Source Title']

## Create DF 2006

In [None]:
# Assemble the 2006 DataFrame.
# The Reviewer column uses reviewers_2006_final (the cleaned names), matching
# the other years; the raw reviewer_full_names_2006 strings may still carry
# the trailing "By ..." text that the cleaning step removes.
columns_list2006 = [artist_cleaned2006, songs2006, year_2006, cleaned_reviews_2006, reviewers_2006_final, rank_100, source_2006, source_title_2006]
columns_name = ['Artist', 'Song', 'Year', 'Review', 'Reviewer', 'Ranking', 'Source', 'Source Title']

# Pair each column name with its values.
data_dict = dict(zip(columns_name, columns_list2006))

# Build the DataFrame; all columns must have the same length.
df2006 = pd.DataFrame(data_dict)

#df2006.to_csv('df2006.csv', index=False)


# 7 Pitchfork: The 100 Best Tracks of 2007

In [None]:
# Check and import pandas if not already imported
if 'pd' not in globals():
    import pandas as pd

# Check and import numpy if not already imported
if 'np' not in globals():
    import numpy as np

# Check and import re if not already imported
if 're' not in globals():
    import re

# Check and import requests if not already imported
if 'requests' not in globals():
    import requests

# Check and import BeautifulSoup from bs4 if not already imported
if 'BeautifulSoup' not in globals():
    from bs4 import BeautifulSoup

In [None]:
# Download the 2007 list page and parse it with BeautifulSoup.
url07 = "https://pitchfork.com/features/lists-and-guides/6752-top-100-tracks-of-2007/"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url07, timeout=30)
# Fail loudly on an HTTP error (403/404/5xx) instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

In [None]:
# Grab every <p> tag on the page.
all_p = soup.find_all('p')

# Take each paragraph's stripped text and remove a leading run of two or more
# non-alphanumeric characters (underscores, punctuation, whitespace).
# re.sub leaves the text untouched when the pattern does not match.
cleaned_reviews = [re.sub(r'^[_\W]{2,}', '', p_tag.text.strip()) for p_tag in all_p]

# Show the cleaned paragraphs for visual inspection.
for review in cleaned_reviews:
    print(review)

In [None]:
# The scraped paragraphs include some entries that start with "MP3";
# keep everything else.
filtered_tags = [text for text in cleaned_reviews if not text.startswith("MP3")]

# Show what is left after filtering.
for tag in filtered_tags:
    print(tag)

In [None]:
# Group the paragraphs into one sublist per ranked entry: a paragraph that
# starts with a digit opens a new group, any other paragraph is appended to
# the group currently being built.
sublists = []
current_sublist = []

for review in filtered_tags:
    # review[:1] is "" for an empty paragraph, so an empty <p> is treated as
    # continuation text instead of raising IndexError as review[0] would.
    if review[:1].isdigit():
        # Close the previous group (if any) before starting a new one.
        if current_sublist:
            sublists.append(current_sublist)
        current_sublist = [review]
    else:
        current_sublist.append(review)

# Flush the group that was still open when the loop ended.
if current_sublist:
    sublists.append(current_sublist)

# Drop the first and the last group.
# NOTE(review): presumably these hold page text outside the ranked entries - confirm.
sublists = sublists[1:-1]

# Print the groups for visual inspection.
for sublist in sublists:
    print(sublist)

len(sublists)

In [None]:
# Reduce every group to a two-item list [first paragraph, rest], where the
# rest is all remaining paragraphs concatenated without a separator.
resulting_sublists = [[group[0], "".join(group[1:])] for group in sublists]

resulting_sublists

## Extract artist_song and Reviews

In [None]:
# Split the [artist/song, review] pairs into two parallel lists.
artist_song2007 = [pair[0] for pair in resulting_sublists]
reviews2007 = [pair[1] for pair in resulting_sublists]

# Show both lists and their lengths.
print("Artist/Song 2007:")
print(artist_song2007)
print("\nReviews 2007:")
print(reviews2007)
print(len(artist_song2007))
print(len(reviews2007))

## Separate Artist and Song 

In [None]:
# Split each "artist/song" string into an artist part and a song title.
# Strings containing a double quote are split on the quotes; strings without
# one (the original note cites Belle and Sebastian - possibly carried over
# from another year's list) are split where a lowercase letter is directly
# followed by an uppercase letter.
artists2007 = []
songs2007 = []
pattern = re.compile(r'(?<=[a-z])(?=[A-Z])')

for song in artist_song2007:
    if '"' in song:
        # Text before the first quote is the artist; text between the first
        # two quotes is the title. A '"' guarantees at least two parts.
        parts = song.split('"')
        artist = parts[0].strip()
        song_title = parts[1].strip()
    else:
        parts = pattern.split(song)
        if len(parts) >= 2:
            artist = parts[0]
            song_title = parts[1]
        else:
            # Nothing to split on: keep the whole string as the artist and
            # leave the title empty, instead of silently reusing the previous
            # entry's values (or raising NameError on the first entry).
            artist = song
            song_title = ''

    # Exactly one artist and one title per entry keeps the lists aligned.
    artists2007.append(artist)
    songs2007.append(song_title)

# Display the extracted lists
print("Artists:")
print(artists2007)
print("\nSongs:")
print(songs2007)
print(len(artists2007))
print(len(songs2007))

## Clean Artists

In [None]:
# Clean the 2007 artist strings: drop the first four characters, then any
# trailing underscores.
# NOTE(review): assumes the leading rank prefix is always four characters
# wide - confirm for one- and two-digit ranks.
artist_cleaned2007 = [raw_artist[4:].rstrip('_') for raw_artist in artists2007]

# Show the cleaned names, then how many there are.
for cleaned_artist in artist_cleaned2007:
    print(cleaned_artist)

print(len(artist_cleaned2007))
# Displaying the cleaned artist names
#print("Cleaned artist names:")
#print(artist_cleaned2004)

## Extract Review and Reviewers

In [None]:
# Split every 2007 review into its body and the reviewer credit that follows
# the last "-" or "–" in the text.
reviewer_full_names_2007 = []
cleaned_reviews_2007 = []

for review_text in reviews2007:
    # The lookahead only allows a dash with no further dash after it, so the
    # split yields at most two parts.
    parts = re.split(r'[-–](?=[^-–]*$)', review_text)

    if len(parts) == 2:
        body, reviewer = parts
    else:
        # No dash found: keep the whole text as the review with an empty
        # reviewer. Skipping the entry would shift every later review and
        # reviewer against the artist/song/ranking columns.
        body, reviewer = review_text, ''

    cleaned_reviews_2007.append(body.strip())
    reviewer_full_names_2007.append(reviewer.strip())

# Print the extracted reviewer full names
for full_name in reviewer_full_names_2007:
    print("Reviewer's Full Name:", full_name)
    print()

# Print the cleaned reviews
print("Cleaned Reviews:")
for review in cleaned_reviews_2007:
    print(review)

print(len(reviewer_full_names_2007))
print(len(cleaned_reviews_2007))

#Clean the reviewer_full_names_2007

### Clean the reviewers

In [None]:
# Keep only the reviewer's name: the text before "By" (and any whitespace
# directly in front of it). Strings without "By" are kept unchanged.
pattern = re.compile(r"^(.+?)(?=\s*By)")

reviewers_2007_final = []
for raw_name in reviewer_full_names_2007:
    found = pattern.search(raw_name)
    reviewers_2007_final.append(found.group(1) if found else raw_name)

# Report how many names were produced.
print(len(reviewers_2007_final))

# Show the cleaned names for visual inspection.
for reviewer in reviewers_2007_final:
    print(reviewer)

## Create Additional Column

In [None]:
# Build the constant columns for the 2007 list and gather all columns in the
# same order as columns_name.

# Rankings count down from 100 to 1.
rank_100 = [i for i in range(100, 0, -1)]

year_2007 = [2007] * 100

source_2007 = ['Pitchfork'] * 100

# NOTE(review): the section heading calls this list "The 100 Best Tracks of
# 2007" while this value says "Songs" - confirm which title is intended.
source_title_2007 = ['The 100 Best Songs of 2007'] * 100

columns_list2007 = [artist_cleaned2007, songs2007, year_2007, cleaned_reviews_2007, reviewers_2007_final, rank_100, source_2007, source_title_2007]

# Print every column's length so mismatches are visible before the DataFrame
# is built. The loop variable must not be called `list`: that would rebind
# the builtin for the rest of the notebook session.
for column in columns_list2007:
    print(len(column))

columns_name = ['Artist', 'Song', 'Year', 'Review', 'Reviewer', 'Ranking', 'Source', 'Source Title']

## Create DF 2007

In [None]:
# Pair each column name with its list of values, then build the 2007 DataFrame.
data_dict = {col_name: values for col_name, values in zip(columns_name, columns_list2007)}
df2007 = pd.DataFrame(data_dict)

#df2007.to_csv('df2007.csv', index=False)

# 8 Pitchfork: The 100 Best Tracks of 2008

In [None]:
# Check and import pandas if not already imported
if 'pd' not in globals():
    import pandas as pd

# Check and import numpy if not already imported
if 'np' not in globals():
    import numpy as np

# Check and import re if not already imported
if 're' not in globals():
    import re

# Check and import requests if not already imported
if 'requests' not in globals():
    import requests

# Check and import BeautifulSoup from bs4 if not already imported
if 'BeautifulSoup' not in globals():
    from bs4 import BeautifulSoup

In [None]:
# Download the 2008 list page and parse it with BeautifulSoup.
url08 = "https://pitchfork.com/features/lists-and-guides/7572-the-100-best-tracks-of-2008/"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url08, timeout=30)
# Fail loudly on an HTTP error (403/404/5xx) instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

In [None]:
# Grab every <p> tag on the page.
all_p = soup.find_all('p')

# Take each paragraph's stripped text and remove a leading run of two or more
# non-alphanumeric characters (underscores, punctuation, whitespace).
# re.sub leaves the text untouched when the pattern does not match.
cleaned_reviews = [re.sub(r'^[_\W]{2,}', '', p_tag.text.strip()) for p_tag in all_p]

# Discard the first 4 and the last 52 paragraphs.
# NOTE(review): presumably page text outside the ranked entries; the counts
# are tied to the page layout at scrape time - confirm if the page changes.
cleaned_reviews = cleaned_reviews[4:-52]

# Show the remaining paragraphs for visual inspection.
for review in cleaned_reviews:
    print(review)

In [None]:
# The scraped paragraphs include some entries that start with "MP3";
# keep everything else.
filtered_tags = [text for text in cleaned_reviews if not text.startswith("MP3")]

# Show what is left after filtering.
for tag in filtered_tags:
    print(tag)

In [None]:
# Group the paragraphs into one sublist per ranked entry: a paragraph that
# starts with a digit opens a new group, any other paragraph is appended to
# the group currently being built.
sublists = []
current_sublist = []

for review in filtered_tags:
    # review[:1] is "" for an empty paragraph, so an empty <p> is treated as
    # continuation text instead of raising IndexError as review[0] would.
    if review[:1].isdigit():
        # Close the previous group (if any) before starting a new one.
        if current_sublist:
            sublists.append(current_sublist)
        current_sublist = [review]
    else:
        current_sublist.append(review)

# Flush the group that was still open when the loop ended.
if current_sublist:
    sublists.append(current_sublist)

# Print the groups for visual inspection.
for sublist in sublists:
    print(sublist)

len(sublists)

In [None]:
# len(sublists) is 102 at this point, i.e. two groups too many.
# First fix: fold the group at index 40 back into the group at index 39.
# NOTE: not idempotent - re-running this cell merges the next pair as well.
combined_sublist = sublists[39] + sublists[40]
sublists[39:41] = [combined_sublist]
len(sublists)


In [None]:
# Second fix: fold the group at index 100 back into the group at index 99
# (indices as they stand after the previous merge).
combined_sublist2 = sublists[99] + sublists[100]
sublists[99:101] = [combined_sublist2]

# Print the groups after the merge.
for sublist in sublists:
    print(sublist)

In [None]:
# Reduce every group to a two-item list [first paragraph, rest], where the
# rest is all remaining paragraphs concatenated without a separator.
resulting_sublists = [[group[0], "".join(group[1:])] for group in sublists]

print(len(resulting_sublists))

## Extract artist_song and Reviews

In [None]:
# Split the [artist/song, review] pairs into two parallel lists.
artist_song2008 = [pair[0] for pair in resulting_sublists]
reviews2008 = [pair[1] for pair in resulting_sublists]

# Show both lists and their lengths.
print("Artist/Song 2008:")
print(artist_song2008)
print("\nReviews 2008:")
print(reviews2008)
print(len(artist_song2008))
print(len(reviews2008))

## Separate Artist and Song 

In [None]:
# Split each "artist/song" string into an artist part and a song title.
# Strings containing a double quote are split on the quotes; strings without
# one (the original note cites Belle and Sebastian - possibly carried over
# from another year's list) are split where a lowercase letter is directly
# followed by an uppercase letter.
artists2008 = []
songs2008 = []
pattern = re.compile(r'(?<=[a-z])(?=[A-Z])')

for song in artist_song2008:
    if '"' in song:
        # Text before the first quote is the artist; text between the first
        # two quotes is the title. A '"' guarantees at least two parts.
        parts = song.split('"')
        artist = parts[0].strip()
        song_title = parts[1].strip()
    else:
        parts = pattern.split(song)
        if len(parts) >= 2:
            artist = parts[0]
            song_title = parts[1]
        else:
            # Nothing to split on: keep the whole string as the artist and
            # leave the title empty, instead of silently reusing the previous
            # entry's values (or raising NameError on the first entry).
            artist = song
            song_title = ''

    # Exactly one artist and one title per entry keeps the lists aligned.
    artists2008.append(artist)
    songs2008.append(song_title)

# Display the extracted lists
print("Artists:")
print(artists2008)
print("\nSongs:")
print(songs2008)
print(len(artists2008))
print(len(songs2008))

## Clean Artists

In [None]:
# Clean the 2008 artist strings: drop the first four characters, then any
# trailing underscores.
# NOTE(review): assumes the leading rank prefix is always four characters
# wide - confirm for one- and two-digit ranks.
artist_cleaned2008 = [raw_artist[4:].rstrip('_') for raw_artist in artists2008]

# Show the cleaned names, then how many there are.
for cleaned_artist in artist_cleaned2008:
    print(cleaned_artist)

print(len(artist_cleaned2008))
# Displaying the cleaned artist names
#print("Cleaned artist names:")
#print(artist_cleaned2004)

## Extract Review and Reviewers

In [None]:
# Split every 2008 review into its body and the reviewer credit that follows
# the last "-" or "–" in the text.
reviewer_full_names_2008 = []
cleaned_reviews_2008 = []

for review_text in reviews2008:
    # The lookahead only allows a dash with no further dash after it, so the
    # split yields at most two parts.
    parts = re.split(r'[-–](?=[^-–]*$)', review_text)

    if len(parts) == 2:
        body, reviewer = parts
    else:
        # No dash found: keep the whole text as the review with an empty
        # reviewer. Skipping the entry would shift every later review and
        # reviewer against the artist/song/ranking columns.
        body, reviewer = review_text, ''

    cleaned_reviews_2008.append(body.strip())
    reviewer_full_names_2008.append(reviewer.strip())

# Print the extracted reviewer full names
for full_name in reviewer_full_names_2008:
    print("Reviewer's Full Name:", full_name)
    print()

# Print the cleaned reviews
print("Cleaned Reviews:")
for review in cleaned_reviews_2008:
    print(review)

print(len(reviewer_full_names_2008))
print(len(cleaned_reviews_2008))

### Clean the reviewers

In [None]:
# Keep only the reviewer's name: the text before " By" (allowing underscores
# or whitespace in front of it), or before a trailing run of underscores and
# whitespace. Strings matching neither case are kept unchanged.
pattern = re.compile(r"^(.+?)(?=[_\s]*\sBy|[_\s]+$)")

reviewers_2008_final = []
for raw_name in reviewer_full_names_2008:
    found = pattern.search(raw_name)
    reviewers_2008_final.append(found.group(1) if found else raw_name)

# Report how many names were produced.
print(len(reviewers_2008_final))

# Show the cleaned names for visual inspection.
for reviewer in reviewers_2008_final:
    print(reviewer)

## Create Additional Column

In [None]:
# Build the constant columns for the 2008 list and gather all columns in the
# same order as columns_name.

# Rankings count down from 100 to 1.
rank_100 = [i for i in range(100, 0, -1)]

year_2008 = [2008] * 100

source_2008 = ['Pitchfork'] * 100

source_title_2008 = ['The 100 Best Tracks of 2008'] * 100

columns_list2008 = [artist_cleaned2008, songs2008, year_2008, cleaned_reviews_2008, reviewers_2008_final, rank_100, source_2008, source_title_2008]

# Print every column's length so mismatches are visible before the DataFrame
# is built. The loop variable must not be called `list`: that would rebind
# the builtin for the rest of the notebook session.
for column in columns_list2008:
    print(len(column))

columns_name = ['Artist', 'Song', 'Year', 'Review', 'Reviewer', 'Ranking', 'Source', 'Source Title']

## Create DF 2008

In [None]:
# Pair each column name with its list of values, then build the 2008 DataFrame.
data_dict = {col_name: values for col_name, values in zip(columns_name, columns_list2008)}
df2008 = pd.DataFrame(data_dict)

#df2008.to_csv('df2008.csv', index=False)

# 9. Pitchfork: The Top 100 Tracks of 2009

In [None]:
# Check and import pandas if not already imported
if 'pd' not in globals():
    import pandas as pd

# Check and import numpy if not already imported
if 'np' not in globals():
    import numpy as np

# Check and import re if not already imported
if 're' not in globals():
    import re

# Check and import requests if not already imported
if 'requests' not in globals():
    import requests

# Check and import BeautifulSoup from bs4 if not already imported
if 'BeautifulSoup' not in globals():
    from bs4 import BeautifulSoup

In [None]:
# Download the 2009 list page and parse it with BeautifulSoup.
url09 = "https://pitchfork.com/features/lists-and-guides/7742-the-top-100-tracks-of-2009/"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url09, timeout=30)
# Fail loudly on an HTTP error (403/404/5xx) instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

In [None]:
# Grab every <p> tag on the page.
all_p = soup.find_all('p')

# Take each paragraph's stripped text and remove a leading run of two or more
# non-alphanumeric characters (underscores, punctuation, whitespace).
# re.sub leaves the text untouched when the pattern does not match.
cleaned_reviews = [re.sub(r'^[_\W]{2,}', '', p_tag.text.strip()) for p_tag in all_p]

# Discard the first 8 and the last 12 paragraphs.
# NOTE(review): presumably page text outside the ranked entries; the counts
# are tied to the page layout at scrape time - confirm if the page changes.
cleaned_reviews = cleaned_reviews[8:-12]

# Show the remaining paragraphs for visual inspection.
for review in cleaned_reviews:
    print(review)

In [None]:
# The scraped paragraphs include some entries that start with "MP3";
# keep everything else.
filtered_tags = [text for text in cleaned_reviews if not text.startswith("MP3")]

# Show what is left after filtering.
for tag in filtered_tags:
    print(tag)

In [None]:
# Group the paragraphs into one sublist per ranked entry: a paragraph that
# starts with a digit opens a new group, any other paragraph is appended to
# the group currently being built.
sublists = []
current_sublist = []

for review in filtered_tags:
    # review[:1] is "" for an empty paragraph, so an empty <p> is treated as
    # continuation text instead of raising IndexError as review[0] would.
    if review[:1].isdigit():
        # Close the previous group (if any) before starting a new one.
        if current_sublist:
            sublists.append(current_sublist)
        current_sublist = [review]
    else:
        current_sublist.append(review)

# Flush the group that was still open when the loop ended.
if current_sublist:
    sublists.append(current_sublist)

# Print the groups for visual inspection.
for sublist in sublists:
    print(sublist)

len(sublists)

In [None]:
# Reduce every group to a two-item list [first paragraph, rest], where the
# rest is all remaining paragraphs concatenated without a separator.
resulting_sublists = [[group[0], "".join(group[1:])] for group in sublists]

resulting_sublists

## Extract artist_song and Reviews

In [None]:
# Split the [artist/song, review] pairs into two parallel lists.
artist_song2009 = [pair[0] for pair in resulting_sublists]
reviews2009 = [pair[1] for pair in resulting_sublists]

# Show both lists and their lengths.
print("Artist/Song 2009:")
print(artist_song2009)
print("\nReviews 2009:")
print(reviews2009)
print(len(artist_song2009))
print(len(reviews2009))

## Separate Artist and Song 

In [None]:
# Split each "artist/song" string into an artist part and a song title.
# Strings containing a double quote are split on the quotes; strings without
# one (the original note cites Belle and Sebastian - possibly carried over
# from another year's list) are split where a lowercase letter is directly
# followed by an uppercase letter.
artists2009 = []
songs2009 = []
pattern = re.compile(r'(?<=[a-z])(?=[A-Z])')

for song in artist_song2009:
    if '"' in song:
        # Text before the first quote is the artist; text between the first
        # two quotes is the title. A '"' guarantees at least two parts.
        parts = song.split('"')
        artist = parts[0].strip()
        song_title = parts[1].strip()
    else:
        parts = pattern.split(song)
        if len(parts) >= 2:
            artist = parts[0]
            song_title = parts[1]
        else:
            # Nothing to split on: keep the whole string as the artist and
            # leave the title empty, instead of silently reusing the previous
            # entry's values (or raising NameError on the first entry).
            artist = song
            song_title = ''

    # Exactly one artist and one title per entry keeps the lists aligned.
    artists2009.append(artist)
    songs2009.append(song_title)

# Display the extracted lists
print("Artists:")
print(artists2009)
print("\nSongs:")
print(songs2009)
print(len(artists2009))
print(len(songs2009))

In [None]:
# Display the raw artist strings for manual inspection.
artists2009

## Clean Artists

In [None]:
# Clean the 2009 artist strings: drop the first three characters, then peel
# off surrounding whitespace, trailing underscores and leading periods.
# NOTE(review): assumes the first three characters belong to the rank
# prefix - confirm against the scraped strings.
artist_cleaned2009 = [
    raw_artist[3:].strip().rstrip('_').strip().lstrip('.').strip()
    for raw_artist in artists2009
]

# Show the cleaned names, then how many there are.
for cleaned_artist in artist_cleaned2009:
    print(cleaned_artist)

print(len(artist_cleaned2009))
# Displaying the cleaned artist names
#print("Cleaned artist names:")
#print(artist_cleaned2004)

## Extract Review and Reviewers

In [None]:
# Separate each 2009 review into its body and the reviewer byline.
reviewer_full_names_2009 = []
cleaned_reviews_2009 = []

# Matches only the LAST hyphen / en dash of the text: the lookahead requires
# that no further dash follows before the end of the string.
byline_split = re.compile(r'[-–](?=[^-–]*$)')

for raw_review in reviews2009:
    parts = byline_split.split(raw_review)

    if len(parts) == 2:
        body, reviewer = parts[0].strip(), parts[1].strip()
    else:
        # No dash at all: keep the review with an empty reviewer instead of
        # dropping the row, so both lists stay aligned with the artist/song
        # lists built from the same entries.
        body, reviewer = raw_review.strip(), ''

    cleaned_reviews_2009.append(body)
    reviewer_full_names_2009.append(reviewer)

# Print the extracted reviewer full names
for full_name in reviewer_full_names_2009:
    print("Reviewer's Full Name:", full_name)
    print()

# Print the cleaned reviews
print("Cleaned Reviews:")
for review in cleaned_reviews_2009:
    print(review)

print(len(reviewer_full_names_2009))
print(len(cleaned_reviews_2009))

### Clean the reviewers

In [None]:
# reviewers_2008_final = []

# # Define a regular expression pattern to capture the first full name until " By"
# pattern = re.compile(r"^(.+?)(?=[_\s]*\sBy|[_\s]+$)")

# # Iterate over each string in the list
# for string in reviewer_full_names_2008:
#     # Check if the pattern is found in the string
#     match = re.search(pattern, string)
#     if match:
#         # If the pattern is found, use the matched group which captures the first name
#         cleaned_string = match.group(1)
#     else:
#         # If the pattern is not found, keep the original string
#         cleaned_string = string
#     # Append the cleaned string to the list
#     reviewers_2008_final.append(cleaned_string)

# print(len(reviewers_2008_final))

# # Print the cleaned strings
# for cleaned_string in reviewers_2008_final:
#     print(cleaned_string)

## Create Additional Columns

In [None]:
# Build the constant columns for the 2009 list and gather every column in
# the order expected by columns_name.

# Ranking is generated, not scraped: 100 down to 1.
# NOTE(review): assumes the scraped entries are ordered from rank 100 to
# rank 1 — confirm against the page.
rank_100 = [i for i in range(100, 0, -1)]

year_2009 = [2009] * 100

source_2009 = ['Pitchfork'] * 100

source_title_2009 = ['The Top 100 Tracks of 2009'] * 100

columns_list2009 = [artist_cleaned2009, songs2009, year_2009, cleaned_reviews_2009, reviewer_full_names_2009, rank_100, source_2009, source_title_2009]

# Sanity check: print the length of every column. The loop variable is
# deliberately NOT called `list`: rebinding that name would shadow the
# builtin for every later cell of the notebook.
for column in columns_list2009:
    print(len(column))

columns_name = ['Artist', 'Song', 'Year', 'Review', 'Reviewer','Ranking','Source','Source Title']

## Create DF 2009

In [None]:
# Pair each column name with its list of values, then build the 2009 frame.
data_dict = {name: values for name, values in zip(columns_name, columns_list2009)}

df2009 = pd.DataFrame(data_dict)

#df2009.to_csv('df2009.csv', index=False)

# 10 Pitchfork: The Top 100 Tracks of 2010

In [None]:
# Imports used by this section, so it can be run on its own.
# Plain imports replace the former `if 'pd' not in globals()` guards: those
# only tested whether the *name* existed, so a stale variable called e.g.
# `re` would have silently skipped the import. Re-importing is cheap because
# Python caches loaded modules.
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Download and parse the "Top 100 Tracks of 2010" page.
url10 = "https://pitchfork.com/features/lists-and-guides/7895-the-top-100-tracks-of-2010/"
# Explicit timeout: without one, requests can wait indefinitely on a stalled
# connection.
response = requests.get(url10, timeout=30)
# Stop here on an HTTP error instead of parsing an error page as if it were
# the article.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

## Get Artists and Songs

In [None]:
# Collect the ranked headings ("NN." / "NNN." prefix) of the 2010 page and,
# for each one, the text of the element that immediately follows it.
import re

artist_2010 = []
songs_10 = []

# Both <h2> and <p> tags are candidates, in document order.
all_elements = soup.find_all(['h2', 'p'])
element_texts = [node.get_text().strip() for node in all_elements]

# Two or three digits followed by a period at the start of the text.
pattern = re.compile(r'^\d{2,3}\.')

for position, text in enumerate(element_texts):
    if not pattern.match(text):
        continue
    artist_2010.append(text)
    # The element right after a ranked heading is taken as the song,
    # provided the heading is not the very last element.
    if position + 1 < len(element_texts):
        songs_10.append(element_texts[position + 1])

# Print the pairs side by side; the shorter list is padded with a placeholder.
row_count = max(len(artist_2010), len(songs_10))
padded_artists = artist_2010 + ["No artist available"] * (row_count - len(artist_2010))
padded_songs = songs_10 + ["No song available"] * (row_count - len(songs_10))
for number, (artist, song) in enumerate(zip(padded_artists, padded_songs), start=1):
    print(f"Artist {number}: {artist}, Song: {song}")

# Print the total numbers
print("Total number of artists:", len(artist_2010))
print("Total number of songs:", len(songs_10))

## Reviews

In [None]:
# All <p> tags of the page minus the first 9 and the last 12
# (presumably page boilerplate around the list — verify against the HTML).
all_p = soup.find_all('p')[9:-12]

In [None]:
# Display the remaining <p> tags for visual inspection.
all_p

In [None]:


# For every <h2>, print the first <p> that follows it at the same nesting
# level, or a notice when the heading has no such sibling.
for heading in soup.find_all('h2'):
    sibling_paragraph = heading.find_next_sibling('p')
    if not sibling_paragraph:
        print("No direct <p> sibling tag found after", heading.text)
        continue
    print(sibling_paragraph.text)

In [None]:
# First <p> found after each <h2> (anywhere later in the document, not only
# siblings), de-duplicated by text.
first_paragraphs = (heading.find_next('p') for heading in soup.find_all('h2'))
unique_paragraphs = {paragraph.text for paragraph in first_paragraphs if paragraph}

for paragraph in unique_paragraphs:
    print(paragraph)

In [None]:
# Number of distinct paragraph texts collected above.
print(len(unique_paragraphs))
    


In [None]:
# Attempt at collecting the 2010 reviews: walk every node under <body> and
# flip a "recording" flag at each <hr>, so paragraphs are captured only
# between alternating pairs of <hr> tags.
reviews = []
exclude_embed_text = "Embed is unavailable."
# NOTE: this requires exactly two digits before the period, so a three-digit
# heading such as "100." is not matched and will not be excluded.
pattern_start_with_digits = re.compile(r'^\d{2}\.')

# Use a flag to start or stop recording paragraphs based on encountering <hr> tags
recording = False
current_review = []

# Iterate over all elements in the body of the HTML
for element in soup.body.descendants:
    if element.name == 'hr':
        # If we're already recording a review, this <hr> signifies the end of the current review
        if recording and current_review:
            # Add the review to the list, join the paragraphs, and reset for the next review
            reviews.append(' '.join(current_review).strip())
            current_review = []
            if len(reviews) >= 100:  # Stop if we've collected 100 reviews
                break
        # Every <hr> flips the flag, including ones reached while not
        # recording or while current_review is still empty.
        recording = not recording  # Toggle recording status when <hr> is encountered

    # If we're recording and the element is a paragraph, handle the exclusion cases
    elif recording and element.name == 'p':
        text = element.get_text(strip=True)
        # Skip embed placeholders and paragraphs that begin with "NN."
        if not (exclude_embed_text in text or pattern_start_with_digits.match(text)):
            current_review.append(text)

# After the loop, add the last review if it wasn't added already
# (current_review is non-empty only when the walk ended mid-review).
if current_review:
    reviews.append(' '.join(current_review).strip())

# Print out all reviews collected
for index, review in enumerate(reviews):
    print(f"Review {index + 1}:\n{review}\n")

In [None]:
# Second attempt (the author notes it yields 91 results): for every <hr>,
# gather the <h2>/<p> siblings that follow it up to the next <hr>.
reviews = []

# Paragraphs matching any of these alternatives are left out of the review.
exclude_patterns = re.compile(r'Embed is unavailable\.|^==\s\$0|^\d{2}\.\s')

hr_tags = soup.find_all('hr')

for divider in hr_tags:
    node = divider.next_sibling

    # Advance past siblings that expose no `name` attribute.
    while node and not hasattr(node, 'name'):
        node = node.next_sibling

    # Walk the siblings until another <hr> (or the end of the sibling chain).
    collected = []
    while node and node.name != 'hr':
        is_heading = node.name == 'h2'
        is_kept_paragraph = node.name == 'p' and not exclude_patterns.search(node.text)
        if is_heading or is_kept_paragraph:
            collected.append(node.get_text(strip=True))
        node = node.next_sibling

    # Only non-empty sections become a review.
    if collected:
        reviews.append(' '.join(collected))

print(len(reviews))

# Show every collected review.
for review in reviews:
    print(f"Review:\n{review}\n")

In [None]:
# Third attempt: a review starts at the first <p> after an <hr> when that
# paragraph is fully wrapped in double quotes, and runs through the following
# <p> siblings up to and including the next fully quoted paragraph.
reviews = []

# Text that both starts and ends with a double quote.
quote_pattern = re.compile(r'^".*"$')

hr_tags = soup.find_all('hr')

for hr_tag in hr_tags:
    opening = hr_tag.find_next_sibling('p')
    if not opening:
        # No paragraph follows this <hr> at the same level.
        continue

    opening_text = opening.get_text(strip=True)
    if not quote_pattern.match(opening_text):
        continue

    pieces = [opening_text]
    follower = opening.find_next_sibling('p')
    while follower:
        follower_text = follower.get_text(strip=True)
        pieces.append(follower_text)
        if quote_pattern.match(follower_text):
            # A fully quoted paragraph closes the review (and is included).
            break
        follower = follower.find_next_sibling('p')

    reviews.append(' '.join(pieces))

# Show every collected review.
for review in reviews:
    print(review + '\n')

In [None]:
# Collect every <hr> tag of the page and display them for inspection.
all_hr = soup.find_all('hr')
all_hr

In [None]:
# Fourth attempt: for each pair of consecutive <hr> tags, keep the sibling
# paragraphs between them that start with a double quote.
reviews = []

# Paragraphs to ignore: ones that begin with "NN. " and embed placeholders.
# This pattern was previously only present in a commented-out line, so the
# loop below raised NameError as soon as it reached a quoted paragraph.
exclude_embed_text = "Embed is unavailable."
exclude_pattern = re.compile(r'^\d{2}\.\s|' + re.escape(exclude_embed_text))

# Find all <hr> tags which mark the beginning and end of each review section
hr_tags = soup.find_all('hr')

for opening_hr, closing_hr in zip(hr_tags, hr_tags[1:]):
    review_paragraphs = []

    # Walk the siblings after the opening <hr> until the closing one (or the
    # end of the sibling chain).
    element = opening_hr.next_sibling
    while element and element != closing_hr:
        if element.name == 'p' and element.text.startswith('"') and not exclude_pattern.search(element.text):
            review_paragraphs.append(element.text.strip())
        element = element.next_sibling

    # Join the paragraphs together to form the full review text
    full_review_text = ' '.join(review_paragraphs).strip()
    if full_review_text:
        reviews.append(full_review_text)

# Output the collected reviews
for review in reviews:
    print(f"Review: {review}\n")

# 11 Pitchfork: The Top 100 Tracks of 2011

In [None]:
# Imports used by this section, so it can be run on its own.
# Plain imports replace the former `if 'pd' not in globals()` guards: those
# only tested whether the *name* existed, so a stale variable called e.g.
# `re` would have silently skipped the import. Re-importing is cheap because
# Python caches loaded modules.
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Download and parse the "Top 100 Tracks of 2011" page.
url11 = "https://pitchfork.com/features/lists-and-guides/8726-the-top-100-tracks-of-2011/"
# Explicit timeout: without one, requests can wait indefinitely on a stalled
# connection.
response = requests.get(url11, timeout=30)
# Stop here on an HTTP error instead of parsing an error page as if it were
# the article.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

In [None]:
# Turn the <p> tags of the 2011 page into a flat list of cleaned strings.
all_p = soup.find_all('p')

# Paragraphs beginning with either of these are dropped.
skipped_prefixes = ("Embed is unavailable", "Photo by")
# A leading run of two or more underscores / non-word characters is removed.
leading_junk = re.compile(r'^[_\W]{2,}')

paragraph_texts = (tag.text.strip() for tag in all_p)
cleaned_reviews = [
    leading_junk.sub('', text)
    for text in paragraph_texts
    if not text.startswith(skipped_prefixes)
]

# Discard the first 8 and the last 12 remaining paragraphs.
cleaned_reviews = cleaned_reviews[8:-12]

# Display the cleaned reviews
for review in cleaned_reviews:
    print(review)

In [None]:
# Group the cleaned paragraphs: a paragraph that starts with a digit opens a
# new group, every other paragraph is added to the group currently open.
sublists = []
current_sublist = []

for review in cleaned_reviews:
    # Empty paragraphs carry no text; skipping them also avoids the
    # IndexError that `review[0]` raised on an empty string.
    if not review:
        continue

    if review[0].isdigit():
        # Close the previous group (if any) and start a new one.
        if current_sublist:
            sublists.append(current_sublist)
        current_sublist = [review]
    else:
        current_sublist.append(review)

# Append the last sublist
if current_sublist:
    sublists.append(current_sublist)

# Print the sublists
for sublist in sublists:
    print(sublist)

len(sublists)

In [None]:
# Reduce every group to exactly two items: its first element unchanged and
# all remaining elements concatenated (no separator) into one string.
resulting_sublists = [[group[0], "".join(group[1:])] for group in sublists]

resulting_sublists

In [None]:
print(len(resulting_sublists))

# The grouping yields 101 entries instead of 100: the entry at index 9 is
# treated by the author as a continuation of the entry at index 8.
# Only merge while the list is still longer than the 100 expected entries,
# so re-running the cell does not swallow a second, legitimate entry.
if len(resulting_sublists) > 100:
    heading, review_text = resulting_sublists[8]
    # Fold the whole stray entry into the review text so that entry 8 keeps
    # the [heading, review] shape used by the following cells. (Merging the
    # raw `sublists` produced a longer list whose index 1 held only the
    # first paragraph of the review.)
    resulting_sublists[8] = [heading, review_text + "".join(resulting_sublists[9])]
    del resulting_sublists[9]

print(len(resulting_sublists))

In [None]:
# Display the number of entries currently held in resulting_sublists.
len(resulting_sublists)


## Extract artist_song and Reviews

In [None]:
# Unzip the [artist/song, review] pairs of the 2011 list into two columns.
artist_song2011 = [entry[0] for entry in resulting_sublists]
reviews2011 = [entry[1] for entry in resulting_sublists]

# Show both columns and their sizes.
print("Artist/Song 2011:")
print(artist_song2011)
print("\nReviews 2011:")
print(reviews2011)
print(len(artist_song2011))
print(len(reviews2011))

## Separate Artist and Song 

In [None]:
# Split every "artist + song" heading of the 2011 list into two columns.
# Headings normally quote the title (artist "song"); headings without quotes
# are split at the first lowercase -> uppercase boundary instead.
artists2011 = []
songs2011 = []
pattern = re.compile(r'(?<=[a-z])(?=[A-Z])')

for heading in artist_song2011:
    if '"' in heading:
        # Quoted title: the text before the first quote is the artist,
        # the text between the first pair of quotes is the song.
        parts = [part.strip() for part in heading.split('"')]
    else:
        # No quotes: fall back to the case-boundary split.
        parts = pattern.split(heading)

    # Always derive both values from the current heading. Previously a
    # heading that could not be split silently reused the artist/song of the
    # previous row (or raised NameError on the very first row).
    artist = parts[0]
    song_title = parts[1] if len(parts) > 1 else ''

    artists2011.append(artist)
    songs2011.append(song_title)

# Display the extracted lists
print("Artists:")
print(artists2011)
print("\nSongs:")
print(songs2011)
print(len(artists2011))
print(len(songs2011))

## Clean Artists

In [None]:
# Normalise the 2011 artist strings: drop the first three characters (the
# rank digits), then peel off surrounding whitespace, trailing underscores,
# a leading period and a trailing colon.
artist_cleaned2011 = []

for raw_artist in artists2011:
    name = raw_artist[3:].strip()
    name = name.rstrip('_').strip()
    name = name.lstrip('.').rstrip(':').strip()
    print(name)
    artist_cleaned2011.append(name)

# Number of cleaned names (one per entry of artists2011).
print(len(artist_cleaned2011))

## Extract Review and Reviewers

In [None]:
# Separate each 2011 review into its body and the reviewer byline.
reviewer_full_names_2011 = []
cleaned_reviews_2011 = []

# Matches only the LAST hyphen / en dash of the text: the lookahead requires
# that no further dash follows before the end of the string.
byline_split = re.compile(r'[-–](?=[^-–]*$)')

for raw_review in reviews2011:
    parts = byline_split.split(raw_review)

    if len(parts) == 2:
        body, reviewer = parts[0].strip(), parts[1].strip()
    else:
        # No dash at all: keep the review with an empty reviewer instead of
        # dropping the row, so both lists stay aligned with the artist/song
        # lists built from the same entries.
        body, reviewer = raw_review.strip(), ''

    cleaned_reviews_2011.append(body)
    reviewer_full_names_2011.append(reviewer)

# Print the extracted reviewer full names
for full_name in reviewer_full_names_2011:
    print("Reviewer's Full Name:", full_name)
    print()

# Print the cleaned reviews
print("Cleaned Reviews:")
for review in cleaned_reviews_2011:
    print(review)

print(len(reviewer_full_names_2011))
print(len(cleaned_reviews_2011))

### Clean the reviewers

In [None]:
# Trim trailing credits from the 2011 reviewer names: keep the shortest
# leading text that is followed by "Photo by", " By" or the end of the string.
pattern = re.compile(r"^(.+?)(?=\s?Photo by|\sBy|$)")

# Pair every raw name with its match; names the pattern cannot match
# are kept unchanged.
name_matches = ((raw_name, pattern.search(raw_name)) for raw_name in reviewer_full_names_2011)
reviewers_2011_final = [
    found.group(1).strip() if found else raw_name
    for raw_name, found in name_matches
]

# Print the length and cleaned strings
print(len(reviewers_2011_final))
for cleaned_string in reviewers_2011_final:
    print(cleaned_string)

## Create Additional Columns

In [None]:
# Build the constant columns for the 2011 list and gather every column in
# the order expected by columns_name.

# Ranking is generated, not scraped: 100 down to 1.
# NOTE(review): assumes the scraped entries are ordered from rank 100 to
# rank 1 — confirm against the page.
rank_100 = [i for i in range(100, 0, -1)]

year_2011 = [2011] * 100

source_2011 = ['Pitchfork'] * 100

source_title_2011 = ['The Top 100 Tracks of 2011'] * 100

columns_list2011 = [artist_cleaned2011, songs2011, year_2011, cleaned_reviews_2011, reviewers_2011_final, rank_100, source_2011, source_title_2011]

# Sanity check: print the length of every column. The loop variable is
# deliberately NOT called `list`: rebinding that name would shadow the
# builtin for every later cell of the notebook.
for column in columns_list2011:
    print(len(column))

columns_name = ['Artist', 'Song', 'Year', 'Review', 'Reviewer','Ranking','Source','Source Title']

## Create DF 2011

In [None]:
# Pair each column name with its list of values, then build the 2011 frame.
data_dict = {name: values for name, values in zip(columns_name, columns_list2011)}

df2011 = pd.DataFrame(data_dict)

#df2011.to_csv('df2011.csv', index=False)

# 12 Pitchfork: The Top 100 Tracks of 2012

In [None]:
# Imports used by this section, so it can be run on its own.
# Plain imports replace the former `if 'pd' not in globals()` guards: those
# only tested whether the *name* existed, so a stale variable called e.g.
# `re` would have silently skipped the import. Re-importing is cheap because
# Python caches loaded modules.
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Download and parse the "Top 100 Tracks of 2012" page.
url12 = "https://pitchfork.com/features/lists-and-guides/9015-the-top-100-tracks-of-2012/"
# Explicit timeout: without one, requests can wait indefinitely on a stalled
# connection.
response = requests.get(url12, timeout=30)
# Stop here on an HTTP error instead of parsing an error page as if it were
# the article.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

In [None]:
# Turn the <p> tags of the 2012 page into a flat list of cleaned strings.
all_p = soup.find_all('p')

# Paragraphs beginning with either of these are dropped.
skipped_prefixes = ("Embed is unavailable", "Photo by")
# A leading run of two or more underscores / non-word characters is removed.
leading_junk = re.compile(r'^[_\W]{2,}')

paragraph_texts = (tag.text.strip() for tag in all_p)
cleaned_reviews2012 = [
    leading_junk.sub('', text)
    for text in paragraph_texts
    if not text.startswith(skipped_prefixes)
]

# Discard the first 4 and the last 6 remaining paragraphs.
cleaned_reviews2012 = cleaned_reviews2012[4:-6]

# Display the cleaned reviews
for review in cleaned_reviews2012:
    print(review)

In [None]:
# Group the cleaned 2012 paragraphs: a paragraph that starts with a digit
# opens a new group, every other paragraph is added to the group currently
# open.
sublists = []
current_sublist = []

for review in cleaned_reviews2012:
    # Empty paragraphs carry no text; skipping them also avoids the
    # IndexError that `review[0]` raised on an empty string.
    if not review:
        continue

    if review[0].isdigit():
        # Close the previous group (if any) and start a new one.
        if current_sublist:
            sublists.append(current_sublist)
        current_sublist = [review]
    else:
        current_sublist.append(review)

# Append the last sublist
if current_sublist:
    sublists.append(current_sublist)

# Print the sublists
for sublist in sublists:
    print(sublist)

len(sublists)

In [None]:
# Reduce every group to exactly two items: its first element unchanged and
# all remaining elements concatenated (no separator) into one string.
resulting_sublists = [[group[0], "".join(group[1:])] for group in sublists]

resulting_sublists

## Extract artist_song and Reviews

In [None]:
# Build the artist/song and review columns of the 2012 list.
#
# The entries of resulting_sublists hold exactly two items each, so reading
# indexes 1 and 2 from them always raised IndexError. The raw groups in
# `sublists` are read instead.
# NOTE(review): assumes each group is laid out as
# [rank, artist/song, review paragraph, ...] — confirm against the printed
# sublists before relying on these columns.
artist_song2012 = []
reviews2012 = []

for group in sublists:
    # Second element is the artist/song line; empty when the group is short.
    artist_song2012.append(group[1] if len(group) > 1 else '')
    # Everything after it is review text, concatenated into one string.
    reviews2012.append("".join(group[2:]))

# Display the extracted lists
print("Artist/Song 2012:")
print(artist_song2012)
print("\nReviews 2012:")
print(reviews2012)
print(len(artist_song2012))
print(len(reviews2012))

## Separate Artist and Song 

In [None]:
#There is one case where the song does not have " (Belle and Sebastian)
artists2011 = []
songs2011 = []
pattern = re.compile(r'(?<=[a-z])(?=[A-Z])')

# Iterate through each song in the list
for song in artist_song2011:
    # Check if the song contains a "
    if '"' not in song:
        # If no ", split on ":"
        parts = re.split(pattern, song)
        # Extract the artist name and song title
        if len(parts) >= 2:
            artist = parts[0]
            song_title = parts[1]
    else:
        # If ", split on '"'
        parts = song.split('"')
        if len(parts) >= 2:
            # Extract the artist name
            artist = parts[0].strip()
            # Extract the song title
            song_title = parts[1].strip()
        
    # Append the artist and song title to their respective lists
    artists2011.append(artist)
    songs2011.append(song_title)

# Display the extracted lists
print("Artists:")
print(artists2011)
print("\nSongs:")
print(songs2011)
print(len(artists2011))
print(len(songs2011))

## Clean Artists

In [None]:
artist_cleaned2011 = []

# Iterate through each artist name in the artists2004 list
for artist in artists2011:
    # Remove the first four characters and trailing underscores
    cleaned_name = artist[3:]
    cleaned_name = cleaned_name.strip().rstrip('_').strip()
    cleaned_name = cleaned_name.strip().lstrip('.').rstrip(':').strip()
    print(cleaned_name)
    # Append the cleaned artist name to the artist_cleaned2004 list
    artist_cleaned2011.append(cleaned_name)

print(len(artist_cleaned2011))
# Displaying the cleaned artist names
#print("Cleaned artist names:")
#print(artist_cleaned2004)

## Extract Review and Reviewers

In [None]:
# Initialize lists to store reviewer names and cleaned reviews
reviewer_full_names_2011 = []
cleaned_reviews_2011 = []

# Iterate through each review in the reviews2004 list
for name in reviews2011:
    # Split the review based on either "-" or "–"
    parts = re.split(r'[-–](?=[^-–]*$)', name)
    
    # If there are two parts (reviewer's name is found)
    if len(parts) == 2:
        # Extract the reviewer's full name and append it to the list
        reviewer_full_names_2011.append(parts[1].strip())
        # Extract the review and append it to the list
        cleaned_reviews_2011.append(parts[0].strip())

# Print the extracted reviewer full names
for full_name in reviewer_full_names_2011:
    print("Reviewer's Full Name:", full_name)
    print()

# Print the cleaned reviews
print("Cleaned Reviews:")
for review in cleaned_reviews_2011:
    print(review)

print(len(reviewer_full_names_2011))
print(len(cleaned_reviews_2011))

### Clean the reviewers

In [None]:
reviewers_2011_final = []

# Define a regular expression pattern to capture the first full name until " By"
pattern = re.compile(r"^(.+?)(?=\s?Photo by|\sBy|$)")

# Iterate over each string in the list
for string in reviewer_full_names_2011:
    # Check if the pattern is found in the string
    match = re.search(pattern, string)
    if match:
        # If the pattern is found, use the matched group which captures the first name
        cleaned_string = match.group(1).strip()
    else:
        # If the pattern is not found, keep the original string
        cleaned_string = string
    # Append the cleaned string to the list
    reviewers_2011_final.append(cleaned_string)

# Print the length and cleaned strings
print(len(reviewers_2011_final))
for cleaned_string in reviewers_2011_final:
    print(cleaned_string)

## Create Additional Column

In [None]:
# Create the ranking: 

rank_100 = [i for i in range(100, 0, -1)]

year_2011 = [2011] * 100

source_2011 = ['Pitchfork'] * 100

source_title_2011 = ['The Top 100 Tracks of 2011'] * 100

columns_list2011 = [artist_cleaned2011, songs2011, year_2011, cleaned_reviews_2011, reviewers_2011_final, rank_100, source_2011, source_title_2011]
for list in columns_list2011:
    print(len(list))

columns_name = ['Artist', 'Song', 'Year', 'Review', 'Reviewer','Ranking','Source','Source Title']

## Create DF 2011

In [None]:
# Pair each column name with its list of values, in order
data_dict = {name: values for name, values in zip(columns_name, columns_list2011)}

# Build the 2011 DataFrame from that mapping
df2011 = pd.DataFrame(data_dict)

#df2011.to_csv('df2011.csv', index=False)

# 13 Pitchfork: The Top 100 Tracks of 2013

In [None]:
url13 = "https://pitchfork.com/features/lists-and-guides/9288-the-top-100-tracks-of-2013/"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url13, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

In [None]:
# Every <p> tag on the page
all_p = soup.find_all('p')

# Stripped text of each paragraph, skipping those that start with
# "Embed is unavailable" or "Photo by". A leading run of two or more
# underscore / non-word characters is removed from the paragraphs that are kept.
paragraph_texts = (tag.text.strip() for tag in all_p)
cleaned_reviews = [
    re.sub(r'^[_\W]{2,}', '', paragraph)
    for paragraph in paragraph_texts
    if not paragraph.startswith(("Embed is unavailable", "Photo by"))
]

# Drop the first 8 and the last 12 entries
# (presumably page intro/footer paragraphs - verify against the printed output)
cleaned_reviews = cleaned_reviews[8:-12]

# Display the cleaned reviews
for review in cleaned_reviews:
    print(review)




In [None]:
# Group the <p> tags: a paragraph whose whole text is digits opens a new group
sublists = []
current_group = []

for tag in soup.find_all('p'):
    # Close the running group (if it has anything in it) when a digits-only paragraph appears
    if tag.get_text(strip=True).isdigit() and current_group:
        sublists.append(current_group)
        current_group = []
    current_group.append(tag)

# Keep whatever was collected after the last digits-only paragraph
if current_group:
    sublists.append(current_group)

# Discard the first group
sublists = sublists[1:]

# Print the text of every tag, one group per line followed by a blank line
for sublist in sublists:
    print([tag.get_text(strip=True) for tag in sublist])
    print()


In [None]:
# Number of groups kept in sublists
len(sublists)

# 14 Pitchfork: The 100 Best Tracks of 2014

In [None]:
url14 = "https://pitchfork.com/features/lists-and-guides/9555-the-100-best-tracks-of-2014/"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url14, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

In [None]:
# Every <p> tag on the page
all_p = soup.find_all('p')

# Stripped text of each paragraph, skipping those that start with
# "Embed is unavailable" or "Photo by". A leading run of two or more
# underscore / non-word characters is removed from the paragraphs that are kept.
paragraph_texts = (tag.text.strip() for tag in all_p)
cleaned_reviews = [
    re.sub(r'^[_\W]{2,}', '', paragraph)
    for paragraph in paragraph_texts
    if not paragraph.startswith(("Embed is unavailable", "Photo by"))
]

# Drop the first 5 and the last 13 entries
# (presumably page intro/footer paragraphs - verify against the printed output)
cleaned_reviews = cleaned_reviews[5:-13]

# Display the cleaned reviews
for review in cleaned_reviews:
    print(review)




In [None]:
# Group the <p> tags: a paragraph whose whole text is digits opens a new group
sublists = []
current_group = []

for tag in soup.find_all('p'):
    # Close the running group (if it has anything in it) when a digits-only paragraph appears
    if tag.get_text(strip=True).isdigit() and current_group:
        sublists.append(current_group)
        current_group = []
    current_group.append(tag)

# Keep whatever was collected after the last digits-only paragraph
if current_group:
    sublists.append(current_group)

# Discard the first group
sublists = sublists[1:]

# Print the text of every tag, one group per line followed by a blank line
for sublist in sublists:
    print([tag.get_text(strip=True) for tag in sublist])
    print()


# 15 Pitchfork: The 100 Best Tracks of 2015

In [None]:
# Libraries used by this section. Plain imports are safe to re-run: Python
# caches loaded modules, so repeating an import only rebinds the name and the
# `'name' not in globals()` guards are unnecessary.
import pandas as pd
import numpy as np
import re
import requests
from bs4 import BeautifulSoup

In [None]:
url15 = "https://pitchfork.com/features/lists-and-guides/9765-the-100-best-tracks-of-2015/"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url15, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

## Get Artist

In [None]:
# Text of every <div class="heading-h1"> on the page
artist_tags = soup.find_all("div", class_="heading-h1")
artists2015 = [heading.get_text(strip=True) for heading in artist_tags]

# Show everything that was captured, before trimming
for artist in artists2015:
    print("Artist:", artist)

# Drop the first captured entry
artists2015 = artists2015[1:]

len(artists2015)

## Get Songs

In [None]:
# For every <div class="heading-h1">, look up the next sibling <h2>
artist_tags = soup.find_all("div", class_="heading-h1")
following_h2s = [heading.find_next_sibling("h2") for heading in artist_tags]

# Keep the text of the <h2> tags that were found, then drop the first entry
songs2015 = [h2.get_text(strip=True) for h2 in following_h2s if h2][1:]

print(len(songs2015))
songs2015[:10]

## Get Reviews

In [None]:
# Every <p> tag on the page
all_p = soup.find_all('p')

# Stripped text of each paragraph, skipping those that start with
# "Embed is unavailable" or "Photo by". A leading run of two or more
# underscore / non-word characters is removed from the paragraphs that are kept.
paragraph_texts = (tag.text.strip() for tag in all_p)
cleaned_reviews2015 = [
    re.sub(r'^[_\W]{2,}', '', paragraph)
    for paragraph in paragraph_texts
    if not paragraph.startswith(("Embed is unavailable", "Photo by"))
]

# Drop the first 8 and the last 12 entries
# (presumably page intro/footer paragraphs - verify against the printed output)
cleaned_reviews2015 = cleaned_reviews2015[8:-12]

# Display the cleaned reviews
for review in cleaned_reviews2015:
    print(review)

In [None]:
reviews_2015 = []   # one sublist of paragraphs per digits-only marker
current_list = []   # paragraphs gathered since the latest marker
collecting = False  # becomes True once the first digits-only element is seen

for element in cleaned_reviews2015:
    if not element.isdigit():
        # Ordinary paragraph: keep it only after the first marker, and only if non-blank
        if collecting and element.strip():
            current_list.append(element)
        continue

    # Digits-only element: close the running group (even if empty) and start a new one
    if collecting:
        reviews_2015.append(current_list)
        current_list = []
    collecting = True

# The final group has no marker after it, so add it here when it is not empty
if current_list:
    reviews_2015.append(current_list)

# Output the results
print("Sublists captured:", reviews_2015)

In [None]:
# Number of paragraph groups captured in reviews_2015
len(reviews_2015)

In [None]:
# For every group: its number of paragraphs, then the group itself
for group in reviews_2015:
    print(len(group), group, sep="\n")

In [None]:
reviews_2015_2 = []

for group in reviews_2015:
    # Groups with more than one paragraph lose their last paragraph;
    # shorter groups are kept exactly as they are.
    trimmed = group if len(group) <= 1 else group[:-1]
    reviews_2015_2.append(trimmed)
    print(len(trimmed), trimmed, sep="\n")

## Extract Reviewers

In [None]:
# Split each sublist of reviews_2015_2 into the reviewer name (taken from the end of
# its last paragraph) and the remaining review text.
reviewers2015 = []  # This will store the reviewer names
reviews_only_2015 = []  # This will store the review parts of each sublist

for sublist in reviews_2015_2:
    if sublist:  # Empty sublists are skipped and add no entry to either list
        last_element = sublist[-1]
        # Reverse the string so the regex search effectively runs from the end of the text
        reversed_element = last_element[::-1]

        # On the reversed text: lazily capture characters up to a hyphen or em dash whose
        # next character (i.e. the one *before* the dash in the original text) is not an
        # ASCII letter, so a hyphen directly preceded by a letter is skipped.
        # NOTE(review): the trailing lookbehind inspects the dash itself, so it never fails.
        match = re.search(r"(.+?)\s*[-—](?![a-zA-Z])(?<![a-zA-Z])", reversed_element)
        if match:
            # Reverse the captured name back to its original order
            reviewer = match.group(1)[::-1].strip()
            # Offset of the matched text inside the reversed string
            # (despite the name, this is where the whole match starts, not the dash)
            index_of_dash = last_element[::-1].find(match.group(0))  # Find where the matched group starts in the reversed string
            # Cut the matched tail (name, spacing and dash) off the original string
            review_text = last_element[:len(last_element) - index_of_dash - len(match.group(0))].strip()
        else:
            reviewer = "Reviewer name not found"
            review_text = last_element  # If no reviewer, consider the entire last element as review text
        
        reviewers2015.append(reviewer)
        # Create a new sublist with the original elements up to the last one, replacing the last with the review text
        reviews_only_2015.append(sublist[:-1] + [review_text])

### Concatenate the reviews

In [None]:
# One string per track: the paragraphs of each sublist joined with single spaces
reviews_2015_final = [' '.join(paragraphs) for paragraphs in reviews_only_2015]

In [None]:
# First five concatenated reviews, then the total number of reviews
print(reviews_2015_final[:5], len(reviews_2015_final), sep="\n")

## Create Additional Columns

In [None]:
# Create the ranking: 100 down to 1
rank_100 = [i for i in range(100, 0, -1)]

# Constant columns, one value per row
year_2015 = [2015] * 100

source_2015 = ['Pitchfork'] * 100

# The article slug in url15 is "the-100-best-tracks-of-2015", so the title uses
# "Tracks" rather than "Songs".
source_title_2015 = ['The 100 Best Tracks of 2015'] * 100

# Column values, in the same order as columns_name below
columns_list2015 = [artists2015, songs2015, year_2015, reviews_2015_final, reviewers2015, rank_100, source_2015, source_title_2015]

# Sanity check: print the length of every column.
# The loop variable is deliberately not called `list`, which would rebind the
# builtin for the rest of the notebook session.
for column in columns_list2015:
    print(len(column))

columns_name = ['Artist', 'Song', 'Year', 'Review', 'Reviewer','Ranking','Source','Source Title']

## Create DF 2015

In [None]:
# Pair each column name with its list of values, in order
data_dict = {name: values for name, values in zip(columns_name, columns_list2015)}

# Build the 2015 DataFrame from that mapping
df2015 = pd.DataFrame(data_dict)

#df2015.to_csv('df2015.csv', index=False)

# 16. Pitchfork: The 100 Best Songs of 2016

In [None]:
# Libraries used by this section. Plain imports are safe to re-run: Python
# caches loaded modules, so repeating an import only rebinds the name and the
# `'name' not in globals()` guards are unnecessary.
import pandas as pd
import numpy as np
import re
import requests
from bs4 import BeautifulSoup

In [None]:
url16 = "https://pitchfork.com/features/lists-and-guides/9981-the-100-best-songs-of-2016/"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url16, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

## Get Artists

In [None]:
# Text of every <div class="heading-h1"> on the page
artist_tags = soup.find_all("div", class_="heading-h1")
artists2016 = [heading.get_text(strip=True) for heading in artist_tags]

# Print the extracted artist names
for artist in artists2016:
    print("Artist:", artist)

len(artists2016)

## Get Songs

In [None]:
# For every <div class="heading-h1">, look up the next sibling <h2>
artist_tags = soup.find_all("div", class_="heading-h1")
following_h2s = [heading.find_next_sibling("h2") for heading in artist_tags]

# Keep the text of the <h2> tags that were found
songs2016 = [h2.get_text(strip=True) for h2 in following_h2s if h2]

# Print the extracted song titles
for song in songs2016:
    print("Song Title:", song)

print(len(songs2016))

## Get Reviews

In [None]:
# All <h2>, <p> and <h3> tags inside the article container, in document order
article_chunk = soup.find('div',class_='ArticlePageChunks-fLyCVG')
artciles_grouped = article_chunk.find_all(['h2','p','h3'])

reviews = []     # one list per <h2>: [h2 text, following non-empty texts...]
article_no = 0   # number of <h2> tags seen so far

for el in artciles_grouped:
    if el.name == 'h2':
        # Each <h2> opens a new entry; print its 0-based index first
        print(article_no)
        article_no += 1
        reviews.append([el.text])
    elif reviews and el.text != '':
        # Non-empty text after the first <h2> belongs to the most recent entry
        reviews[-1].append(el.text)

reviews

In [None]:
# For each entry: its index, its second element and its last element
for position, entry in enumerate(reviews):
    print(position, entry[1], entry[-1])

In [None]:
# Clean the reviews: for every entry drop the first two elements and the last one
# (presumably non-review lines; element 0 is the <h2> text), then join what is left
# with single spaces.
# The loop variable is deliberately not called `list`, which would rebind the
# builtin for the rest of the notebook session.
reviews_cleaned_2016 = [" ".join(review_parts[2:-1]) for review_parts in reviews]

print(len(reviews_cleaned_2016))
reviews_cleaned_2016


### Extract the Reviewer

In [None]:
# Review bodies and reviewer names, one entry per review
review_texts_2016 = []
reviewers_2016 = []

for review in reviews_cleaned_2016:
    # Cut at the last en dash ("–"): the text before it is the review, the text after the reviewer
    body, dash, name = review.rpartition("–")
    # Without an en dash the whole text is the review and the reviewer is "Unknown"
    review_texts_2016.append((body if dash else review).strip())
    reviewers_2016.append(name.strip() if dash else "Unknown")

# Print review text and reviewer names
for review_text, reviewer in zip(review_texts_2016, reviewers_2016):
    print(f"Review Text: {review_text}")
    print(f"Reviewer: {reviewer}\n")

## Create Additional Columns

In [None]:
# Create the ranking: 100 down to 1
rank_100 = [i for i in range(100, 0, -1)]

# Constant columns, one value per row
year_2016 = [2016] * 100

source_2016 = ['Pitchfork'] * 100

source_title_2016 = ['The 100 Best Songs of 2016'] * 100

# Column values, in the same order as columns_name below
columns_list2016 = [artists2016, songs2016, year_2016, review_texts_2016, reviewers_2016, rank_100, source_2016, source_title_2016]

# Sanity check: print the length of every column.
# The loop variable is deliberately not called `list`, which would rebind the
# builtin for the rest of the notebook session.
for column in columns_list2016:
    print(len(column))

columns_name = ['Artist', 'Song', 'Year', 'Review', 'Reviewer','Ranking','Source','Source Title']

## Create DF 2016


In [None]:
# Pair each column name with its list of values, in order
data_dict = {name: values for name, values in zip(columns_name, columns_list2016)}

# Build the 2016 DataFrame from that mapping
df2016 = pd.DataFrame(data_dict)

# Write the result to df2016.csv without the index column
df2016.to_csv('df2016.csv', index=False)

### Export to SQL

In [None]:
# import pandas as pd
# import os
# from sqlalchemy import create_engine , text

# pw_raw =os.getenv('mysql_pass')
# connection_string = 'mysql+pymysql://root:' + pw_raw + '@localhost:3306/'
# engine = create_engine(connection_string)

# with engine.connect() as conn:
#     conn.execute(text(f"CREATE DATABASE IF NOT EXISTS final_project"))

# df2016.to_sql('df_ptfk_2016', engine, 'final_project', if_exists='replace', index=False)

# 17. Pitchfork: The 100 Best Songs of 2017

In [None]:
url17 = "https://pitchfork.com/features/lists-and-guides/the-100-best-songs-of-2017/"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url17, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

## Get Songs

In [None]:
# For every <hr> on the page, look up the next sibling <h2>
artist_tags = soup.find_all("hr")
following_h2s = [divider.find_next_sibling("h2") for divider in artist_tags]

# Keep the text of the <h2> tags that were found
songs2017 = [h2.get_text(strip=True) for h2 in following_h2s if h2]

# Print the extracted song titles
for song in songs2017:
    print("Song Title:", song)

print(len(songs2017))

In [None]:
# Entry added by hand at index 90
# (presumably the scrape above misses it - verify against the page)
additional_song = 'Charli XCX: "Boys"'

# Only insert when it is not already there, so re-running this cell does not
# add a duplicate and shift every later entry.
if additional_song not in songs2017:
    songs2017.insert(90, additional_song)

# Shows the entry right after the inserted one
print(songs2017[91])

### Extract the songs

In [None]:
# Keep only what follows the first ":" of each entry, stripped of surrounding whitespace.
# Entries without a ":" are left out.
songs2017_cleaned = [
    entry.partition(":")[2].strip()
    for entry in songs2017
    if ":" in entry
]

# Number of cleaned song titles
print(len(songs2017_cleaned))

In [None]:
# Inspect the cleaned 2017 song titles
songs2017_cleaned

In [None]:
# # Find all p tags
# all_p = soup.find_all('p')

# # List to store the cleaned reviews
# cleaned_reviews = []

# # Iterate through each <p> tag to extract and clean the review text
# for tag in all_p:
#     # Extract the review text
#     review_text = tag.text.strip()
    
#     # Check if the review text starts with "Embed is unavailable" or "Photo by"
#     if not review_text.startswith("Embed is unavailable") and not re.match(r'^Photo by', review_text):
#         # Check if the review starts with two or more underscores or non-alphanumeric characters
#         if re.match(r'^[_\W]{2,}', review_text):
#             # Remove the starting underscores or non-alphanumeric characters
#             cleaned_review = re.sub(r'^[_\W]{2,}', '', review_text)
#         else:
#             cleaned_review = review_text

#         # Append the cleaned review to the list
#         cleaned_reviews.append(cleaned_review)
# cleaned_reviews = cleaned_reviews[:-5]
# cleaned_reviews = cleaned_reviews[5:]
# # Display the cleaned reviews
# for review in cleaned_reviews:
#     print(review)

In [None]:
# # Extract ranking
# ranking_div = soup.find('div', class_='heading-h3')
# ranking = ranking_div.get_text(strip=True)

# # Extract artist: song
# artist_song_h2 = soup.find('h2')
# artist_song = artist_song_h2.get_text(strip=True)

# # Extract review
# review_p_tags = soup.find_all('p')
# review = review_p_tags[6].get_text(strip=True)

# # Print results
# print("Ranking:", ranking)
# print("Artist:Song:", artist_song)
# print("Review:", review)

In [None]:
# <p> tags that are not nested inside an element with class "CaptionWrapper-jSZdqE"
relevant_p_tags = [
    p for p in soup.find_all('p')
    if p.find_parent(class_='CaptionWrapper-jSZdqE') is None
]

# Stripped text of each paragraph, skipping those that start with
# "Embed is unavailable" or "Photo by". A leading run of two or more
# underscore / non-word characters is removed from the paragraphs that are kept.
paragraph_texts = (tag.get_text(strip=True) for tag in relevant_p_tags)
cleaned_reviews = [
    re.sub(r'^[_\W]{2,}', '', paragraph)
    for paragraph in paragraph_texts
    if not paragraph.startswith(("Embed is unavailable", "Photo by"))
]

# Drop the first 5 and the last 12 entries
# (presumably page intro/footer paragraphs - verify against the printed output)
cleaned_reviews = cleaned_reviews[5:-12]

# Display the cleaned reviews
for review in cleaned_reviews:
    print(review)

In [None]:
# Merge the paragraphs that precede each "Listen:" line into one review string
concatenated_reviews = []
pending_paragraphs = []  # paragraphs collected since the previous "Listen:" line

for item in cleaned_reviews:
    if not item.startswith("Listen:"):
        pending_paragraphs.append(item)
    elif pending_paragraphs:
        # A "Listen:" line closes the review collected so far
        concatenated_reviews.append(" ".join(pending_paragraphs).strip())
        pending_paragraphs = []

# Print the concatenated reviews
for review in concatenated_reviews:
    print(review)

In [None]:
# Extract the reviewer: the text after the last "-" or "–" of each review.
# NOTE(review): because the split happens at the last hyphen/en dash, a hyphenated
# surname would be cut at its hyphen - check the printed names.
reviewer_full_names = []

for review in concatenated_reviews:
    # Split once, at the last "-" or "–" in the text
    parts = re.split(r'[-–](?=[^-–]*$)', review)

    # Always append exactly one entry per review so this list stays aligned with
    # concatenated_reviews; "Unknown" marks reviews without any dash
    # (same placeholder as the 2016 section).
    reviewer_full_names.append(parts[1].strip() if len(parts) == 2 else "Unknown")

# Print the extracted reviewer full names
for full_name in reviewer_full_names:
    print("Reviewer's Full Name:", full_name)
    print()

len(reviewer_full_names)


In [None]:
# Count the entries of cleaned_reviews that start with "Listen:"
listen_count = sum(item.startswith("Listen:") for item in cleaned_reviews)

# Print the total count of elements starting with "Listen:"
print("Total elements starting with 'Listen:' =", listen_count)

## Initialize variables to store artists and songs

In [None]:
# Collect the "Listen:" lines, which carry the artist and song of each track
artists_songs = [item for item in cleaned_reviews if item.startswith("Listen:")]

# Print the collected lines, then how many there are
for listen_line in artists_songs:
    print("Artist_song:", listen_line)

print(len(artists_songs))

## Prepare Artists and Songs

In [None]:
# Split every "Listen:" line into artist and song.
artists = []
songs = []  # Shortened titles only; the DataFrame uses songs2017_cleaned for the full names

# Preferred shape: Listen:<artist>, “<song>”
listen_pattern = re.compile(r"^Listen:(.*?), “(.*?)”")

for artist_song in artists_songs:
    match = listen_pattern.match(artist_song)
    if match:
        artist, song = match.groups()
    elif '"' in artist_song:
        # Fallback 1: split on the first straight double quote
        head, quote, song = artist_song.partition('"')
        artist = head.replace("Listen:", "")
    else:
        # Fallback 2: take what follows the first ":" and split it on the first “
        rest = artist_song.partition(':')[2].strip()
        artist, quote, song = rest.partition("“")
        if not quote:
            # Nothing usable: report the entry instead of raising IndexError
            print("Entry not matched by regex:", artist_song)
            continue

    # Append the artist and song to their respective lists
    artists.append(artist.strip())
    songs.append(song.strip())

## Create Additional Columns

In [None]:
# Create the ranking: 100 down to 1
rank_100 = [i for i in range(100, 0, -1)]

# Constant columns, one value per row
year_2017 = [2017] * 100

source_2017 = ['Pitchfork'] * 100

source_title_2017 = ['The 100 Best Songs of 2017'] * 100

# Backward-compatible aliases: the following cells still read the misnamed
# *_2007 variables, so keep them pointing at the 2017 values.
source_2007 = source_2017
source_title_2007 = source_title_2017

## Create the DF

In [None]:
# Column values, in the same order as columns_name below
columns_list = [artists, songs2017_cleaned, year_2017, concatenated_reviews, reviewer_full_names, rank_100, source_2007, source_title_2007]

# Sanity check: print the length of every column.
# The loop variable is deliberately not called `list`, which would rebind the
# builtin for the rest of the notebook session.
for column in columns_list:
    print(len(column))

columns_name = ['Artist', 'Song', 'Year', 'Review', 'Reviewer','Ranking','Source','Source Title']

In [None]:
# Column name -> values for the 2017 table, in output order
columns_2017 = {
    'Artist': artists,
    'Song': songs2017_cleaned,
    'Year': year_2017,
    'Review': concatenated_reviews,
    'Reviewer': reviewer_full_names,
    'Ranking': rank_100,
    'Source': source_2007,
    'Source Title': source_title_2007,
}
df2017 = pd.DataFrame(columns_2017)

#df2017.to_csv('df2017.csv', index=False)

In [None]:
# Preview the first 50 rows of the 2017 DataFrame
display(df2017.head(50))

# Pitchfork: The 100 Best Songs of 2018

In [None]:
# Libraries used by this section. Plain imports are safe to re-run: Python
# caches loaded modules, so repeating an import only rebinds the name and the
# `'name' not in globals()` guards are unnecessary.
import pandas as pd
import numpy as np
import re
import requests
from bs4 import BeautifulSoup

In [None]:
url18 = "https://pitchfork.com/features/lists-and-guides/the-100-best-songs-of-2018/"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url18, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

## Prepare Reviews

In [None]:
# <p> tags that are not nested inside an element with class "CaptionWrapper-jSZdqE"
relevant_p_tags = [
    p for p in soup.find_all('p')
    if p.find_parent(class_='CaptionWrapper-jSZdqE') is None
]

# Stripped text of each paragraph, skipping those that start with
# "Embed is unavailable" or "Photo by". A leading run of two or more
# underscore / non-word characters is removed from the paragraphs that are kept.
paragraph_texts = (tag.get_text(strip=True) for tag in relevant_p_tags)
cleaned_reviews = [
    re.sub(r'^[_\W]{2,}', '', paragraph)
    for paragraph in paragraph_texts
    if not paragraph.startswith(("Embed is unavailable", "Photo by"))
]

# Drop the first 5 and the last 12 entries
# (presumably page intro/footer paragraphs - verify against the printed output)
cleaned_reviews = cleaned_reviews[5:-12]

# Display the cleaned reviews
for review in cleaned_reviews:
    print(review)

## Concatenate Reviews

In [None]:
# Merge the paragraphs that precede each "Listen:" line into one review string
concatenated_reviews = []
pending_paragraphs = []  # paragraphs collected since the previous "Listen:" line

for item in cleaned_reviews:
    if not item.startswith("Listen:"):
        pending_paragraphs.append(item)
    elif pending_paragraphs:
        # A "Listen:" line closes the review collected so far
        concatenated_reviews.append(" ".join(pending_paragraphs).strip())
        pending_paragraphs = []

# Print the concatenated reviews
for review in concatenated_reviews:
    print(review)

print(len(concatenated_reviews))
#100 results

## Extract Reviewers

In [None]:
# Extract the reviewer: split each review at its last "-" or "–" into the review
# body and the reviewer's name.
# NOTE(review): because the split happens at the last hyphen/en dash, a hyphenated
# surname would be cut at its hyphen - check the printed names.
reviewer_full_names2018 = []
review_full2018 = []

for review in concatenated_reviews:
    # Split once, at the last "-" or "–" in the text
    parts = re.split(r'[-–](?=[^-–]*$)', review)

    if len(parts) == 2:
        review_full2018.append(parts[0].strip())
        reviewer_full_names2018.append(parts[1].strip())
    else:
        # No dash at all: keep the row so both lists stay aligned with the other
        # per-track lists, and mark the reviewer as "Unknown" (as the 2016 section does).
        review_full2018.append(review.strip())
        reviewer_full_names2018.append("Unknown")

# Print the extracted reviewer full names
for full_name in reviewer_full_names2018:
    print("Reviewer's Full Name:", full_name)
    print()

print(len(reviewer_full_names2018))
print(len(review_full2018))


## Prepare Artists and Songs

In [None]:
# Collect the "Listen:" lines, which carry the artist and song of each track
artists_songs = [item for item in cleaned_reviews if item.startswith("Listen:")]

# Print the collected lines, then how many there are
for listen_line in artists_songs:
    print("Artist_song:", listen_line)

print(len(artists_songs))

In [None]:
# Inspect the collected "Listen:" lines
artists_songs

## Initialize Variables to Store Artists and Songs

In [None]:
artists2018 = []
songs2018 = []

# Expected shape: Listen:<artist>, “<song>” optionally followed by " [...]".
# The third group starts at the closing ”, so the stored title keeps that quote
# plus any bracketed suffix.
listen_line = re.compile(r"^Listen:(.*?), “(.*?)(”(?: \[.*\])?)$")

for artist_song in artists_songs:
    match = listen_line.match(artist_song)
    if not match:
        # Report lines that do not fit the expected shape; nothing is stored for them
        print(f"Failed to match: {artist_song}")
        continue
    artist, song, extra = match.groups()
    artists2018.append(artist.strip())
    songs2018.append(f"{song}{extra}".strip())

# Print results to verify
for artist, song in zip(artists2018, songs2018):
    print(f"Artist: {artist}, Song: {song}")

## Create Additional Columns

In [None]:
# Number of rows in the 2018 table
n_tracks = 100

# Ranking column: 100 down to 1
rank_100 = [n_tracks - offset for offset in range(n_tracks)]

# Constant columns, one value per row
year_2018 = n_tracks * [2018]
source_2018 = n_tracks * ['Pitchfork']
source_title_2018 = n_tracks * ['The 100 Best Songs of 2018']


### Check the Length

In [None]:
# Column values, in the same order as columns_name below
columns_list2018 = [artists2018, songs2018, year_2018, review_full2018, reviewer_full_names2018, rank_100, source_2018, source_title_2018]

# Sanity check: print the length of every column.
# The loop variable is deliberately not called `list`, which would rebind the
# builtin for the rest of the notebook session.
for column in columns_list2018:
    print(len(column))

columns_name = ['Artist', 'Song', 'Year', 'Review', 'Reviewer','Ranking','Source','Source Title']

## Create the DF

In [None]:
# Pair each column name with its list of values, in order
data_dict = {name: values for name, values in zip(columns_name, columns_list2018)}

# Build the 2018 DataFrame from that mapping
df2018 = pd.DataFrame(data_dict)

#df2018.to_csv('df_raw/webscraping/df2018.csv', index=False)


# Pitchfork: The 100 Best Songs of 2019

In [None]:
# Libraries used by this section. Plain imports are safe to re-run: Python
# caches loaded modules, so repeating an import only rebinds the name and the
# `'name' not in globals()` guards are unnecessary.
import pandas as pd
import numpy as np
import re
import requests
from bs4 import BeautifulSoup

In [None]:
url19 = "https://pitchfork.com/features/lists-and-guides/best-songs-2019/"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url19, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

## Prepare Reviews

In [None]:
# <p> tags that are not nested inside an element with class "CaptionWrapper-jSZdqE"
relevant_p_tags = [
    p for p in soup.find_all('p')
    if p.find_parent(class_='CaptionWrapper-jSZdqE') is None
]

# Stripped text of each paragraph, skipping those that start with
# "Embed is unavailable" or "Photo by". A leading run of two or more
# underscore / non-word characters is removed from the paragraphs that are kept.
paragraph_texts = (tag.get_text(strip=True) for tag in relevant_p_tags)
cleaned_reviews = [
    re.sub(r'^[_\W]{2,}', '', paragraph)
    for paragraph in paragraph_texts
    if not paragraph.startswith(("Embed is unavailable", "Photo by"))
]

# Drop the first 6 and the last 12 entries
# (presumably page intro/footer paragraphs - verify against the printed output)
cleaned_reviews = cleaned_reviews[6:-12]

# Display the cleaned reviews
for review in cleaned_reviews:
    print(review)

## Concatenate Reviews

In [None]:
# Merge the paragraphs that precede each "Listen:" line into one review string
concatenated_reviews = []
pending_paragraphs = []  # paragraphs collected since the previous "Listen:" line

for item in cleaned_reviews:
    if not item.startswith("Listen:"):
        pending_paragraphs.append(item)
    elif pending_paragraphs:
        # A "Listen:" line closes the review collected so far
        concatenated_reviews.append(" ".join(pending_paragraphs).strip())
        pending_paragraphs = []

# Print the concatenated reviews
for review in concatenated_reviews:
    print(review)

print(len(concatenated_reviews))
#100 results

## Extract Reviewers

In [None]:
# Split each review into its body and the reviewer signature that follows the
# final dash ("... text. –Reviewer Name").

reviewer_full_names2019 = []  # reviewer names, index-aligned with reviews_full2019
reviews_full2019 = []  # review bodies with the signature removed

# Matches only the LAST "-" or "–" in the text (no further dash may follow it)
signature_dash = re.compile(r'[-–](?=[^-–]*$)')

for review in concatenated_reviews:
    parts = signature_dash.split(review)

    if len(parts) == 2:
        review_body, reviewer_name = (part.strip() for part in parts)
    else:
        # No dash at all: keep the row (with the placeholder used elsewhere in
        # this notebook) instead of dropping it, so these lists stay aligned
        # with the artist/song lists built from the same page.
        review_body, reviewer_name = review.strip(), 'Unknown'
        print(f"No reviewer signature found: {review[:60]!r}")

    reviews_full2019.append(review_body)
    reviewer_full_names2019.append(reviewer_name)

print(len(reviewer_full_names2019))
print(len(reviews_full2019))


## Prepare Artist and Songs

In [None]:
# Keep only the "Listen:" lines, unchanged; they are split into artist and
# song in a later cell.
artists_songs = [item for item in cleaned_reviews if item.startswith("Listen:")]

# Print the extracted "Listen:" lines
for artist in artists_songs:
    print("Artist_song:", artist)

print(len(artists_songs))

## Initialize variables to store artists and songs

In [None]:
artists2019 = []
songs2019 = []

# Listen:<artist>, “<song>” with an optional " [...]" suffix after the closing quote.
# Group 3 holds the closing quote plus that optional suffix.
listen_line = re.compile(r"^Listen:(.*?), “(.*?)(”(?: \[.*\])?)$")

for artist_song in artists_songs:
    found = listen_line.match(artist_song)
    if found is None:
        # Report lines that do not fit the expected shape; nothing is appended
        print(f"Failed to match: {artist_song}")
        continue
    artists2019.append(found.group(1).strip())
    # NOTE(review): the stored song keeps the closing ” (group 3) - confirm this is intended
    songs2019.append((found.group(2) + found.group(3)).strip())

# Print results to verify
for artist, song in zip(artists2019, songs2019):
    print(f"Artist: {artist}, Song: {song}")

## Create Additional Columns

In [None]:
# Rankings count down from 100 to 1
rank_100 = [*range(100, 0, -1)]

# Constant columns: one identical value per ranked entry
year_2019 = [2019 for _ in rank_100]

source_2019 = ['Pitchfork' for _ in rank_100]

source_title_2019 = ['The 100 Best Songs of 2019' for _ in rank_100]

### Check the Length

In [None]:
# Column values for the 2019 table, in the same order as columns_name below
columns_list2019 = [artists2019, songs2019, year_2019, reviews_full2019, reviewer_full_names2019, rank_100, source_2019, source_title_2019]

# Print every column's length; they must all match before building the DataFrame.
# The loop variable is deliberately not called `list`: rebinding that name would
# hide the builtin for every later cell in this kernel.
for column in columns_list2019:
    print(len(column))

columns_name = ['Artist', 'Song', 'Year', 'Review', 'Reviewer','Ranking','Source','Source Title']

## Create the DF 2019

In [None]:
# Pair each column name with its list of values (order follows columns_name)
data_dict = {name: values for name, values in zip(columns_name, columns_list2019)}

# Build the 2019 table from the name -> values mapping
df2019 = pd.DataFrame(data_dict)

#df2019.to_csv('df_raw/webscraping/df2019.csv', index=False)


# Pitchfork: The 100 Best Songs of 2020

In [97]:
# Check and import pandas if not already imported
if 'pd' not in globals():
    import pandas as pd

# Check and import numpy if not already imported
if 'np' not in globals():
    import numpy as np

# Check and import re if not already imported
if 're' not in globals():
    import re

# Check and import requests if not already imported
if 'requests' not in globals():
    import requests

# Check and import BeautifulSoup from bs4 if not already imported
if 'BeautifulSoup' not in globals():
    from bs4 import BeautifulSoup

In [98]:
url20 = "https://pitchfork.com/features/lists-and-guides/best-songs-2020/"
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error so an error page is never parsed as the article.
response = requests.get(url20, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

<!DOCTYPE html>
<html lang="en-US">
 <head>
  <title>
   The 100 Best Songs of 2020 | Pitchfork
  </title>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="no" name="msapplication-tap-highlight"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="Condé Nast" name="author"/>
  <meta content="Copyright (c) Condé Nast 2024" name="copyright"/>
  <meta content="The tracks that defined this bizarre year, featuring Megan Thee Stallion, the Weeknd, Christine and the Queens, Noname, Waxahatchee, and more" name="description"/>
  <meta content="5fc7ab6dee65027d9ed930c6" name="id"/>
  <meta content="best of 2020" name="keywords"/>
  <meta content="best of 2020" name="news_keywords"/>
  <meta content="index, follow, noarchive, max-image-preview:large" name="robots"/>
  <meta content="article" name="content-type"/>
  <meta content="5fc7ab6dee65027d9ed930c6" name="parsely-post-id"/>
  <meta content='{"descriptio

## Reviews and Reviewers

In [99]:
def get_reviews(soup):
    """Collect review texts from a parsed list page.

    Walks ``soup.descendants`` in document order and gathers the text of
    ``<p>`` elements into one string per review:

    * a ``<span>`` whose classes include ``caption__text`` causes the next
      ``<p>`` to be skipped (per the original comment, it holds the record label);
    * a ``<p>`` whose text contains ``'Listen:'`` is appended and closes the
      current review;
    * a ``<strong>`` whose text is all digits closes the current review if
      any text is pending.

    Text still pending when the walk ends is discarded.

    Args:
        soup: Parsed document exposing ``descendants``.

    Returns:
        List of review strings; the paragraphs of one review are joined with a
        single space and stripped.
    """
    collected = []
    pending = []
    label_follows = False  # True right after a caption span was seen

    for node in soup.descendants:
        tag_name = node.name

        if tag_name == 'span' and 'caption__text' in node.get('class', []):
            label_follows = True
            continue

        if tag_name == 'p':
            if label_follows:
                # This paragraph is the one flagged by the caption span; skip it
                label_follows = False
                continue
            text = node.get_text()
            pending.append(text)
            if 'Listen:' in text:
                # The "Listen:" paragraph ends the review
                collected.append(' '.join(pending).strip())
                pending = []
            continue

        if tag_name == 'strong' and node.get_text().isdigit() and pending:
            # A numeric <strong> starts a new entry; flush what was gathered
            collected.append(' '.join(pending).strip())
            pending = []

    return collected

# Run the scraper over the parsed 2020 page
reviews2020 = get_reviews(soup)

# Dump every scraped entry followed by a '---' rule so entry boundaries are visible
for review in reviews2020:
    print(review, '---', sep='\n')
    

Find anything you save across the site in your account  Find anything you save across the site in your account  By Pitchfork Whether your year was more “people, I’ve been sad,” “certified freak, seven days a week,” or “fetch the bolt cutters,” the best songs of 2020 provided a brief escape from the turmoil outside our windows. They offered a comforting shoulder to cry on, a lit match to long-simmering rage, and a temporary substitute for the dancefloors and mosh pits the pandemic stole from us. Until the day we get to gather again in sweaty clubs, packed basements, and sold-out arenas, we’ll keep turning to these 100 tracks to soundtrack our lives. Listen to selections from this list on our Spotify playlist and Apple Music playlist. Check out all of Pitchfork’s 2020 wrap-up coverage here. After years of playing in emo bands and releasing candy-coated electropop, Ela Minus splits the difference on “dominique.” The standout from her debut LP, acts of rebellion, is a depressive ode to sle

In [100]:
# Manual cleaning: the first scraped entry also contains page boilerplate and the
# list introduction (see the output of the previous cell), so it is overwritten
# with the text of the first review only.
to_replace = 'After years of playing in emo bands and releasing candy-coated electropop, Ela Minus splits the difference on “dominique.” The standout from her debut LP, acts of rebellion, is a depressive ode to sleeping all day and never leaving the house set to bright, buoyant melodies. Even as she teeters on the edge of the abyss, wasting away with coffee and liquor as her only companions, she finds humor in the downward spiral: “I should probably eat something that’s not liquid,” she deadpans. Recorded more than a year before much of humanity was sheltering in place, its themes of isolation and delirium feel prescient, offering a view from indoors that, for many, will look like a reflection. –Matthew Ismael Ruiz'
reviews2020[0] = to_replace

In [101]:
def extract_before_listen(strings):
    """Return the text preceding the first "Listen:" marker of each string.

    The marker sometimes appears as "iListen:"; in that case the stray "i"
    is removed together with the marker. (Splitting on "Listen:" first, as the
    previous version did, always left that "i" on the returned text, and the
    "iListen:" fallback could never take effect.)

    Args:
        strings: Iterable of review strings.

    Returns:
        List with one stripped string per input; an input without the marker
        is returned whole (stripped).
    """
    extracted_parts = []
    for string in strings:
        cut = string.find("Listen:")
        if cut == -1:
            head = string
        else:
            # Swallow the "i" of an "iListen:" marker as well
            if cut > 0 and string[cut - 1] == "i":
                cut -= 1
            head = string[:cut]
        extracted_parts.append(head.strip())
    return extracted_parts

# Trim the trailing "Listen: ..." segment from every scraped 2020 entry
extracted_parts = extract_before_listen(reviews2020)

# Show the trimmed entries, one per line
for trimmed_review in extracted_parts:
    print(trimmed_review)

After years of playing in emo bands and releasing candy-coated electropop, Ela Minus splits the difference on “dominique.” The standout from her debut LP, acts of rebellion, is a depressive ode to sleeping all day and never leaving the house set to bright, buoyant melodies. Even as she teeters on the edge of the abyss, wasting away with coffee and liquor as her only companions, she finds humor in the downward spiral: “I should probably eat something that’s not liquid,” she deadpans. Recorded more than a year before much of humanity was sheltering in place, its themes of isolation and delirium feel prescient, offering a view from indoors that, for many, will look like a reflection. –Matthew Ismael Ruiz
The jarring dissonance between a four-on-the-floor acid-house beat and the burden of having a body makes electronic duo Pale Blue’s “I Walk Alone With Acid” more incisive than your average dance banger. “Is my body mine?/Is my fear real?” Elizabeth Wight asks breathily over Mike Simonetti

In [102]:
# Rebuild every entry as a single string by joining its pieces
# (for an entry that is already a string this yields an equal string)
res = ["".join(entry) for entry in extracted_parts]


In [103]:
# Split each trimmed 2020 entry into the review body and the reviewer
# signature that follows the final en dash ("... reflection. –Matthew Ismael Ruiz").
reviewers2020 = []  # reviewer names, index-aligned with reviews_only_2020
reviews_only_2020 = []  # review bodies with the signature removed

for sublist_str in res:
    if not sublist_str:  # Nothing was scraped for this entry
        continue

    body, dash, signature = sublist_str.rpartition('–')
    if not dash:
        # No en dash: keep the whole text as the review rather than slicing it
        # away and storing the review as the reviewer's name.
        body, signature = sublist_str, ''

    # Trailing whitespace is stripped explicitly, so a missing space before
    # the dash can no longer eat the last character of the review.
    first_element = body.rstrip()
    last_element = signature.strip() or 'Unknown'
    print(first_element)

    reviewers2020.append(last_element)
    reviews_only_2020.append(first_element)

After years of playing in emo bands and releasing candy-coated electropop, Ela Minus splits the difference on “dominique.” The standout from her debut LP, acts of rebellion, is a depressive ode to sleeping all day and never leaving the house set to bright, buoyant melodies. Even as she teeters on the edge of the abyss, wasting away with coffee and liquor as her only companions, she finds humor in the downward spiral: “I should probably eat something that’s not liquid,” she deadpans. Recorded more than a year before much of humanity was sheltering in place, its themes of isolation and delirium feel prescient, offering a view from indoors that, for many, will look like a reflection.
The jarring dissonance between a four-on-the-floor acid-house beat and the burden of having a body makes electronic duo Pale Blue’s “I Walk Alone With Acid” more incisive than your average dance banger. “Is my body mine?/Is my fear real?” Elizabeth Wight asks breathily over Mike Simonetti’s propulsive beats, 

### Artists and Songs

In [105]:
artist_song_review = []  # flat list: heading, review, heading, review, ...
artist_songs = []  # heading texts only

# Each <h2> is treated as an "artist: song" heading
for heading in soup.find_all('h2'):
    heading_text = heading.get_text(strip=True)
    artist_songs.append(heading_text)

    # Gather the run of <p> siblings that directly follows the heading;
    # the run ends at the first sibling that is not a <p>.
    paragraph_texts = []
    sibling = heading.find_next_sibling()
    while sibling and sibling.name == 'p':
        paragraph_texts.append(sibling.get_text(strip=True))
        sibling = sibling.find_next_sibling()

    # One space-joined review string per heading (empty if no <p> followed)
    artist_song_review.extend((heading_text, " ".join(paragraph_texts)))

In [106]:
# Display the flat list of alternating heading and review strings
artist_song_review

['Ela Minus: “dominique”',
 'After years of playing in emo bands and releasing candy-coated electropop, Ela Minus splits the difference on “dominique.” The standout from her debut LP,acts of rebellion, is a depressive ode to sleeping all day and never leaving the house set to bright, buoyant melodies. Even as she teeters on the edge of the abyss, wasting away with coffee and liquor as her only companions, she finds humor in the downward spiral: “I should probably eat something that’s not liquid,” she deadpans. Recorded more than a year before much of humanity was sheltering in place, its themes of isolation and delirium feel prescient, offering a view from indoors that, for many, will look like a reflection. –Matthew Ismael Ruiz',
 'Pale Blue: “I Walk Alone With Acid”',
 'The jarring dissonance between a four-on-the-floor acid-house beat and the burden of having a body makes electronic duo Pale Blue’s “I Walk Alone With Acid” more incisive than your average dance banger. “Is my body mi

In [107]:
# reviews = []
# # This assumes the structure alternates between titles and review content.
# for i in range(1, len(artist_song_review), 2):
#     review = artist_song_review[i]
#     dash_index = review.rfind(' –')
#     if dash_index != -1:
#         review = review[:dash_index]
#     reviews.append(review)

# # Check if there are any reviews missed due to unexpected artist_song_review indexing
# if len(artist_song_review) % 2 != 0:  # If there's an odd number of elements, last may be a review
#     last_review = artist_song_review[-1]
#     dash_index = last_review.rfind(' –')
#     if dash_index != -1:
#         last_review = last_review[:dash_index]
#         reviews.append(last_review)

# for review in reviews:
#     print(review)

In [108]:
# reviews_with_reviewers = []
# # This assumes the structure alternates between titles and review content.
# for i in range(1, len(artist_song_review), 2):
#     review = artist_song_review[i]
#     reviews_with_reviewers.append(review)

# # Check if there are any reviews missed due to unexpected artist_song_review indexing
# if len(artist_song_review) % 2 != 0:  # If there's an odd number of elements, last may be a review
#     last_review = artist_song_review[-1]
#     reviews_with_reviewers.append(last_review)

# for review in reviews_with_reviewers:
#     print(review)

## Artists and Songs

In [109]:
# Split every "artist: song" heading into its two halves
artists2020 = []
songs2020 = []

for heading_text in artist_songs:
    pieces = heading_text.split(':')
    if len(pieces) != 2:
        # Headings without exactly one ':' are left out of both lists
        continue
    artist_part, song_part = pieces
    artists2020.append(artist_part.strip())
    songs2020.append(song_part.strip())

# Both lengths should match the number of ranked songs

print(len(artists2020))

print(len(songs2020))

100
100


## Prepare Reviews

## Create Additional Columns

In [110]:
# Rankings count down from 100 to 1
rank_100 = [*range(100, 0, -1)]

# Constant columns: one identical value per ranked entry
year_2020 = [2020 for _ in rank_100]

source_2020 = ['Pitchfork' for _ in rank_100]

source_title_2020 = ['The 100 Best Songs of 2020' for _ in rank_100]

### Check the Length

In [111]:
# Column values for the 2020 table, in the same order as columns_name below
columns_list2020 = [artists2020, songs2020, year_2020, reviews_only_2020, reviewers2020, rank_100, source_2020, source_title_2020]

# Print every column's length; they must all match before building the DataFrame.
# The loop variable is deliberately not called `list`: rebinding that name would
# hide the builtin for every later cell in this kernel.
for column in columns_list2020:
    print(len(column))

columns_name = ['Artist', 'Song', 'Year', 'Review', 'Reviewer','Ranking','Source','Source Title']

100
100
100
100
100
100
100
100


## Create the DF

In [112]:
# Assemble the 2020 table; names and value lists are paired position by position
column_names_2020 = ('Artist', 'Song', 'Year', 'Review', 'Reviewer', 'Ranking', 'Source', 'Source Title')
column_values_2020 = (
    artists2020,
    songs2020,
    year_2020,
    reviews_only_2020,
    reviewers2020,
    rank_100,
    source_2020,
    source_title_2020,
)
df2020 = pd.DataFrame(dict(zip(column_names_2020, column_values_2020)))

# Show the rows whose reviewer is the 'Unknown' placeholder
df2020.loc[df2020['Reviewer'] == 'Unknown']

Unnamed: 0,Artist,Song,Year,Review,Reviewer,Ranking,Source,Source Title


In [113]:
# # #Fill up those unknowns

# reviewers = ['Vrinda Jagota','Matthew Strauss','Jazz Monroe','Puja Patel']

# # Iterate over the reviewers list and replace 'Unknown' one by one
# for reviewer in reviewers:
#     df2020['Reviewer'] = df2020['Reviewer'].replace('Unknown', reviewer, inplace=True)

In [114]:
# df2020.to_csv('df_raw/webscraping/df2020.csv', index=False)

# Pitchfork 2021

In [None]:
url21 = "https://pitchfork.com/features/lists-and-guides/best-songs-2021/"
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error so an error page is never parsed as the article.
response = requests.get(url21, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

## Prepare Reviews

In [None]:
# Gather every <p> that is not nested inside a caption wrapper; the caption
# paragraphs carry the record label rather than review text.
relevant_p_tags = [
    p
    for p in soup.find_all('p')
    if not p.find_parent(class_='CaptionWrapper-jSZdqE')
]

# Paragraphs opening with one of these prefixes are embed/photo notices, not reviews
noise_prefixes = ("Embed is unavailable", "Photo by")
# A leading run of two or more underscores / non-word characters is stripped
leading_junk = re.compile(r'^[_\W]{2,}')

paragraph_texts = (tag.get_text(strip=True) for tag in relevant_p_tags)
cleaned_reviews = [
    leading_junk.sub('', text)
    for text in paragraph_texts
    if not text.startswith(noise_prefixes)
]

# Drop the first 6 and the last 12 paragraphs.
# NOTE(review): presumably page boilerplate around the list - re-check these
# offsets if the page layout changes.
cleaned_reviews = cleaned_reviews[6:-12]

# Display the cleaned reviews
for review in cleaned_reviews:
    print(review)

## Concatenate Reviews

In [None]:
# Merge consecutive paragraphs into one review per song. A "Listen:" line
# closes the review collected so far; text after the last "Listen:" line
# is never emitted.
concatenated_reviews = []
pending_paragraphs = []

for item in cleaned_reviews:
    if not item.startswith("Listen:"):
        pending_paragraphs.append(item)
        continue
    if pending_paragraphs:
        concatenated_reviews.append(" ".join(pending_paragraphs).strip())
        pending_paragraphs = []

# Print the concatenated reviews
for review in concatenated_reviews:
    print(review)

print(len(concatenated_reviews))
#100 results

## Extract Reviewers

In [None]:
# Split each review into its body and the reviewer signature that follows the
# final dash ("... text. –Reviewer Name").

reviewer_full_names2021 = []  # reviewer names, index-aligned with cleaned_reviews2021
cleaned_reviews2021 = []  # review bodies with the signature removed

# Matches only the LAST "-" or "–" in the text (no further dash may follow it)
signature_dash = re.compile(r'[-–](?=[^-–]*$)')

for review in concatenated_reviews:
    parts = signature_dash.split(review)

    if len(parts) == 2:
        review_body, reviewer_name = (part.strip() for part in parts)
    else:
        # No dash at all: keep the row (with the placeholder used elsewhere in
        # this notebook) instead of dropping it, so these lists stay aligned
        # with the artist/song lists built from the same page.
        review_body, reviewer_name = review.strip(), 'Unknown'
        print(f"No reviewer signature found: {review[:60]!r}")

    cleaned_reviews2021.append(review_body)
    reviewer_full_names2021.append(reviewer_name)

print(len(reviewer_full_names2021))
print(len(cleaned_reviews2021))



## Prepare Artist and Songs

In [None]:
# Keep only the "Listen:" lines, unchanged; they are split into artist and
# song in a later cell.
artists_songs = [item for item in cleaned_reviews if item.startswith("Listen:")]

# Print the extracted "Listen:" lines
for artist in artists_songs:
    print("Artist_song:", artist)

print(len(artists_songs))

## Initialize Variables to Store Artist and Song

In [None]:
# Split every "Listen:" line into artist and song
artists = []
songs = []

# Primary shape: Listen:<artist>, “<song>”
listen_pattern = re.compile(r"^Listen:(.*?), “(.*?)”")

for artist_song in artists_songs:
    match = listen_pattern.match(artist_song)
    if match:
        artist, song = match.groups()
        artists.append(artist.strip())
        songs.append(song.strip())
        continue

    # Fallback 1: split at the first straight double quote
    before_quote, quote, after_quote = artist_song.partition('"')
    if quote:
        artists.append(before_quote.replace("Listen:", "").strip())
        songs.append(after_quote.strip())
        continue

    # Fallback 2: take the text after the first ':' and split it at the
    # opening curly quote. Both separators are checked up front, so a line
    # without “ is reported below instead of raising IndexError.
    _, colon, remainder = artist_song.partition(':')
    artist, open_quote, song = remainder.strip().partition("“")
    if colon and open_quote:
        artists.append(artist.strip())
        songs.append(song.strip())
    else:
        print("Entry not matched by regex:", artist_song)

## Create Additional Columns

In [None]:
# Rankings count down from 100 to 1
rank_100 = [*range(100, 0, -1)]

# Constant columns: one identical value per ranked entry
year_2021 = [2021 for _ in rank_100]

source_2021 = ['Pitchfork' for _ in rank_100]

source_title_2021 = ['The 100 Best Songs of 2021' for _ in rank_100]

### Check the Length

In [None]:
# Column values for the 2021 table, in the same order as columns_name below
columns_list2021 = [artists, songs, year_2021, cleaned_reviews2021, reviewer_full_names2021, rank_100, source_2021, source_title_2021]

# Print every column's length; they must all match before building the DataFrame.
# This iterates the list defined just above (the previous code looped over
# `columns_list`, which this cell never defines). The loop variable is also not
# called `list`, so the builtin stays usable in later cells.
for column in columns_list2021:
    print(len(column))

columns_name = ['Artist', 'Song', 'Year', 'Review', 'Reviewer','Ranking','Source','Source Title']

## Create the DF

In [None]:
# Pair each column name with its list of values (order follows columns_name)
data_dict = {name: values for name, values in zip(columns_name, columns_list2021)}

# Build the 2021 table from the name -> values mapping
df2021 = pd.DataFrame(data_dict)

# Persist the table without the index column
df2021.to_csv('df_raw/webscraping/df2021.csv', index=False)

# Pitchfork 2022

In [None]:
url22 = "https://pitchfork.com/features/lists-and-guides/best-songs-2022/"
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error so an error page is never parsed as the article.
response = requests.get(url22, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

## Prepare Reviews

In [None]:
# Gather every <p> that is not nested inside a caption wrapper; the caption
# paragraphs carry the record label rather than review text.
relevant_p_tags = [
    p
    for p in soup.find_all('p')
    if not p.find_parent(class_='CaptionWrapper-jSZdqE')
]

# Paragraphs opening with one of these prefixes are embed/photo notices, not reviews
noise_prefixes = ("Embed is unavailable", "Photo by")
# A leading run of two or more underscores / non-word characters is stripped
leading_junk = re.compile(r'^[_\W]{2,}')

paragraph_texts = (tag.get_text(strip=True) for tag in relevant_p_tags)
cleaned_reviews = [
    leading_junk.sub('', text)
    for text in paragraph_texts
    if not text.startswith(noise_prefixes)
]

# Drop the first 6 and the last 12 paragraphs.
# NOTE(review): presumably page boilerplate around the list - re-check these
# offsets if the page layout changes.
cleaned_reviews = cleaned_reviews[6:-12]

# Display the cleaned reviews
for review in cleaned_reviews:
    print(review)

In [None]:
# Count the paragraphs that open with "Listen:" (each True adds 1)
count = sum(element.startswith("Listen:") for element in cleaned_reviews)

print("Number of elements starting with 'Listen:' =", count)

## Concatenate Reviews

In [None]:
# Merge consecutive paragraphs into one review per song. A "Listen:" line
# (or its misspelling "Lisen:") closes the review collected so far; text
# after the last such line is never emitted.
concatenated_reviews = []
pending_paragraphs = []
listen_markers = ("Listen:", "Lisen:")

for item in cleaned_reviews:
    if not item.startswith(listen_markers):
        pending_paragraphs.append(item)
        continue
    if pending_paragraphs:
        concatenated_reviews.append(" ".join(pending_paragraphs).strip())
        pending_paragraphs = []

# Print the concatenated reviews
for review in concatenated_reviews:
    print(review)

print(len(concatenated_reviews))
#100 results

In [None]:
# Display the merged reviews as a list for inspection
concatenated_reviews

## Extract Reviewers

In [None]:
# Split each review into its body and the reviewer signature that follows the
# final dash ("... text. –Reviewer Name").

reviewer_full_names = []  # reviewer names, index-aligned with cleaned_reviews2022
cleaned_reviews2022 = []  # review bodies with the signature removed

# Matches only the LAST "-" or "–" in the text (no further dash may follow it)
signature_dash = re.compile(r'[-–](?=[^-–]*$)')

for review in concatenated_reviews:
    parts = signature_dash.split(review)

    if len(parts) == 2:
        review_body, reviewer_name = (part.strip() for part in parts)
    else:
        # No dash at all: keep the row (with the placeholder used elsewhere in
        # this notebook) instead of dropping it, so these lists stay aligned
        # with the artist/song lists built from the same page.
        review_body, reviewer_name = review.strip(), 'Unknown'
        print(f"No reviewer signature found: {review[:60]!r}")

    cleaned_reviews2022.append(review_body)
    reviewer_full_names.append(reviewer_name)

print(len(reviewer_full_names))
print(len(cleaned_reviews2022))


## Prepare Artist and Songs

In [None]:
# Keep only the "Listen:" lines (including the misspelled "Lisen:" ones),
# unchanged; they are split into artist and song in a later cell.
artists_songs = [
    item for item in cleaned_reviews if item.startswith(("Listen:", "Lisen:"))
]

# Print the extracted "Listen:" lines
for artist in artists_songs:
    print("Artist_song:", artist)

print(len(artists_songs))

## Initialize Variables to Store Artist and Song

In [None]:
# Split every "Listen:" line into artist and song
artists = []
songs = []

# Primary shape: Listen:<artist>, “<song>”. The optional "t" also accepts the
# misspelled "Lisen:" prefix that the earlier cells of this section keep;
# without it those lines fell through to the colon fallback, which leaves a
# trailing comma on the artist and a closing quote on the song.
listen_pattern = re.compile(r"^List?en:(.*?), “(.*?)”")

for artist_song in artists_songs:
    match = listen_pattern.match(artist_song)
    if match:
        artist, song = match.groups()
        artists.append(artist.strip())
        songs.append(song.strip())
        continue

    # Fallback 1: split at the first straight double quote
    before_quote, quote, after_quote = artist_song.partition('"')
    if quote:
        artist = before_quote.replace("Listen:", "").replace("Lisen:", "").strip()
        artists.append(artist)
        songs.append(after_quote.strip())
        continue

    # Fallback 2: take the text after the first ':' and split it at the
    # opening curly quote. Both separators are checked up front, so a line
    # without “ is reported below instead of raising IndexError.
    _, colon, remainder = artist_song.partition(':')
    artist, open_quote, song = remainder.strip().partition("“")
    if colon and open_quote:
        artists.append(artist.strip())
        songs.append(song.strip())
    else:
        print("Entry not matched by regex:", artist_song)

## Create Additional Columns

In [None]:
# Rankings count down from 100 to 1
rank_100 = [*range(100, 0, -1)]

# Constant columns: one identical value per ranked entry
year_2022 = [2022 for _ in rank_100]

source_2022 = ['Pitchfork' for _ in rank_100]

source_title_2022 = ['The 100 Best Songs of 2022' for _ in rank_100]

### Check the Length

In [None]:
# Column values for the 2022 table, in the same order as columns_name below
columns_list2022 = [artists, songs, year_2022, cleaned_reviews2022, reviewer_full_names, rank_100, source_2022, source_title_2022]

# Print every column's length; they must all match before building the DataFrame.
# This iterates the list defined just above (the previous code looped over
# `columns_list`, which this cell never defines). The loop variable is also not
# called `list`, so the builtin stays usable in later cells.
for column in columns_list2022:
    print(len(column))

columns_name = ['Artist', 'Song', 'Year', 'Review', 'Reviewer','Ranking','Source','Source Title']

## Create the DF

In [None]:
# Pair each column name with its list of values (order follows columns_name)
data_dict = {name: values for name, values in zip(columns_name, columns_list2022)}

# Build the 2022 table from the name -> values mapping
df2022 = pd.DataFrame(data_dict)

#df2022.to_csv('df_raw/webscraping/df2022.csv', index=False)


# Pitchfork 2023

In [None]:
# Pitchfork: The 100 Best Songs of 2023
# Check and import pandas if not already imported
if 'pd' not in globals():
    import pandas as pd

# Check and import numpy if not already imported
if 'np' not in globals():
    import numpy as np

# Check and import re if not already imported
if 're' not in globals():
    import re

# Check and import requests if not already imported
if 'requests' not in globals():
    import requests

# Check and import BeautifulSoup from bs4 if not already imported
if 'BeautifulSoup' not in globals():
    from bs4 import BeautifulSoup

In [None]:
url23 = "https://pitchfork.com/features/lists-and-guides/best-songs-2023/"
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error so an error page is never parsed as the article.
response = requests.get(url23, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())  # This formats the HTML in a readable way

In [None]:
def get_reviews(soup):
    """Collect review texts from a parsed list page.

    Walks ``soup.descendants`` in document order and gathers the text of
    ``<p>`` elements into one string per review:

    * a ``<span>`` whose classes include ``caption__text`` causes the next
      ``<p>`` to be skipped (per the original comment, it holds the record label);
    * a ``<p>`` whose text contains ``'Listen:'`` is appended and closes the
      current review;
    * a ``<strong>`` whose text is all digits closes the current review if
      any text is pending.

    Text still pending when the walk ends is discarded.

    Args:
        soup: Parsed document exposing ``descendants``.

    Returns:
        List of review strings; the paragraphs of one review are joined with a
        single space and stripped.
    """
    collected = []
    pending = []
    label_follows = False  # True right after a caption span was seen

    for node in soup.descendants:
        tag_name = node.name

        if tag_name == 'span' and 'caption__text' in node.get('class', []):
            label_follows = True
            continue

        if tag_name == 'p':
            if label_follows:
                # This paragraph is the one flagged by the caption span; skip it
                label_follows = False
                continue
            text = node.get_text()
            pending.append(text)
            if 'Listen:' in text:
                # The "Listen:" paragraph ends the review
                collected.append(' '.join(pending).strip())
                pending = []
            continue

        if tag_name == 'strong' and node.get_text().isdigit() and pending:
            # A numeric <strong> starts a new entry; flush what was gathered
            collected.append(' '.join(pending).strip())
            pending = []

    return collected

# Run the scraper over the parsed 2023 page
reviews2023 = get_reviews(soup)

# Dump every scraped entry followed by a '---' rule so entry boundaries are visible
for review in reviews2023:
    print(review, '---', sep='\n')
    

In [None]:
def extract_before_listen(strings):
    """Return the part of each string that precedes its "Listen:" marker.

    The marker may also appear as "iListen:"; in that case the leading "i"
    belongs to the marker and is removed as well. Strings without a marker are
    kept whole. Every result is stripped of surrounding whitespace.

    Args:
        strings: Iterable of review strings.

    Returns:
        A list with one trimmed string per input string, in the same order.
    """
    extracted_parts = []
    for string in strings:
        before, marker, _ = string.partition("Listen:")
        # "iListen:" contains "Listen:", so it can never be tried as a fallback
        # split; instead, drop the "i" that the split on "Listen:" leaves behind.
        if marker and before.endswith("i"):
            before = before[:-1]
        extracted_parts.append(before.strip())
    return extracted_parts

# Keep only the text that precedes the "Listen:" marker of every review
extracted_parts = extract_before_listen(reviews2023)

# Echo the trimmed reviews, one per line
for trimmed_review in extracted_parts:
    print(trimmed_review)

In [None]:
# Flatten every entry of extracted_parts into one string.
# str.join does this in a single pass instead of growing the string one
# character at a time with +=.
res = ["".join(part) for part in extracted_parts]


In [None]:
# Split every trimmed review in `res` into the review text and the reviewer
# credited after the final en dash ("... review text. – Reviewer Name").
reviewers2023 = []  # This will store the reviewer names
reviews_only_2023 = []  # This will store the review text without the credit

for sublist_str in res:
    if not sublist_str:  # Empty entries carry neither a review nor a credit
        continue

    # rpartition splits at the LAST en dash, so dashes inside the review are kept
    first_element, dash, last_element = sublist_str.rpartition('–')
    if not dash:
        # No credit found: keep the whole text as the review instead of
        # storing it as the reviewer's name
        first_element, last_element = sublist_str, ''

    # Strip whitespace rather than assuming exactly one space around the dash
    first_element = first_element.rstrip()
    last_element = last_element.strip()
    print(first_element)

    reviewers2023.append(last_element)
    reviews_only_2023.append(first_element)

In [None]:
# Manual correction: overwrite the first extracted review with a hand-written text.
# NOTE(review): presumably the automatic split gets this entry wrong — confirm.
to_replace = 'On her first solo single, former CCFX crooner Mary Jane Dunphe describes a solitary, disorienting view of longing. The boiling point of her sensory confusion is right on the surface, bubbling out of the blown-out bass, tactile beats, and feral, distorted guitar riff. Dunphe’s deep voice seems to move in slow motion as she surveys the chaos. “Love is giving something you don’t have to someone who doesn’t want it,” wrote the French psychoanalyst Jacques Lacan, a scary thought at the heart of Dunphe’s battle cry. It’s reason enough to stay single.'
reviews_only_2023[0] = to_replace

In [None]:
# Show how many reviewers and how many reviews were collected, one count per line
print(len(reviewers2023), len(reviews_only_2023), sep='\n')

## Concatenate Reviews

In [None]:
# # Initialize variables to store concatenated reviews
# concatenated_reviews = []
# current_review = ""

# # Iterate through the input list
# for item in cleaned_reviews:
#     # Check if the item starts with "Listen:"
#     if item.startswith("Listen:") or item.startswith("Lisen:"):
#         # If current_review is not empty, add it to concatenated_reviews
#         if current_review:
#             concatenated_reviews.append(current_review.strip())
#             current_review = ""
#     else:
#         # Append the item to the current_review
#         current_review += item + " "

# # Print the concatenated reviews
# for review in concatenated_reviews:
#     print(review)

# print(len(concatenated_reviews))
# #100 results

## Extract Reviewers

In [None]:
# #Extract the reviewer:

# # Initialize a list to store reviewer names
# reviewer_full_names = []

# # Iterate through each review in the list
# for review in concatenated_reviews:
#     # Split the review based on either "-" or "–"
#     parts = re.split(r'[-–](?=[^-–]*$)', review)
    
#     # If there are two parts (reviewer's name is found)
#     if len(parts) == 2:
#         # Extract the reviewer's full name
#         reviewer_full_name = parts[1].strip()
#         # Append the reviewer's full name to the list
#         reviewer_full_names.append(reviewer_full_name)

# # Print the extracted reviewer full names
# for full_name in reviewer_full_names:
#     print("Reviewer's Full Name:", full_name)
#     print()

# len(reviewer_full_names)


## Prepare Artist and Songs

In [None]:
# Keep only the <p> tags that are not nested inside a caption wrapper (record labels)
relevant_p_tags = [
    p for p in soup.find_all('p')
    if not p.find_parent(class_='CaptionWrapper-jSZdqE')
]

# Cleaned paragraph texts, in document order
cleaned_reviews = []

for paragraph in relevant_p_tags:
    paragraph_text = paragraph.get_text(strip=True)

    # Drop embed placeholders and photo credits
    if paragraph_text.startswith(("Embed is unavailable", "Photo by")):
        continue

    # Remove a leading run of two or more underscores / non-word characters;
    # the substitution leaves texts without such a prefix untouched
    cleaned_reviews.append(re.sub(r'^[_\W]{2,}', '', paragraph_text))

# Discard the first 7 and the last 12 paragraphs
cleaned_reviews = cleaned_reviews[7:-12]


# Display the cleaned reviews
for review in cleaned_reviews:
    print(review)

In [None]:
# Keep only the lines that start with "Listen:" (or the misspelled "Lisen:");
# each of them holds an artist and a song title
artists_songs = [
    entry for entry in cleaned_reviews
    if entry.startswith(("Listen:", "Lisen:"))
]


# Print the selected lines followed by how many were found
for listen_line in artists_songs:
    print("Artist_song:", listen_line)

print(len(artists_songs))

## Initialize Variables to Store Artist and Song

In [None]:
def _split_artist_song(entry):
    """Return ``(artist, song)`` parsed from a "Listen:" line, or ``None``.

    The expected shape is ``Listen:<artist>, “<song>”``. Lines that deviate
    from it (straight quotes, a missing ", " separator, the "Lisen:" typo) are
    split at the first opening quote instead. ``None`` means no quote was
    found, so artist and song cannot be told apart.
    """
    match = re.match(r"^Listen:(.*?), “(.*?)”", entry)
    if match:
        artist, song = match.groups()
        return artist.strip(), song.strip()

    # Drop the leading label so it cannot end up in the artist name
    remainder = entry
    for label in ("Listen:", "Lisen:"):
        if remainder.startswith(label):
            remainder = remainder[len(label):]
            break

    # Try straight quotes first, then curly ones
    for opening, closing in (('"', '"'), ('“', '”')):
        artist, found, rest = remainder.partition(opening)
        if found:
            # Cut the title at its closing quote and drop the separating comma,
            # so fallback results look like the ones from the main pattern
            song = rest.partition(closing)[0]
            return artist.strip().rstrip(',').rstrip(), song.strip()

    return None


# Initialize variables to store artists and songs
artists = []
songs = []

# Iterate through the list of artist, song pairs
for artist_song in artists_songs:
    parsed = _split_artist_song(artist_song)
    if parsed is None:
        # Report the entry instead of raising; nothing is appended for it,
        # so the lists end up shorter than artists_songs
        print("Entry not matched by regex:", artist_song)
        continue

    artist, song = parsed
    artists.append(artist)
    songs.append(song)

## Create Additional Columns

In [None]:
# Rankings count down from 100 to 1
rank_100 = [*range(100, 0, -1)]

# Constant columns: the same value repeated once per ranking
year_2023 = [2023 for _ in rank_100]

source_2023 = ['Pitchfork' for _ in rank_100]

source_title_2023 = ['The 100 Best Songs of 2023' for _ in rank_100]

### Check the Length

In [None]:
# Print the length of every future column so mismatches are visible before
# the DataFrame is built.
columns_list = [artists, songs, year_2023, reviews_only_2023, reviewers2023, rank_100, source_2023, source_title_2023]
# The loop variable must not be named `list`: at notebook level that rebinds
# the built-in and breaks every later list(...) call.
for column in columns_list:
    print(len(column))

columns_name = ['Artist', 'Song', 'Year', 'Review', 'Reviewer','Ranking','Source','Source Title']

## Create the DF

In [None]:
# Assemble the 2023 table: one (column name, values) pair per column,
# in the order the columns should appear
df2023 = pd.DataFrame({
    column_name: column_values
    for column_name, column_values in (
        ('Artist', artists),
        ('Song', songs),
        ('Year', year_2023),
        ('Review', reviews_only_2023),
        ('Reviewer', reviewers2023),
        ('Ranking', rank_100),
        ('Source', source_2023),
        ('Source Title', source_title_2023),
    )
})

#df2023.to_csv('df_raw/webscraping/df2023.csv', index=False)

# Merge the DF