In [1]:
import pandas as pd


In [7]:
import pandas as pd

# Specify the path to the uploaded dataset file
file_path = '/content/Spotify Million Song Dataset_exported.csv'  # Replace with the actual file path

# Attempt to read the dataset into a DataFrame
try:
    spotify_df = pd.read_csv(file_path, encoding='utf-8')
    print("Dataset loaded successfully!")
except pd.errors.ParserError as e:
    print("Error loading dataset:", e)
    print("Attempting to read with error_bad_lines=False...")
    # Attempt to read the dataset with error_bad_lines=False
    try:
        spotify_df = pd.read_csv(file_path, error_bad_lines=False)
        print("Dataset loaded successfully with error_bad_lines=False!")
    except pd.errors.ParserError as e:
        print("Error loading dataset even with error_bad_lines=False:", e)


Dataset loaded successfully!


In [8]:
# Display the first few rows of the dataset
print(spotify_df.head())

# Get information about the dataset, including data types and missing values
print(spotify_df.info())

# Summary statistics for numerical columns
print(spotify_df.describe())


  artist                   song                                        link  \
0   ABBA  Ahe's My Kind Of Girl  /a/abba/ahes+my+kind+of+girl_20598417.html   
1   ABBA       Andante, Andante       /a/abba/andante+andante_20002708.html   
2   ABBA         As Good As New        /a/abba/as+good+as+new_20003033.html   
3   ABBA                   Bang                  /a/abba/bang_20598415.html   
4   ABBA       Bang-A-Boomerang      /a/abba/bang+a+boomerang_20002668.html   

                                                text  
0  Look at her face, it's a wonderful face  \nAnd...  
1  Take it easy with me, please  \nTouch me gentl...  
2  I'll never know why I had to go  \nWhy I had t...  
3  Making somebody happy is a question of give an...  
4  Making somebody happy is a question of give an...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57650 entries, 0 to 57649
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  57

In [9]:
# Handle missing values
spotify_df.dropna(inplace=True)

# Remove duplicate rows
spotify_df.drop_duplicates(inplace=True)


In [12]:
import nltk
from nltk.tokenize import word_tokenize

# Download required resources for tokenization (run this line only once)
nltk.download('punkt')

# Print the length of the DataFrame before tokenization
print("Number of rows before tokenization:", len(spotify_df))

# Tokenize words in each row of the DataFrame
spotify_df['tokenized_text'] = spotify_df.apply(lambda row: word_tokenize(row['artist']), axis=1)

# Display the DataFrame with tokenized words
print(spotify_df.head())

# Print the length of the DataFrame after tokenization
print("Number of rows after tokenization:", len(spotify_df))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Number of rows before tokenization: 57650
  artist                   song                                        link  \
0   ABBA  Ahe's My Kind Of Girl  /a/abba/ahes+my+kind+of+girl_20598417.html   
1   ABBA       Andante, Andante       /a/abba/andante+andante_20002708.html   
2   ABBA         As Good As New        /a/abba/as+good+as+new_20003033.html   
3   ABBA                   Bang                  /a/abba/bang_20598415.html   
4   ABBA       Bang-A-Boomerang      /a/abba/bang+a+boomerang_20002668.html   

                                                text tokenized_text  
0  Look at her face, it's a wonderful face  \nAnd...         [ABBA]  
1  Take it easy with me, please  \nTouch me gentl...         [ABBA]  
2  I'll never know why I had to go  \nWhy I had t...         [ABBA]  
3  Making somebody happy is a question of give an...         [ABBA]  
4  Making somebody happy is a question of give an...         [ABBA]  
Number of rows after tokenization: 57650


In [14]:
import nltk
nltk.download('stopwords')


# Function to clean text
def clean_text(tokens):
    tokens = [word.lower() for word in tokens if word not in string.punctuation]
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return tokens

# Apply text cleaning to tokenized words
spotify_df['cleaned_text'] = spotify_df['tokenized_text'].apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [15]:
print(spotify_df.head())


  artist                   song                                        link  \
0   ABBA  Ahe's My Kind Of Girl  /a/abba/ahes+my+kind+of+girl_20598417.html   
1   ABBA       Andante, Andante       /a/abba/andante+andante_20002708.html   
2   ABBA         As Good As New        /a/abba/as+good+as+new_20003033.html   
3   ABBA                   Bang                  /a/abba/bang_20598415.html   
4   ABBA       Bang-A-Boomerang      /a/abba/bang+a+boomerang_20002668.html   

                                                text tokenized_text  \
0  Look at her face, it's a wonderful face  \nAnd...         [ABBA]   
1  Take it easy with me, please  \nTouch me gentl...         [ABBA]   
2  I'll never know why I had to go  \nWhy I had t...         [ABBA]   
3  Making somebody happy is a question of give an...         [ABBA]   
4  Making somebody happy is a question of give an...         [ABBA]   

  cleaned_text  
0       [abba]  
1       [abba]  
2       [abba]  
3       [abba]  
4       [abba

In [16]:
# Save the DataFrame to a CSV file
spotify_df.to_csv('spotify_dataset.csv', index=False)

# Provide a link to download the CSV file
from google.colab import files
files.download('spotify_dataset.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>