In [33]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [37]:
# Fetch each NLTK resource in turn. The status returned by the final
# download is left as the cell's last expression so it is displayed.
download_status = None
for package in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    download_status = nltk.download(package)
download_status

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PF2L6BL6\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PF2L6BL6\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PF2L6BL6\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\PF2L6BL6\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [62]:
# Read the raw review CSV into the working DataFrame.
raw_reviews_path = '../data/raw_data/SPOTIFY_REVIEWS.csv'
df = pd.read_csv(raw_reviews_path)

In [63]:
# Step 1: Remove emoticons
# Compiled once at definition time rather than on every call: the function
# is applied to each row of the review column, so the pattern is loop-invariant.
# Ranges covered: U+1F600-1F64F (emoticons), U+1F300-1F5FF (symbols and
# pictographs), U+1F680-1F6FF (transport and map), U+1F1E0-1F1FF (flags).
_EMOTICON_PATTERN = re.compile(
    u'([\U0001F600-\U0001F64F])|'
    u'([\U0001F300-\U0001F5FF])|'
    u'([\U0001F680-\U0001F6FF])|'
    u'([\U0001F1E0-\U0001F1FF])',
    flags=re.UNICODE)


def remove_emoticons(text):
    """Delete every character of ``text`` that falls in the emoji ranges above.

    Parameters
    ----------
    text : object
        The value to clean. Only ``str`` instances are processed.

    Returns
    -------
    str or None
        ``text`` with all matching characters removed, or ``None`` when
        ``text`` is not a string (for example a missing value).
    """
    if not isinstance(text, str):
        return None
    return _EMOTICON_PATTERN.sub(r'', text)

# Run the emoticon stripper over every review; non-string entries become None.
df['review_text'] = df['review_text'].apply(remove_emoticons)

# Show the label followed by the frame, one per line.
print("After removing emoticons:", df, sep="\n")

After removing emoticons:
         Unnamed: 0                             review_id  \
0                 0  14a011a8-7544-47b4-8480-c502af0ac26f   
1                 1  bfa8876b-470e-4640-83a7-77427f7f37e8   
2                 2  70e8252f-058a-47d9-b066-df9e1571c970   
3                 3  672a155a-e81e-4d28-bdeb-a74c031bc072   
4                 4  bbc1bf95-ed36-41a1-8b98-0f2e314caea5   
...             ...                                   ...   
3377418     3377418  41cafb32-c357-48df-97a3-e3867584f0d6   
3377419     3377419  ad68b04a-ff48-42f6-992b-bd765cdf9e94   
3377420     3377420  9c36196d-12bc-4049-b7e4-27774a644bed   
3377421     3377421  ce45e1f8-78ad-40cc-a33b-d4e4672b8c78   
3377422     3377422  be4f53b5-8137-45db-a652-265d0518a174   

              pseudo_author_id       author_name  \
0        152618553977019693742     A Google user   
1        234382942865437071667     A Google user   
2        174473604608358796368     A Google user   
3        286593453219054880269   

In [64]:
# Step 3: Keep only reviews with more than two whitespace-separated tokens.
# NOTE(review): the printed label says "one-word", but the threshold of 2 also
# drops two-word reviews — confirm which cutoff is intended.
df = df[df['review_text'].str.split().str.len().gt(2)]

print("\nAfter removing one-word reviews:", df, sep="\n")


After removing one-word reviews:
         Unnamed: 0                             review_id  \
0                 0  14a011a8-7544-47b4-8480-c502af0ac26f   
1                 1  bfa8876b-470e-4640-83a7-77427f7f37e8   
2                 2  70e8252f-058a-47d9-b066-df9e1571c970   
3                 3  672a155a-e81e-4d28-bdeb-a74c031bc072   
4                 4  bbc1bf95-ed36-41a1-8b98-0f2e314caea5   
...             ...                                   ...   
3377417     3377417  f758a94e-b824-445d-8011-0e40bf107f0d   
3377418     3377418  41cafb32-c357-48df-97a3-e3867584f0d6   
3377420     3377420  9c36196d-12bc-4049-b7e4-27774a644bed   
3377421     3377421  ce45e1f8-78ad-40cc-a33b-d4e4672b8c78   
3377422     3377422  be4f53b5-8137-45db-a652-265d0518a174   

              pseudo_author_id       author_name  \
0        152618553977019693742     A Google user   
1        234382942865437071667     A Google user   
2        174473604608358796368     A Google user   
3        2865934532190548

In [65]:
# Step 4: Fold the review text to lower case.
df['review_text'] = df['review_text'].str.lower()

print("\nAfter lowercasing:", df, sep="\n")


After lowercasing:
         Unnamed: 0                             review_id  \
0                 0  14a011a8-7544-47b4-8480-c502af0ac26f   
1                 1  bfa8876b-470e-4640-83a7-77427f7f37e8   
2                 2  70e8252f-058a-47d9-b066-df9e1571c970   
3                 3  672a155a-e81e-4d28-bdeb-a74c031bc072   
4                 4  bbc1bf95-ed36-41a1-8b98-0f2e314caea5   
...             ...                                   ...   
3377417     3377417  f758a94e-b824-445d-8011-0e40bf107f0d   
3377418     3377418  41cafb32-c357-48df-97a3-e3867584f0d6   
3377420     3377420  9c36196d-12bc-4049-b7e4-27774a644bed   
3377421     3377421  ce45e1f8-78ad-40cc-a33b-d4e4672b8c78   
3377422     3377422  be4f53b5-8137-45db-a652-265d0518a174   

              pseudo_author_id       author_name  \
0        152618553977019693742     A Google user   
1        234382942865437071667     A Google user   
2        174473604608358796368     A Google user   
3        286593453219054880269     A Go

In [66]:
# Step 5: Delete every character that is neither a lowercase ASCII letter
# nor whitespace (punctuation, digits, and anything else).
non_alpha_pattern = r'[^a-z\s]'
df['review_text'] = df['review_text'].str.replace(non_alpha_pattern, '', regex=True)

print("\nAfter removing punctuation and non-alphabetic characters:", df, sep="\n")


After removing punctuation and non-alphabetic characters:
         Unnamed: 0                             review_id  \
0                 0  14a011a8-7544-47b4-8480-c502af0ac26f   
1                 1  bfa8876b-470e-4640-83a7-77427f7f37e8   
2                 2  70e8252f-058a-47d9-b066-df9e1571c970   
3                 3  672a155a-e81e-4d28-bdeb-a74c031bc072   
4                 4  bbc1bf95-ed36-41a1-8b98-0f2e314caea5   
...             ...                                   ...   
3377417     3377417  f758a94e-b824-445d-8011-0e40bf107f0d   
3377418     3377418  41cafb32-c357-48df-97a3-e3867584f0d6   
3377420     3377420  9c36196d-12bc-4049-b7e4-27774a644bed   
3377421     3377421  ce45e1f8-78ad-40cc-a33b-d4e4672b8c78   
3377422     3377422  be4f53b5-8137-45db-a652-265d0518a174   

              pseudo_author_id       author_name  \
0        152618553977019693742     A Google user   
1        234382942865437071667     A Google user   
2        174473604608358796368     A Google user   


In [67]:
# Step 6: Trim leading and trailing whitespace to get the final review text.
df['review_text'] = df['review_text'].str.strip()

print("\nFinal cleaned review text:", df, sep="\n")


Final cleaned review text:
         Unnamed: 0                             review_id  \
0                 0  14a011a8-7544-47b4-8480-c502af0ac26f   
1                 1  bfa8876b-470e-4640-83a7-77427f7f37e8   
2                 2  70e8252f-058a-47d9-b066-df9e1571c970   
3                 3  672a155a-e81e-4d28-bdeb-a74c031bc072   
4                 4  bbc1bf95-ed36-41a1-8b98-0f2e314caea5   
...             ...                                   ...   
3377417     3377417  f758a94e-b824-445d-8011-0e40bf107f0d   
3377418     3377418  41cafb32-c357-48df-97a3-e3867584f0d6   
3377420     3377420  9c36196d-12bc-4049-b7e4-27774a644bed   
3377421     3377421  ce45e1f8-78ad-40cc-a33b-d4e4672b8c78   
3377422     3377422  be4f53b5-8137-45db-a652-265d0518a174   

              pseudo_author_id       author_name  \
0        152618553977019693742     A Google user   
1        234382942865437071667     A Google user   
2        174473604608358796368     A Google user   
3        286593453219054880269 

In [68]:
# Order the reviews by `review_timestamp`, largest value first.
# NOTE(review): the column is sorted with whatever dtype read_csv produced;
# presumably ISO-style strings, which order chronologically — confirm.
df = df.sort_values(by=['review_timestamp'], ascending=False)
# "\s" is not a valid escape sequence and printed a literal backslash; use the
# leading newline that the other progress messages in this notebook use.
print("\nsort by timestamp:")
print(df)

\sort by timestamp:
         Unnamed: 0                             review_id  \
3377422     3377422  be4f53b5-8137-45db-a652-265d0518a174   
3377421     3377421  ce45e1f8-78ad-40cc-a33b-d4e4672b8c78   
3377420     3377420  9c36196d-12bc-4049-b7e4-27774a644bed   
3377418     3377418  41cafb32-c357-48df-97a3-e3867584f0d6   
3377417     3377417  f758a94e-b824-445d-8011-0e40bf107f0d   
...             ...                                   ...   
4                 4  bbc1bf95-ed36-41a1-8b98-0f2e314caea5   
3                 3  672a155a-e81e-4d28-bdeb-a74c031bc072   
2                 2  70e8252f-058a-47d9-b066-df9e1571c970   
1                 1  bfa8876b-470e-4640-83a7-77427f7f37e8   
0                 0  14a011a8-7544-47b4-8480-c502af0ac26f   

              pseudo_author_id       author_name  \
3377422  325323583553822680222           Su***ha   
3377421  415238355983315158902           St***en   
3377420  212802607434520579896     An*********ey   
3377418  326345911143311484421  su*****

In [70]:
# Narrow the frame to the identifier, text, rating and timestamp columns.
output_columns = ['review_id', 'review_text', 'review_rating', 'review_timestamp']
df = df[output_columns]

In [71]:
# Bare last expression: the notebook renders the current frame as the cell output.
df

Unnamed: 0,review_id,review_text,review_rating,review_timestamp
3377422,be4f53b5-8137-45db-a652-265d0518a174,ads again again n again worst app,1,2023-11-15 23:16:10
3377421,ce45e1f8-78ad-40cc-a33b-d4e4672b8c78,awesome but you cant choose your song sometimes,4,2023-11-15 23:15:45
3377420,9c36196d-12bc-4049-b7e4-27774a644bed,the blues always pull me through,5,2023-11-15 23:11:32
3377418,41cafb32-c357-48df-97a3-e3867584f0d6,does not work always says no internet but u tu...,1,2023-11-15 23:11:02
3377417,f758a94e-b824-445d-8011-0e40bf107f0d,bring back liked songs button new add button i...,2,2023-11-15 23:06:56
...,...,...,...,...
4,bbc1bf95-ed36-41a1-8b98-0f2e314caea5,as a professional android developer im glad to...,5,2014-05-27 15:26:48
3,672a155a-e81e-4d28-bdeb-a74c031bc072,awesome ui best music app out there,5,2014-05-27 15:17:20
2,70e8252f-058a-47d9-b066-df9e1571c970,love it especially the new design,5,2014-05-27 14:40:01
1,bfa8876b-470e-4640-83a7-77427f7f37e8,i enjoy the awesome ui of this app and it has ...,5,2014-05-27 14:36:02


In [72]:
# Write the current frame to the processed-data CSV. With to_csv's defaults the
# row index is written too, as an unnamed first column.
processed_reviews_path = '../data/processed/SPOTIFY_REVIEWS.csv'
df.to_csv(processed_reviews_path)