In [1]:
import numpy as np
import pandas as pd

In [2]:
import nltk

In [3]:
from nltk.corpus import stopwords
from textblob import Word
from textblob import TextBlob
import seaborn as sns
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode,  iplot
# Switch Plotly's offline mode on for this notebook so the iplot() calls below
# can render figures inline.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead
# of embedding it in the notebook -- confirm against the Plotly docs.
init_notebook_mode(connected=True)

In [4]:
# Load the review dataset, keeping only the two columns this analysis uses:
# the free-text review and its star rating.
spotify = pd.read_csv(
    "Dataset/reviews.csv",
    usecols=["Review", "Rating"],
)
# All cleaning below operates on a copy so the frame as loaded stays available.
data = spotify.copy()
data.head()

Unnamed: 0,Review,Rating
0,"Great music service, the audio is high quality...",5
1,Please ignore previous negative rating. This a...,5
2,"This pop-up ""Get the best Spotify experience o...",4
3,Really buggy and terrible to use as of recently,1
4,Dear Spotify why do I get songs that I didn't ...,1


In [5]:
# Print the column dtypes and non-null counts of the freshly loaded data.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61594 entries, 0 to 61593
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  61594 non-null  object
 1   Rating  61594 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 962.5+ KB


Remove punctuation

In [6]:
import re

# URLs have to be removed *before* punctuation: once ':', '/' and '.' are
# stripped, a link collapses into an ordinary-looking token (for example
# "httpsfoocom") and no URL pattern can recognise it any more.
URL_PATTERN = re.compile(
    r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?',
    flags=re.IGNORECASE,  # the text has not been lower-cased yet at this point
)
# Everything except ASCII letters, digits, spaces and hyphens is treated as
# punctuation.
PUNCTUATION_PATTERN = re.compile(r'[^a-z A-Z 0-9-]+')


def strip_urls_and_punctuation(text):
    """Return ``text`` with URLs blanked out and punctuation deleted.

    URLs are replaced by a single space so the words on either side stay
    separated. Punctuation is deleted outright, so "didn't" becomes "didnt".
    """
    without_urls = URL_PATTERN.sub(' ', text)
    return PUNCTUATION_PATTERN.sub('', without_urls)


data["Review"]=data["Review"].apply(strip_urls_and_punctuation)
data.head(5)

Unnamed: 0,Review,Rating
0,Great music service the audio is high quality ...,5
1,Please ignore previous negative rating This ap...,5
2,This pop-up Get the best Spotify experience on...,4
3,Really buggy and terrible to use as of recently,1
4,Dear Spotify why do I get songs that I didnt p...,1


lowercase

In [7]:
def _lowercase_tokens(text):
    """Lower-case each whitespace-separated token and rejoin with single spaces."""
    return " ".join(token.lower() for token in text.split())


data["Review"] = data["Review"].apply(_lowercase_tokens)
data.head(5)

Unnamed: 0,Review,Rating
0,great music service the audio is high quality ...,5
1,please ignore previous negative rating this ap...,5
2,this pop-up get the best spotify experience on...,4
3,really buggy and terrible to use as of recently,1
4,dear spotify why do i get songs that i didnt p...,1


whitespace removal

In [8]:
def _collapse_whitespace(text):
    """Trim the ends and squeeze every run of whitespace down to one space."""
    tokens = text.split()
    return " ".join(tokens)


data["Review"] = data["Review"].apply(_collapse_whitespace)
data.head(5)

Unnamed: 0,Review,Rating
0,great music service the audio is high quality ...,5
1,please ignore previous negative rating this ap...,5
2,this pop-up get the best spotify experience on...,4
3,really buggy and terrible to use as of recently,1
4,dear spotify why do i get songs that i didnt p...,1


remove numbers

In [9]:
# Delete every run of digits from the review text.
# The pattern is a raw string: in a plain literal '\d' is an invalid escape
# sequence, which Python reports as a DeprecationWarning (a SyntaxWarning from
# 3.12 on) and plans to reject in a future release.
data["Review"]=data["Review"].apply(lambda text: re.sub(r'\d+', '', text))
data.head(5)

Unnamed: 0,Review,Rating
0,great music service the audio is high quality ...,5
1,please ignore previous negative rating this ap...,5
2,this pop-up get the best spotify experience on...,4
3,really buggy and terrible to use as of recently,1
4,dear spotify why do i get songs that i didnt p...,1


Removing URLs and tags

In [10]:
# NOTE(review): the punctuation-removal cell earlier in this notebook keeps only
# letters, digits, spaces and hyphens. By the time this cell runs, the "://" and
# "." this pattern requires are already gone, so the substitution cannot match
# anything. URL stripping only has an effect if it runs before that cell.
url_regex = re.compile(
    r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
)
data["Review"] = data["Review"].apply(
    lambda review: url_regex.sub('', str(review))
)
data.head(5)

Unnamed: 0,Review,Rating
0,great music service the audio is high quality ...,5
1,please ignore previous negative rating this ap...,5
2,this pop-up get the best spotify experience on...,4
3,really buggy and terrible to use as of recently,1
4,dear spotify why do i get songs that i didnt p...,1


stopword removal

In [11]:
import nltk
from nltk.corpus import stopwords

# `sw` stays a list so any other cell that uses it keeps working.
sw=stopwords.words("english")
# Membership tests run once per token of every review; a frozenset makes each
# test O(1) instead of a linear scan over the whole stop-word list.
sw_lookup = frozenset(sw)
# NOTE(review): apostrophes were stripped by the punctuation cell, so
# contraction stop words such as "didn't" no longer match their cleaned form
# ("didnt" survives in the output). Decide whether that is intended.
data["Review"]=data["Review"].apply(
    lambda text: " ".join(token for token in text.split() if token not in sw_lookup)
)
data.head(5)

Unnamed: 0,Review,Rating
0,great music service audio high quality app eas...,5
1,please ignore previous negative rating app sup...,5
2,pop-up get best spotify experience android ann...,4
3,really buggy terrible use recently,1
4,dear spotify get songs didnt put playlist shuf...,1


lemmatization

In [12]:
from textblob import Word


def _lemmatize_review(text):
    """Replace each whitespace-separated token with TextBlob's lemma for it."""
    lemmas = [Word(token).lemmatize() for token in text.split()]
    return " ".join(lemmas)


data["Review"] = data["Review"].apply(_lemmatize_review)
data.head()

Unnamed: 0,Review,Rating
0,great music service audio high quality app eas...,5
1,please ignore previous negative rating app sup...,5
2,pop-up get best spotify experience android ann...,4
3,really buggy terrible use recently,1
4,dear spotify get song didnt put playlist shuff...,1


In [13]:
# Re-check dtypes and non-null counts now that the text cleaning steps have run.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61594 entries, 0 to 61593
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  61594 non-null  object
 1   Rating  61594 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 962.5+ KB


Exploratory data analysis (EDA)

In [14]:
# Histogram of the Rating column, with one colour per rating value.
fig = px.histogram(
    data,
    x="Rating",
    color="Rating",
    title="Rating Distribution",
    width=800,
    height=500,
)
iplot(fig)

In [15]:
# Count missing values per column (isna is the canonical name for isnull).
data.isna().sum()

Review    0
Rating    0
dtype: int64

In [16]:
# Collapse the 1-5 star scale into three sentiment classes.
RATING_TO_SENTIMENT = {
    1: "neg",
    2: "neg",
    3: "neutral",
    4: "pos",
    5: "pos",
}

# Assign the result back to the column instead of calling
# data["Rating"].replace(..., inplace=True). The in-place form mutates a
# temporary column selection: pandas 2.x warns about it, and under
# Copy-on-Write it leaves `data` unchanged. A single dict-based replace is also
# safe to re-run, because already-converted labels match none of the integer
# keys and pass through untouched.
data["Rating"] = data["Rating"].replace(RATING_TO_SENTIMENT)
data.head()

Unnamed: 0,Review,Rating
0,great music service audio high quality app eas...,pos
1,please ignore previous negative rating app sup...,pos
2,pop-up get best spotify experience android ann...,pos
3,really buggy terrible use recently,neg
4,dear spotify get song didnt put playlist shuff...,neg


In [17]:
# Number of reviews per value of the Rating column after the recoding above.
data["Rating"].value_counts()

pos        29937
neg        24771
neutral     6886
Name: Rating, dtype: int64

In [18]:
# Same histogram as before, drawn again now that Rating holds the
# neg / neutral / pos labels.
fig1 = px.histogram(
    data,
    x="Rating",
    color="Rating",
    title="Rating Distribution",
    width=800,
    height=500,
)
iplot(fig1)