In [10]:
# import libraries
import pandas as pd

In [11]:
import nltk

from nltk.sentiment.vader import SentimentIntensityAnalyzer

from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer

In [12]:
# download nltk corpus (first time only)
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\ojast\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\ojast\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\ojast\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\ojast\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\ojast\AppData\Roaming\nltk_data...
[

True

In [13]:
# Load the amazon review dataset
df = pd.read_excel('seeds.xlsx')
df

Unnamed: 0,Customer Name,Gender,Comment,Website,Seed Type,Label,Date
0,"Natraj B, Bangalore",1,good quality,Bigbasket,Pumpkin Seeds,-1,2024-02-12
1,"Sandhya Rani, Bangalore",0,"become black , bad smell , bad quality , old s...",Bigbasket,Sunflower Seeds,-1,2024-06-11
2,Shovan Chakraborty,1,good,Amazon,Flax Seeds,1,2023-08-10
3,Placeholder,-1,nice,Amazon,Seasame Seeds,1,2024-06-30
4,Vinit Ranjan,1,"good product , nicely packed - even check auth...",Flipkart,Pumpkin Seeds,1,2024-01-29
...,...,...,...,...,...,...,...
2976,Krishna Pakala,1,good product,Bigbasket,Flax Seeds,1,2024-06-19
2977,vivek mishra,1,using chemical colour,Amazon,Seasame Seeds,-1,2023-07-27
2978,Vineesh Peralassery,1,super product healthy food,Flipkart,Pumpkin Seeds,1,2023-12-14
2979,sourabh bisht,1,quality much good .seeds smalli buy better qua...,Amazon,Flax Seeds,-1,2023-12-30


In [14]:
df['Comment'].fillna('good', inplace=True)
df.loc[df['Comment'] == 'good', 'Label'] = 0

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Comment'].fillna('good', inplace=True)


In [15]:
# create preprocess_text function
def preprocess_text(text):

    # Tokenize the text

    tokens = word_tokenize(text.lower())

    # Remove stop words

    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
    # Lemmatize the tokens

    lemmatizer = WordNetLemmatizer()

    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the tokens back into a string

    processed_text = ' '.join(lemmatized_tokens)

    return processed_text

# apply the function df

df['Comment'] = df['Comment'].apply(preprocess_text)
df

Unnamed: 0,Customer Name,Gender,Comment,Website,Seed Type,Label,Date
0,"Natraj B, Bangalore",1,good quality,Bigbasket,Pumpkin Seeds,-1,2024-02-12
1,"Sandhya Rani, Bangalore",0,"become black , bad smell , bad quality , old s...",Bigbasket,Sunflower Seeds,-1,2024-06-11
2,Shovan Chakraborty,1,good,Amazon,Flax Seeds,0,2023-08-10
3,Placeholder,-1,nice,Amazon,Seasame Seeds,1,2024-06-30
4,Vinit Ranjan,1,"good product , nicely packed - even check auth...",Flipkart,Pumpkin Seeds,1,2024-01-29
...,...,...,...,...,...,...,...
2976,Krishna Pakala,1,good product,Bigbasket,Flax Seeds,1,2024-06-19
2977,vivek mishra,1,using chemical colour,Amazon,Seasame Seeds,-1,2023-07-27
2978,Vineesh Peralassery,1,super product healthy food,Flipkart,Pumpkin Seeds,1,2023-12-14
2979,sourabh bisht,1,quality much good .seeds smalli buy better qua...,Amazon,Flax Seeds,-1,2023-12-30


In [16]:
analyzer = SentimentIntensityAnalyzer()


# create get_sentiment function

def get_sentiment(text):
    scores = analyzer.polarity_scores(text)
    
    if scores['compound'] > 0.05:
        return 1  # Positive
    elif scores['compound'] < -0.05:
        return -1  # Negative
    else:
        return 0  # Neutral

df['Sentiment'] = df['Comment'].apply(get_sentiment)

df

Unnamed: 0,Customer Name,Gender,Comment,Website,Seed Type,Label,Date,Sentiment
0,"Natraj B, Bangalore",1,good quality,Bigbasket,Pumpkin Seeds,-1,2024-02-12,1
1,"Sandhya Rani, Bangalore",0,"become black , bad smell , bad quality , old s...",Bigbasket,Sunflower Seeds,-1,2024-06-11,-1
2,Shovan Chakraborty,1,good,Amazon,Flax Seeds,0,2023-08-10,1
3,Placeholder,-1,nice,Amazon,Seasame Seeds,1,2024-06-30,1
4,Vinit Ranjan,1,"good product , nicely packed - even check auth...",Flipkart,Pumpkin Seeds,1,2024-01-29,1
...,...,...,...,...,...,...,...,...
2976,Krishna Pakala,1,good product,Bigbasket,Flax Seeds,1,2024-06-19,1
2977,vivek mishra,1,using chemical colour,Amazon,Seasame Seeds,-1,2023-07-27,0
2978,Vineesh Peralassery,1,super product healthy food,Flipkart,Pumpkin Seeds,1,2023-12-14,1
2979,sourabh bisht,1,quality much good .seeds smalli buy better qua...,Amazon,Flax Seeds,-1,2023-12-30,1


In [17]:
df.head(15)

Unnamed: 0,Customer Name,Gender,Comment,Website,Seed Type,Label,Date,Sentiment
0,"Natraj B, Bangalore",1,good quality,Bigbasket,Pumpkin Seeds,-1,2024-02-12,1
1,"Sandhya Rani, Bangalore",0,"become black , bad smell , bad quality , old s...",Bigbasket,Sunflower Seeds,-1,2024-06-11,-1
2,Shovan Chakraborty,1,good,Amazon,Flax Seeds,0,2023-08-10,1
3,Placeholder,-1,nice,Amazon,Seasame Seeds,1,2024-06-30,1
4,Vinit Ranjan,1,"good product , nicely packed - even check auth...",Flipkart,Pumpkin Seeds,1,2024-01-29,1
5,RAKESH K.,1,bad product,Amazon,Sunflower Seeds,-1,2024-01-10,-1
6,"raka chakraborty, Kolkata",0,superfood chia,Bigbasket,Chia Seeds,1,2024-01-03,0
7,Hitasha,0,lesser quantity mentio ed sent,Amazon,Seasame Seeds,-1,2023-11-23,0
8,Masoora Zahoor,0,nice healthy used remedy,Flipkart,Flax Seeds,1,2023-09-06,1
9,"Mangalagiri Spandana, Krishna District",0,super quality,Bigbasket,Sunflower Seeds,1,2023-09-29,1


In [18]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(df['Label'], df['Sentiment']))

[[ 264  162  157]
 [   9   31  363]
 [  24  148 1823]]


In [19]:
from sklearn.metrics import classification_report

print(classification_report(df['Label'], df['Sentiment']))

              precision    recall  f1-score   support

          -1       0.89      0.45      0.60       583
           0       0.09      0.08      0.08       403
           1       0.78      0.91      0.84      1995

    accuracy                           0.71      2981
   macro avg       0.59      0.48      0.51      2981
weighted avg       0.71      0.71      0.69      2981



In [24]:
df.to_csv('seeds.csv', index=False)