<a href="https://colab.research.google.com/github/malabyte/Text-Mining-Analytics/blob/main/Sentiment_Analysis_of_Phone_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd  # 'pandas' library
import sys  # 'sys' module
import re  # 're' module for regular expressions
from nltk.tokenize import word_tokenize  #'word_tokenize' function from 'nltk.tokenize' to tokenize words
from nltk.corpus import stopwords  # 'stopwords' from 'nltk.corpus' module
from nltk.sentiment import SentimentIntensityAnalyzer  # 'SentimentIntensityAnalyzer' class from 'nltk.sentiment' module

In [None]:
# Load the raw phone-review workbook; 'PhoneReviews.xlsx' is resolved relative to the working directory
data = pd.read_excel('PhoneReviews.xlsx')
# Keep only the columns used by the analysis. The explicit .copy() makes this an
# independent DataFrame: without it pandas treats the column subset as a slice of
# `data`, and adding columns to it later emits SettingWithCopyWarning.
PhoneReviews_df = data[['Product Name', 'Reviews', 'Rating']].copy()

In [None]:
# Preview the first rows of the raw workbook (all columns) as a sanity check on the load
data.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0


In [None]:
# This function maps a rating to a corresponding sentiment category

def get_sentiment_category(rating):
    """Translate a star rating into a sentiment label.

    Mapping:
        5 -> 'very happy'
        4 -> 'happy'
        3 -> 'neutral'
        2 -> 'unhappy'
        1 -> 'very unhappy'

    Args:
        rating: The rating value to classify; it is compared with ``==``
            against each of the integers 5 down to 1.

    Returns:
        The matching label, or None when the rating equals none of 1-5.
    """
    # Checked from the highest rating to the lowest, one equality test per entry.
    labels_by_stars = (
        (5, 'very happy'),
        (4, 'happy'),
        (3, 'neutral'),
        (2, 'unhappy'),
        (1, 'very unhappy'),
    )
    for stars, label in labels_by_stars:
        if rating == stars:
            return label
    # Anything outside 1-5 has no category.
    return None

In [None]:
# Derive a 'Sentiment' label for every row from its 'Rating' via 'get_sentiment_category'.
# .assign() returns a new DataFrame instead of writing into the existing one, so this
# does not trigger SettingWithCopyWarning and re-running the cell gives the same result.
PhoneReviews_df = PhoneReviews_df.assign(
    Sentiment=PhoneReviews_df['Rating'].apply(get_sentiment_category)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  PhoneReviews_df['Sentiment'] = PhoneReviews_df['Rating'].apply(get_sentiment_category)


In [None]:
# Preview the first rows of the working frame, which now includes the derived 'Sentiment' column
PhoneReviews_df.head()

Unnamed: 0,Product Name,Reviews,Rating,Sentiment
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",I feel so LUCKY to have found this used (phone...,5,very happy
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...","nice phone, nice up grade from my pantach revu...",4,happy
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Very pleased,5,very happy
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",It works good but it goes slow sometimes but i...,4,happy
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Great phone to replace my lost phone. The only...,4,happy


In [None]:
# Pull the review texts out of the DataFrame as a plain Python list, in row order
corpus = PhoneReviews_df.loc[:, 'Reviews'].to_list()

In [None]:
def clean_text(text):
    """Normalise one raw review into lowercase, letters-only text.

    Steps, in order:
      1. lowercase the text;
      2. replace HTML tags with a space, so the words on either side of a tag
         stay separate ("good<br/>phone" -> "good phone");
      3. drop every character that is not an ASCII letter or whitespace;
      4. collapse whitespace runs to a single space and trim both ends.

    Whitespace is collapsed last because steps 2 and 3 can themselves leave
    runs of spaces behind (e.g. "phone - good" -> "phone  good").

    Args:
        text: The raw review. Any non-string value (for example None, or the
            NaN pandas uses for an empty cell) is treated as an empty review.

    Returns:
        The cleaned string; '' when the input is not a string or nothing
        usable remains.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # DOTALL lets a tag that is broken across lines still be matched as one tag.
    text = re.sub(r'<.*?>', ' ', text, flags=re.DOTALL)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# Run every review in the corpus through 'clean_text'
cleaned_corpus = [clean_text(text) for text in corpus]

# Wrap the cleaned review texts in a single-column DataFrame 'cleaned_corpus_df'
cleaned_corpus_df = pd.DataFrame({'Reviews': cleaned_corpus})

# Preview the first few cleaned reviews. Ending the cell on the expression (rather
# than print) lets the notebook render the DataFrame with its rich table display.
cleaned_corpus_df.head()

                                             Reviews
0  i feel so lucky to have found this used phone ...
1  nice phone nice up grade from my pantach revue...
2                                       very pleased
3  it works good but it goes slow sometimes but i...
4  great phone to replace my lost phone the only ...


In [None]:
import nltk  # Import the nltk library

# Fetch the NLTK resources the text normalization step relies on.
nltk.download('punkt')  # Tokenizer models used by word_tokenize
# NOTE(review): newer NLTK releases load the tokenizer from 'punkt_tab' instead of
# 'punkt' - confirm against the installed NLTK version; the extra download is harmless.
nltk.download('punkt_tab')
# Corpus read by stopwords.words('english'); without it that call raises
# LookupError on a machine where the corpus has never been downloaded.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hateo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# English stopwords, held in a set for membership tests
stop_words = set(stopwords.words('english'))

def normalize_text(text):
    """Tokenize `text`, discard tokens found in `stop_words`, and re-join the rest.

    Args:
        text: The text to normalize; it is passed to `word_tokenize` unchanged.

    Returns:
        The surviving tokens joined by single spaces, in their original order.
    """
    kept_words = (word for word in word_tokenize(text) if word not in stop_words)
    return ' '.join(kept_words)

# Apply 'normalize_text' to each cleaned review, keeping the original order
normalized_corpus = list(map(normalize_text, cleaned_corpus))

# Single-column DataFrame 'normalized_corpus_df' holding the normalized review texts
normalized_corpus_df = pd.DataFrame(data={'Reviews': normalized_corpus})


In [None]:
# Save the normalized reviews as a CSV file (row index omitted)
normalized_corpus_df.to_csv('PhoneReviews_normal.csv', index=False)
# Show a bounded preview rather than the whole frame, so the cell output stays
# small however many reviews the workbook contains
normalized_corpus_df.head(10)

Unnamed: 0,Reviews
0,feel lucky found used phone us used hard phone...
1,nice phone nice grade pantach revue clean set ...
2,pleased
3,works good goes slow sometimes good phone love
4,great phone replace lost phone thing volume bu...
5,already phone problems know stated used dang s...
6,charging port loose got soldered needed new ba...
7,phone looks good wouldnt stay charged buy new ...
8,originally using samsung galaxy sprint wanted ...
9,battery life great responsive touch issue some...


In [None]:
import nltk # Import the nltk library (already imported in an earlier cell; repeating it is harmless)
nltk.download('vader_lexicon') # Download the 'vader_lexicon' resource before SentimentIntensityAnalyzer is instantiated

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\hateo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [None]:
# One shared SentimentIntensityAnalyzer, created once and reused for every review
sia = SentimentIntensityAnalyzer()

def get_sentiment_score(text):
    """Return the 'compound' entry of the analyzer's polarity scores for `text`.

    `sia.polarity_scores` returns a mapping of scores; only its 'compound'
    value is kept.
    """
    return sia.polarity_scores(text)['compound']

# Score every review with 'get_sentiment_score' and store the result in a new
# 'Sentiment Score' column. .assign() returns a new DataFrame rather than writing
# into the existing one, which avoids SettingWithCopyWarning and keeps the cell
# idempotent; the dict form is needed because the column name contains a space.
PhoneReviews_df = PhoneReviews_df.assign(
    **{'Sentiment Score': PhoneReviews_df['Reviews'].apply(get_sentiment_score)}
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  PhoneReviews_df['Sentiment Score'] = PhoneReviews_df['Reviews'].apply(get_sentiment_score)


In [None]:
# Write the working frame (reviews plus the columns added to it in earlier cells) to 'Sentiments.csv', omitting the row index
PhoneReviews_df.to_csv('Sentiments.csv', index=False)