In [1]:
import json
import string

# Read and parse JSON data into a suitable Python data structure
def load_json(json_file):
    with open(json_file, 'r', encoding='utf-8') as file:
        data = [json.loads(line) for line in file]
    return data

def explore_dataset(data):
    # Print information about the dataset
    print("Dataset Information:")
    print(f"Number of Reviews: {len(data)}")
    print(f"Columns: {list(data[0].keys())}")


# Load the JSON data
json_file = 'Cell_Phones_and_Accessories_5.json'
data = load_json(json_file)

# Explore the dataset
explore_dataset(data)

# Define necessary columns for sentiment analysis
necessary_columns = ["reviewText", "overall"]  # Adjust as needed

# Filter the dataset to retain only necessary columns
filtered_data = [{key: entry[key] for key in necessary_columns if key in entry} for entry in data]

# Print the first few entries of the filtered dataset
print("\nFirst few entries of the filtered dataset:")
for entry in filtered_data[:3]:
    print(json.dumps(entry, indent=1))
    



Dataset Information:
Number of Reviews: 194439
Columns: ['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText', 'overall', 'summary', 'unixReviewTime', 'reviewTime']

First few entries of the filtered dataset:
{
 "reviewText": "They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again",
 "overall": 4.0
}
{
 "reviewText": "These stickers work like the review says they do. They stick on great and they stay on the phone. They are super stylish and I can share them with my sister. :)",
 "overall": 5.0
}
{
 "reviewText": "These are awesome and make my phone look so stylish! I have only used one so far and have had it on for almost a year! CAN YOU BELIEVE THAT! ONE YEAR!! Great quality!",
 "overall": 5.0
}


In [3]:
#Apply the text preprocessing methods
import requests
import re

def remove_punctuation(text):
    # Remove punctuation using regular expression
    return re.sub(r'[^\w\s]', '', text)

def remove_stop_words(text, stop_words):
    # Remove stop words
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Load stop words from the provided URL
stop_words = set()

with open('stopwords.txt', 'r') as file:
    for line in file:
        stop_words.add(line.rstrip('\n'))

# Step 3: Apply preprocessing to each review text
preprocessed_data = []
for entry in filtered_data:
    review_text = entry.get("reviewText", "")
    if review_text:
        review_text = remove_punctuation(review_text)
        review_text = remove_stop_words(review_text, stop_words)
    entry["reviewText"] = review_text
    preprocessed_data.append(entry)

# Print the first few preprocessed entries
print("First few preprocessed entries:")
for entry in preprocessed_data[:3]:
    print(json.dumps(entry, indent=1))




First few preprocessed entries:
{
 "reviewText": "look stick dont rounded shape bumping Siri kept popping irritating wont buy product",
 "overall": 4.0
}
{
 "reviewText": "stickers review stick stay phone super stylish share sister",
 "overall": 5.0
}
{
 "reviewText": "awesome phone look stylish BELIEVE quality",
 "overall": 5.0
}


In [5]:
# Thematic Analysis
from collections import Counter

def analyze_sentiments(data):
    # Separate positive and negative reviews based on ratings
    positive_reviews = [entry["reviewText"] for entry in preprocessed_data if entry["overall"] in [4, 5]]
    negative_reviews = [entry["reviewText"] for entry in preprocessed_data if entry["overall"] in [1, 2]]

    return {'positive_reviews': positive_reviews, 'negative_reviews': negative_reviews}

def identify_key_phrases(reviews, num_key_phrases=10):
    # Combine all reviews into a single text
    all_text = ' '.join(reviews)
    # Tokenize the text into words
    words = all_text.split()
    # Calculate word frequencies
    word_frequencies = Counter(words)
    # Get the most common key phrases
    key_phrases = word_frequencies.most_common(num_key_phrases)
    return key_phrases

# Example usage
sentiments_data = analyze_sentiments(preprocessed_data)

# Identify key phrases in positive reviews
positive_key_phrases = identify_key_phrases(sentiments_data['positive_reviews'])
print("\nKey Phrases in Positive Reviews:")
print(positive_key_phrases)

# Identify key phrases in negative reviews
negative_key_phrases = identify_key_phrases(sentiments_data['negative_reviews'])
print("\nKey Phrases in Negative Reviews:")
print(negative_key_phrases)




Key Phrases in Positive Reviews:
[('phone', 130556), ('screen', 44621), ('battery', 43580), ('charge', 35775), ('charger', 29804), ('iPhone', 29076), ('product', 27576), ('time', 26938), ('little', 24202), ('dont', 23037)]

Key Phrases in Negative Reviews:
[('phone', 20193), ('screen', 6967), ('product', 5519), ('battery', 5316), ('time', 4449), ('dont', 4363), ('charge', 4346), ('fit', 4114), ('charger', 3448), ('didnt', 3359)]


In [6]:

def rule_based_sentiment_analysis(review_text):
    # Assign weights to positive and negative words
    weights = {
        "bad": -0.2,
        "good": 0.6,
        "amazing": 0.8,
        "excellent": 0.7,
        "terrible": -0.8,
        "awesome": 0.9,
        "beautiful": 0.7,
        "awful": -0.7,
        "poor": -0.5,
        "great": 0.7,
        "love": 0.8,
        "hate": -0.6,
        "like": 0.6,
        "dislike": -0.5,
        "best": 0.8,
        "worst": -0.8}
    # Tokenize the review text into words
    words = review_text.split()

    # Calculate sentiment score based on word weights
    sentiment_score = sum(weights.get(word.lower(), 0) for word in words)

    # Establish a threshold to categorize sentiment
    if sentiment_score > 0.5:
        sentiment_category = 'Positive'
    elif sentiment_score < 0.5:
        sentiment_category = 'Negative'
    else:
        sentiment_category = 'Neutral'

    return sentiment_category, sentiment_score

sentiment_results = []

for entry in preprocessed_data:
    review_text = entry["reviewText"]
    sentiment_label = rule_based_sentiment_analysis(review_text)
    sentiment_results.append({"reviewText": review_text, "sentiment": sentiment_label})

# Print the first few sentiment results
print("First few sentiment results:")
for result in sentiment_results[:10]:
    print(json.dumps(result, indent=1))



First few sentiment results:
{
 "reviewText": "look stick dont rounded shape bumping Siri kept popping irritating wont buy product",
 "sentiment": [
  "Negative",
  0
 ]
}
{
 "reviewText": "stickers review stick stay phone super stylish share sister",
 "sentiment": [
  "Negative",
  0
 ]
}
{
 "reviewText": "awesome phone look stylish BELIEVE quality",
 "sentiment": [
  "Positive",
  0.9
 ]
}
{
 "reviewText": "Item arrived time perfect condition buttons deal included FREE screen protector received deal wouldve nice claim comes",
 "sentiment": [
  "Negative",
  0
 ]
}
{
 "reviewText": "awesome stays looks multiple apple products especially nails helps elevated key",
 "sentiment": [
  "Positive",
  0.9
 ]
}
{
 "reviewText": "using home button easy daughter purchase worth price",
 "sentiment": [
  "Negative",
  0
 ]
}
{
 "reviewText": "described doesnt unstuck cute People driving",
 "sentiment": [
  "Negative",
  0
 ]
}
{
 "reviewText": "week charge phone 20 waste money",
 "sentiment": [
 

In [7]:
# saving the file
output_file_path = "sentiment_results.txt"

# Open the file for writing
with open(output_file_path, "w") as output_file:
    # Write each sentiment result to the file
    for result in sentiment_results:
        output_file.write(f"Review Text: {result['reviewText']}\nSentiment: {result['sentiment']}\n\n")

print("Sentiment results saved to:", output_file_path)

Sentiment results saved to: sentiment_results.txt
