In [68]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import plotly.express as px

In [78]:



# Load the raw reviews from CSV and keep only the year under analysis.
# NOTE(review): the file name spells "reivew"; it is left as-is because it has
# to match whatever the file is called on disk.
csv_file_path = 'Data_collection/main_data/apple_raw_reivew.csv'
REVIEW_YEAR = 2021

data = pd.read_csv(csv_file_path)
data = data[data.Year == REVIEW_YEAR]
# Drop rows with NaN values in the 'review' column
data = data.dropna(subset=['review'])

# Convert the reviews to a list of strings. The explicit cast protects the
# TextBlob call below, which raises TypeError when handed a non-string value
# (for example a review cell that pandas parsed as a number).
documents = data['review'].astype(str).tolist()

# Calculate a polarity score for each review
sentiment_scores = [TextBlob(review).sentiment.polarity for review in documents]

# Separate reviews into positive and negative based on the sign of the score.
# Reviews scoring exactly 0 land in neither list.
positive_reviews = [review for review, score in zip(documents, sentiment_scores) if score > 0]
negative_reviews = [review for review, score in zip(documents, sentiment_scores) if score < 0]

# Both vectorizers share one configuration so the two vocabularies are built
# under identical settings and stay comparable.
VECTORIZER_OPTIONS = dict(max_features=100, max_df=0.8, stop_words='english')

# Extract frequent features from positive reviews
positive_vectorizer = CountVectorizer(**VECTORIZER_OPTIONS)
positive_X = positive_vectorizer.fit_transform(positive_reviews)

# Extract frequent features from negative reviews
negative_vectorizer = CountVectorizer(**VECTORIZER_OPTIONS)
negative_X = negative_vectorizer.fit_transform(negative_reviews)

# Get the feature names for positive and negative reviews
positive_feature_names = positive_vectorizer.get_feature_names_out()
negative_feature_names = negative_vectorizer.get_feature_names_out()

# Display the most frequent positive and negative features
print("Most Frequent Positive Features:")
print(positive_feature_names)
print("=" * 40)
print("Most Frequent Negative Features:")
print(negative_feature_names)


Most Frequent Positive Features:
['12' '13' '14' '15' '5g' 'a15' 'amazing' 'android' 'app' 'apple'
 'available' 'awesome' 'battery' 'best' 'better' 'big' 'bionic' 'buy'
 'camera' 'cameras' 'charging' 'cinematic' 'day' 'delivery' 'design'
 'device' 'display' 'don' 'excellent' 'experience' 'fast' 'feature'
 'features' 'flipkart' 'good' 'got' 'great' 'happy' 'high' 'image' 'ios'
 'iphone' 'iphones' 'just' 'larger' 'latest' 'lens' 'life' 'light' 'like'
 'll' 'look' 'love' 'loved' 'low' 'make' 'max' 'mini' 'mobile' 'mode'
 'model' 'models' 'need' 'new' 'nice' 'night' 'notch' 'offers' 'overall'
 'people' 'performance' 'phone' 'phones' 'photo' 'photos' 'price' 'pro'
 'product' 'quality' 'really' 'review' 'screen' 'sensor' 'size'
 'smartphone' 'super' 'superb' 'thanks' 'time' 'ultra' 'use' 'used'
 'using' 've' 'video' 'want' 'wide' 'worth' 'year' 'years']
Most Frequent Negative Features:
['12' '128gb' '12mp' '13' '14' '15' '2023' 'a15' 'android' 'aperture'
 'app' 'apple' 'average' 'bad' 'base'

In [79]:
# Spot-check one review from the negative list. The index 355 is hard-coded,
# so this raises IndexError when the list holds fewer than 356 entries.
negative_reviews[355]

'There are a few weak spots, however, including the lack of an always-on display, an unreliable notification system, and app bloat.'

In [80]:
# Peek at the first entry of the positive feature names (the saved output
# below shows '12').
positive_feature_names[0]

'12'

In [81]:
# Tokens excluded from the word-count dictionaries. Order matches the original
# hand-written list exactly.
elements_to_remove = [
    # Brand words and conversational filler
    'iphone', 'apple', 'phone', 'new', 'like', 'just', 'really', 'time',
    'got', 'use', 'know', 'don', 've', 'want', 'make', 'think', 'way',
    'need', 'good', 'better', 'thing', 'year',
    # Model designators
    '14', 'pro', 'max', 'mini',
    # '13' down to '3', one token per number
    *map(str, range(13, 2, -1)),
    # Remaining generic words
    'plus', 'best', 'screen', 'great', 'people', 'phones', 'years',
    'light', 'feature',
]

In [82]:
# Total each vocabulary column across all reviews, then take the first row of
# the nested-list form so the counts end up as a flat list.
positive_word_counts = positive_X.sum(axis=0).tolist()[0]
negative_word_counts = negative_X.sum(axis=0).tolist()[0]

# Pair every feature name with its total count
positive_word_count_dict = {
    word: total for word, total in zip(positive_feature_names, positive_word_counts)
}
negative_word_count_dict = {
    word: total for word, total in zip(negative_feature_names, negative_word_counts)
}


In [83]:
# Drop every token listed in elements_to_remove from both count dictionaries.
# Both filtered dictionaries are built before either name is rebound.
positive_word_count_dict, negative_word_count_dict = (
    {word: count for word, count in counts.items() if word not in elements_to_remove}
    for counts in (positive_word_count_dict, negative_word_count_dict)
)

In [84]:
# Share of each sentiment group, in percent. The denominator counts only the
# positive and negative lists, so the two values always add up to 100.
positive_percentage, negative_percentage = (
    len(group) / (len(positive_reviews) + len(negative_reviews)) * 100
    for group in (positive_reviews, negative_reviews)
)


In [85]:
# Two-row frame feeding the pie chart: one record per sentiment label.
# NOTE(review): this rebinds `data`, which earlier held the review frame;
# anything run afterwards sees the percentages instead.
data = pd.DataFrame(
    [('Positive', positive_percentage), ('Negative', negative_percentage)],
    columns=['Sentiment', 'Percentage'],
)

# Build the pie chart, one slice per sentiment label
fig = px.pie(
    data_frame=data,
    names='Sentiment',
    values='Percentage',
    title='Sentiment Distribution',
)

# Render the figure
fig.show()

In [55]:
# Build a word -> count table from positive_word_count_dict
df_positive_word_counts = pd.DataFrame.from_dict(positive_word_count_dict, orient='index', columns=['Count'])

# Keep the 20 most frequent words, highest count first
df_positive_word_counts = df_positive_word_counts.sort_values(by='Count', ascending=False).head(20)

# Share of each word among the retained rows only: the denominator is the sum
# of the rows kept above, not the count of every word in the positive reviews.
total_positive_word_count = df_positive_word_counts['Count'].sum()
df_positive_word_counts['Percentage'] = (df_positive_word_counts['Count'] / total_positive_word_count) * 100

# The name is kept for any cell that already refers to it; it holds every
# retained row (up to 20), not just ten.
top_10_positive_features = df_positive_word_counts

# Derive the heading from the real row count so it cannot drift from the table
print(f"Top {len(top_10_positive_features)} Positive Features and Their Percentages:")
print(top_10_positive_features)

Top 10 Positive Features and Their Percentages:
             Count  Percentage
camera        1480   14.399689
battery       1030   10.021405
display        727    7.073361
video          645    6.275540
features       553    5.380424
life           513    4.991243
models         457    4.446390
ios            456    4.436661
upgrade        446    4.339366
performance    414    4.028021
mode           385    3.745865
dynamic        375    3.648570
buy            363    3.531816
watch          361    3.512356
photos         359    3.492897
island         347    3.376143
16             345    3.356684
iphones        342    3.327496
design         341    3.317766
love           339    3.298307


In [56]:
# pandas is already imported as `pd` in the notebook's import cell, so the
# redundant re-import that used to open this cell has been dropped.

# Build a word -> count table from negative_word_count_dict
df_negative_word_counts = pd.DataFrame.from_dict(negative_word_count_dict, orient='index', columns=['Count'])

# Keep the 20 most frequent words, highest count first
df_negative_word_counts = df_negative_word_counts.sort_values(by='Count', ascending=False).head(20)

# Share of each word among the retained rows only: the denominator is the sum
# of the rows kept above, not the count of every word in the negative reviews.
total_negative_word_count = df_negative_word_counts['Count'].sum()
df_negative_word_counts['Percentage'] = (df_negative_word_counts['Count'] / total_negative_word_count) * 100

# The name is kept for any cell that already refers to it; it holds every
# retained row (up to 20), not just ten.
top_10_negative_features = df_negative_word_counts

# Derive the heading from the real row count so it cannot drift from the table
print(f"Top {len(top_10_negative_features)} Negative Features and Their Percentages:")
print(top_10_negative_features)


Top 10 Negative Features and Their Percentages:
         Count  Percentage
game       390   14.606742
battery    291   10.898876
camera     184    6.891386
playing    149    5.580524
video      137    5.131086
display    123    4.606742
models     122    4.569288
ios        116    4.344569
life       115    4.307116
bad        108    4.044944
upgrade    100    3.745318
little      99    3.707865
base        99    3.707865
buy         99    3.707865
15          94    3.520599
model       94    3.520599
long        92    3.445693
island      91    3.408240
day         84    3.146067
using       83    3.108614
