In [77]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from textblob import TextBlob

# Load the review workbook into a pandas DataFrame.
# NOTE: the source is an Excel (.xlsx) file read with pd.read_excel; the
# variable keeps its `csv_file_path` name so any other cell that refers to it
# keeps working.
csv_file_path = 'Data_collection/main_data/Modified_Apple_iPhones_with_all_columns_end.xlsx'
data = pd.read_excel(csv_file_path)

# Drop rows with NaN values in the 'review' column.
data = data.dropna(subset=['review'])

# Keep only rows whose Year is 2020 and whose Category is not 'General'.
# The trailing .copy() makes `data` an independent frame rather than a slice
# of the loaded one, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning.
data = data[(data['Year'] == 2020) & (data['Category'] != 'General')].copy()
data.head()

Unnamed: 0,Year,Apple iPhone model,review,data source,Category,Sentiment Label,Apple annual revenue($bn),iPhone annual revenue($bn),iPhone annual sales(mm),Active iPhone units in US(mm),...,AirPod annual sales(mm),Mac annual revenue($bn),Mac annual sales(mm),Apple Services annual revenue($bn),Number of subscribers to iOS apps(mm),Apple Pay annual transaction volume(bn),Apple Music annual subscribers(mm),Apple News active users(mm),No. of iPhone users(billion),iPhone’s Market Share(%)
47410,2020,iPhone 12 series,good camera key sell point smartphon day – som...,Blogs,Camera,Positive,274.5,137.7,196.9,1042,...,114,28.4,20.2,53.6,620,90,72,100,1.0,46.9
47414,2020,iPhone 12 series,iphon 13 pro max apple’ websit iphon 13 pro ma...,Blogs,Price,Positive,274.5,137.7,196.9,1042,...,114,28.4,20.2,53.6,620,90,72,100,1.0,46.9
47417,2020,iPhone 12 series,spend day four iphon favourit iphon 13 pro bat...,Blogs,Camera,Positive,274.5,137.7,196.9,1042,...,114,28.4,20.2,53.6,620,90,72,100,1.0,46.9
47421,2020,iPhone 12 series,elsewher lightn charg port silenc volum power ...,Blogs,Camera,Neutral,274.5,137.7,196.9,1042,...,114,28.4,20.2,53.6,620,90,72,100,1.0,46.9
47422,2020,iPhone 12 series,updat later date i’v time put pace properli – ...,Blogs,Camera,Positive,274.5,137.7,196.9,1042,...,114,28.4,20.2,53.6,620,90,72,100,1.0,46.9


In [78]:

# Review texts, in the same row order as `data`.
documents = list(data['review'])

# Calculate a sentiment polarity score for each review.
# str() guards against non-text cells (e.g. a review that Excel stored as a
# number), which TextBlob would otherwise reject with a TypeError.
sentiment_scores = [TextBlob(str(review)).sentiment.polarity for review in documents]

# assign() builds a new frame instead of writing into a possibly filtered
# slice, which avoids pandas' SettingWithCopyWarning.
data = data.assign(sentiment_score=sentiment_scores)

# Create a user-item matrix with reviews as rows and categories as columns.
# Identical review texts within a category are combined by pivot_table's
# default aggregation (mean).
user_item_matrix = data.pivot_table(index='review', columns='Category', values='sentiment_score')

# Fill NaN values with 0 (treated as neutral sentiment).
# NOTE(review): this makes "review has no score in this category"
# indistinguishable from a genuinely neutral score of 0.
user_item_matrix = user_item_matrix.fillna(0)

# Collaborative Filtering - User-Based: pairwise cosine similarity between rows.
user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

# Matrix Factorization - Truncated Singular Value Decomposition (SVD).
# n_components must not exceed the number of features (category columns), so
# cap it instead of hard-coding 5; a fixed random_state makes the randomized
# solver reproducible across runs.
n_svd_components = min(5, user_item_matrix.shape[1])
svd = TruncatedSVD(n_components=n_svd_components, random_state=42)
user_factors = svd.fit_transform(user_item_matrix)
item_factors = svd.components_


In [79]:

# Disabled diagnostics: uncomment to print the head of user_similarity_df and
# the full user_factors / item_factors arrays.
# NOTE(review): presumably left commented out because the output is large —
# confirm, or delete this cell.
# print("User-Based Collaborative Filtering Similarity Matrix:")
# print(user_similarity_df.head())
# print("=" * 40)
# print("Matrix Factorization - User Factors:")
# print(user_factors)
# print("=" * 40)
# print("Matrix Factorization - Item Factors:")
# print(item_factors)
# print("=" * 40)



In [80]:
# Mean sentiment score per Category, kept as a one-column DataFrame
# (column 'sentiment_score', indexed by Category).
sentiment_scores_df = data.groupby('Category')['sentiment_score'].mean().to_frame()

# idxmax / idxmin return the index label of the highest / lowest mean directly,
# without sorting the whole frame twice; on ties the first label wins.
# NOTE: the values are Category labels even though the variable names and the
# printed text say "Product".
highly_rated_product = sentiment_scores_df['sentiment_score'].idxmax()
lowest_rated_product = sentiment_scores_df['sentiment_score'].idxmin()

print("Highly Rated Product based on Average Sentiment:", highly_rated_product)
print("Lowest Rated Product based on Average Sentiment:", lowest_rated_product)

Highly Rated Product based on Average Sentiment: Price
Lowest Rated Product based on Average Sentiment: Software


In [81]:
# Display the per-category mean sentiment table.
sentiment_scores_df

Unnamed: 0_level_0,sentiment_score
Category,Unnamed: 1_level_1
Battery,0.144073
Camera,0.195078
Design,0.15861
Price,0.267648
Software,0.102444


In [82]:
# Add up the per-category mean sentiment scores.
# NOTE(review): a sum of category means is neither an overall mean nor a
# total over reviews — confirm this is the quantity intended.
sentiment_scores_df.loc[:, 'sentiment_score'].sum()

0.867853246687638