In [53]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from textblob import TextBlob

# Load the review dataset. NOTE: despite the variable name, the source is an
# Excel workbook (.xlsx), which is why pd.read_excel is used instead of read_csv.
# The name `csv_file_path` is kept unchanged in case other cells refer to it.
csv_file_path = 'Data_collection/main_data/Modified_Apple_iPhones_with_all_columns_end.xlsx'
raw_data = pd.read_excel(csv_file_path)

# Keep only rows that:
#   * have a non-missing value in the 'review' column,
#   * have Year equal to 2018,
#   * are not in the 'General' category.
keep_mask = (
    raw_data['review'].notna()
    & (raw_data['Year'] == 2018)
    & (raw_data['Category'] != 'General')
)

# .copy() yields an independent frame, so columns added to `data` in later cells
# do not trigger SettingWithCopyWarning, and re-running this cell always starts
# from the untouched `raw_data`.
data = raw_data.loc[keep_mask].copy()
data.head()

Unnamed: 0,Year,Apple iPhone model,review,data source,Category,Sentiment Label,Apple annual revenue($bn),iPhone annual revenue($bn),iPhone annual sales(mm),Active iPhone units in US(mm),...,AirPod annual sales(mm),Mac annual revenue($bn),Mac annual sales(mm),Apple Services annual revenue($bn),Number of subscribers to iOS apps(mm),Apple Pay annual transaction volume(bn),Apple Music annual subscribers(mm),Apple News active users(mm),No. of iPhone users(billion),iPhone’s Market Share(%)
68943,2018,iPhone XR and XS,market’ appetit face id front camera notch all...,Blogs,Camera,Neutral,265.5,166.2,217.7,888,...,35,25.2,18.0,36.9,325,15,40,85,0.888,45.2
68947,2018,iPhone XR and XS,improv popular camera world lay groundwork futur,Blogs,Camera,Positive,265.5,166.2,217.7,888,...,35,25.2,18.0,36.9,325,15,40,85,0.888,45.2
68948,2018,iPhone XR and XS,doubl notch price,Blogs,Price,Neutral,265.5,166.2,217.7,888,...,35,25.2,18.0,36.9,325,15,40,85,0.888,45.2
68951,2018,iPhone XR and XS,iphon xs xs max effect ident specif — notwiths...,Blogs,Battery,Neutral,265.5,166.2,217.7,888,...,35,25.2,18.0,36.9,325,15,40,85,0.888,45.2
68962,2018,iPhone XR and XS,appl fought “plan obsolescence” argument least...,Blogs,Software,Positive,265.5,166.2,217.7,888,...,35,25.2,18.0,36.9,325,15,40,85,0.888,45.2


In [54]:

# Review texts to be scored, in the same row order as `data`.
documents = data['review'].tolist()

# Calculate a TextBlob polarity score for each review.
# str() is a no-op for text and guards against non-string cells (e.g. a purely
# numeric review read from the spreadsheet), which TextBlob rejects with TypeError.
sentiment_scores = [TextBlob(str(review)).sentiment.polarity for review in documents]

# assign() returns a new frame instead of writing into a filtered slice, which
# avoids SettingWithCopyWarning and keeps this cell safe to re-run.
data = data.assign(sentiment_score=sentiment_scores)

# Create a user-item matrix with reviews as rows and categories as columns.
# pivot_table aggregates with the mean by default, so identical review texts in
# the same category collapse into one averaged cell.
# Missing (review, category) combinations are filled with 0 (neutral sentiment).
user_item_matrix = (
    data
    .pivot_table(index='review', columns='Category', values='sentiment_score')
    .fillna(0)
)

# Collaborative Filtering - User-Based: pairwise cosine similarity between rows.
user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(
    user_similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)

# Matrix Factorization - Singular Value Decomposition (SVD)
# n_components must be less than or equal to the number of features (category
# columns), so cap the requested 5 at the number of columns actually present.
# random_state pins the randomized solver so the factors are reproducible.
n_components = min(5, user_item_matrix.shape[1])
svd = TruncatedSVD(n_components=n_components, random_state=42)
user_factors = svd.fit_transform(user_item_matrix)
item_factors = svd.components_


In [55]:

# Disabled diagnostic prints: uncomment to inspect user_similarity_df,
# user_factors and item_factors.
# print("User-Based Collaborative Filtering Similarity Matrix:")
# print(user_similarity_df.head())
# print("=" * 40)
# print("Matrix Factorization - User Factors:")
# print(user_factors)
# print("=" * 40)
# print("Matrix Factorization - Item Factors:")
# print(item_factors)
# print("=" * 40)



In [56]:
# Mean sentiment score per Category, as a one-column DataFrame indexed by Category.
sentiment_scores_df = data.groupby('Category')['sentiment_score'].mean().to_frame()

# idxmax/idxmin return the index label of the extreme value directly, so the
# frame does not need to be fully sorted twice; ties resolve to the first
# occurrence rather than depending on sort order.
# NOTE: the variables say "product", but the values are Category labels.
category_means = sentiment_scores_df['sentiment_score']
highly_rated_product = category_means.idxmax()
lowest_rated_product = category_means.idxmin()

print("Highly Rated Product based on Average Sentiment:", highly_rated_product)
print("Lowest Rated Product based on Average Sentiment:", lowest_rated_product)

Highly Rated Product based on Average Sentiment: Price
Lowest Rated Product based on Average Sentiment: Design


In [57]:
# Display the per-Category mean sentiment_score table.
sentiment_scores_df

Unnamed: 0_level_0,sentiment_score
Category,Unnamed: 1_level_1
Battery,0.222519
Camera,0.28028
Design,0.163987
Price,0.34113
Software,0.202845


In [58]:
# Total of the per-Category mean sentiment scores.
sentiment_scores_df.loc[:, 'sentiment_score'].sum()

1.2107601491937823