In [4]:
!pip install pandas nltk spacy scikit-learn surprise matplotlib seaborn boto3 sagemaker
!python -m spacy download en_core_web_sm

Collecting spacy
  Using cached spacy-3.7.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting sagemaker
  Using cached sagemaker-2.224.2-py3-none-any.whl.metadata (15 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Using cached murmurhash-1.0.10-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Using cached cymem-2.0.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Using cached preshed-3.0.9-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB)
Collecting thin

In [5]:
# Imports
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import spacy
import numpy as np
from sklearn.model_selection import train_test_split
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

# Download required NLTK resources.
# 'punkt_tab' is fetched in addition to 'punkt': NLTK 3.9+ loads the sentence
# tokenizer used by word_tokenize from 'punkt_tab', and the install above is
# unpinned, so either NLTK generation may be present.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Define the file path to the dataset in S3
s3_bucket = 'electronics-dataset'
file_key = 'DatafinitiElectronicsProductData.csv'
s3_path = f's3://{s3_bucket}/{file_key}'

# Load the dataset from S3 (pandas requires fsspec + s3fs for s3:// URLs)
data = pd.read_csv(s3_path)

# Drop columns that are not used by the rest of the pipeline.
data_cleaned = data.drop(columns=['colors', 'dimension', 'ean', 'manufacturer'])

# Data Type Conversion
# Ratings are coerced to numeric *before* the missing-value filter below:
# values that cannot be parsed become NaN here and are then removed together
# with ratings that were already missing in the raw file.
data_cleaned['dateAdded'] = pd.to_datetime(data_cleaned['dateAdded'], errors='coerce')
data_cleaned['dateUpdated'] = pd.to_datetime(data_cleaned['dateUpdated'], errors='coerce')
data_cleaned['reviews.rating'] = pd.to_numeric(data_cleaned['reviews.rating'], errors='coerce')

# Handling Missing Values
# Rows without review text, a review title, or a usable rating are removed.
data_cleaned = data_cleaned.dropna(subset=['reviews.text', 'reviews.title', 'reviews.rating'])

# Fill the remaining gaps with a single DataFrame-level fillna. Calling
# `df[col].fillna(..., inplace=True)` acts on a temporary column object and is
# not guaranteed to write back to the frame (it is a no-op under pandas
# Copy-on-Write), so the result is assigned explicitly instead.
data_cleaned = data_cleaned.fillna({
    'reviews.date': 'Unknown',
    'reviews.doRecommend': data_cleaned['reviews.doRecommend'].mode()[0],
    'reviews.numHelpful': data_cleaned['reviews.numHelpful'].median(),
})

# Duplicate Removal
data_cleaned = data_cleaned.drop_duplicates()

# Standardize Text Data: lowercase and trim surrounding whitespace
for text_col in ['reviews.text', 'reviews.title', 'brand', 'categories']:
    data_cleaned[text_col] = data_cleaned[text_col].str.lower().str.strip()

# Outlier Detection and Handling
def cap_outliers(df, col):
    """Cap the values of ``df[col]`` at the 1.5 * IQR fences, in place.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of the column. Values below the lower
    fence are replaced by it, values above the upper fence are replaced by
    it, and everything else (including missing values) is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    col : str
        Name of the numeric column to cap.

    Returns
    -------
    None
        The capped column is written back to ``df[col]``.
    """
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # Series.clip applies both bounds in one vectorised pass instead of
    # calling a Python lambda once per row.
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

# Cap extreme helpful-vote counts at the IQR fences
cap_outliers(df=data_cleaned, col='reviews.numHelpful')

# Feature Engineering: shared NLP resources used by the text preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    """Tokenize ``text`` and return the cleaned tokens joined by single spaces.

    A token is kept only if it is purely alphabetic and its lowercase form is
    not in ``stop_words``; every kept token is passed through
    ``lemmatizer.lemmatize`` before being joined.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        # Skip punctuation, numbers and mixed alphanumeric tokens
        if not token.isalpha():
            continue
        # Skip stop words (compared case-insensitively)
        if token.lower() in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_tokens)

# Cleaned, lemmatized version of each review for downstream text models
data_cleaned['processed_reviews'] = data_cleaned['reviews.text'].apply(preprocess_text)

# Review Length Feature: number of characters in the standardized review text.
# The vectorised .str accessor replaces a per-row Python call and propagates
# missing values instead of raising on them.
data_cleaned['review_length'] = data_cleaned['reviews.text'].str.len()


def sentiment_from_rating(rating):
    """Map a numeric rating to a coarse sentiment label.

    Parameters
    ----------
    rating : int or float
        The review rating.

    Returns
    -------
    str or None
        ``'positive'`` for ratings of 4 and above, ``'neutral'`` for ratings
        from 3 up to (but not including) 4, ``'negative'`` for anything lower,
        and ``None`` when the rating is missing.
    """
    # Every comparison with NaN is False, so without this guard a missing
    # rating would fall through to the final branch and be labelled negative.
    if pd.isna(rating):
        return None
    if rating >= 4:
        return 'positive'
    # A range check rather than `== 3`, so fractional ratings such as 3.5 are
    # neutral instead of dropping into the negative bucket.
    if rating >= 3:
        return 'neutral'
    return 'negative'

# Sentiment label derived from the star rating
data_cleaned['sentiment'] = data_cleaned['reviews.rating'].apply(sentiment_from_rating)

# Whitespace-delimited word count of each review, computed with the
# vectorised .str accessor rather than a per-row lambda.
data_cleaned['word_count'] = data_cleaned['reviews.text'].str.split().str.len()

# Number of Reviews Per Product
# NOTE(review): groupby drops NaN keys by default, so rows with a missing
# 'name' may end up with a NaN count here; confirm whether that is intended.
data_cleaned['num_reviews'] = data_cleaned.groupby('name')['reviews.text'].transform('count')

# Save the cleaned data
cleaned_data_path = 'cleaned_data_with_features.csv'
data_cleaned.to_csv(cleaned_data_path, index=False)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# Explicit %pip magic (rather than relying on IPython automagic for a bare
# `pip`) so the install always targets the running kernel's environment.
# fsspec and s3fs are the packages pandas needs to read `s3://` paths.
%pip install pandas nltk spacy scikit-learn surprise matplotlib seaborn boto3 sagemaker fsspec s3fs


Collecting fsspec
  Using cached fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Collecting s3fs
  Using cached s3fs-2024.6.1-py3-none-any.whl.metadata (1.6 kB)
Collecting aiobotocore<3.0.0,>=2.5.4 (from s3fs)
  Using cached aiobotocore-2.13.1-py3-none-any.whl.metadata (22 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from s3fs)
  Using cached aiohttp-3.9.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.5 kB)
Collecting botocore<1.35.0,>=1.34.19 (from boto3)
  Using cached botocore-1.34.131-py3-none-any.whl.metadata (5.7 kB)
Collecting aioitertools<1.0.0,>=0.5.1 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Using cached aioitertools-0.11.0-py3-none-any.whl.metadata (3.3 kB)
Collecting aiosignal>=1.1.2 (from aiohttp!=4.0.0a0,!=4.0.0a1->s3fs)
  Using cached aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting frozenlist>=1.1.1 (from aiohttp!=4.0.0a0,!=4.0.0a1->s3fs)
  Using cached frozenlist-1.4.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x8