In [40]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [41]:
from google.cloud import bigquery

# Initialize the BigQuery client
client = bigquery.Client()

# Replace with your project ID and BigQuery table details
project_id = "airy-box-431604-j9"
dataset_id = "amazon_reviews"
table_id = "clean_data"

# Construct the full table ID
table_ref = f"{project_id}.{dataset_id}.{table_id}"

# Query the table
query = f"SELECT * FROM `{table_ref}`"

# Execute the query and convert it to a pandas DataFrame
query_job = client.query(query)
df = query_job.to_dataframe()

# Display the DataFrame
df.head()

Unnamed: 0,rating,review_hash,review_text,title,helpful_votes
0,1,[86b980aec71b542df257ba4fe3f67318],With in 2 months it’s not even two months now ...,Battery is very BAD,0
1,1,[005df4a5bf1ded7842a47e7ef36e7524],The worst call quality from JBL and the right ...,"Genuine feedback guys, stay out of this !!!",0
2,1,[bcbed1ed4af799ce4fb942fd899b7325],I use this product 2 day’s but there is i foun...,Don’t purchase,0
3,1,[ee6d4d50e1332d796f1f5007ea297090],When i first received the product the right bu...,Received used product with missing accessories,0
4,1,[11309a2c34969fcd2c0ce1655c55f86a],,Coustomer care,0


In [42]:
df['helpful_votes'] = df['helpful_votes'].astype(int)

In [43]:
df.columns

Index(['rating', 'review_hash', 'review_text', 'title', 'helpful_votes'], dtype='object')

In [44]:
df.drop(columns='review_hash', inplace=True)

In [45]:
# Feature 1: Review Length
df['review_length'] = df['review_text'].apply(len)
df['review_word_count'] = df['review_text'].apply(lambda x: len(word_tokenize(x)))

In [46]:
# Feature 2: Sentiment Score
df['review_sentiment'] = df['review_text'].apply(lambda x: TextBlob(x).sentiment.polarity)
df['review_subjectivity'] = df['review_text'].apply(lambda x: TextBlob(x).sentiment.subjectivity)

In [47]:
# Feature 3: Readability Score (Flesch-Kincaid Score)
def flesch_kincaid(text):
    words = word_tokenize(text)
    sentences = len(re.split(r'[.!?]', text))
    syllables = sum([len([s for s in word if s in 'aeiou']) for word in words])
    if len(words) == 0 or sentences == 0:
        return np.nan
    return 206.835 - (1.015 * (len(words) / sentences)) - (84.6 * (syllables / len(words)))

df['flesch_kincaid'] = df['review_text'].apply(flesch_kincaid)

In [48]:
# Feature 4: TF-IDF Vectors
vectorizer = TfidfVectorizer(max_features=100)  # Use max_features to limit dimensions
tfidf_matrix = vectorizer.fit_transform(df['review_text'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
df = pd.concat([df, tfidf_df], axis=1)

In [49]:
# Feature 5: Presence of Specific Keywords
keywords = ['good', 'bad', 'recommend', 'disappoint', 'excellent']
for keyword in keywords:
    df[f'keyword_{keyword}'] = df['review_text'].apply(lambda x: int(keyword in x.lower()))

In [50]:
# Feature 6: Rating Deviation (assuming you have an average rating for the product)
# Suppose `avg_rating` is the average rating for the product
avg_rating = df['rating'].mean()
df['rating_deviation'] = df['rating'] - avg_rating

In [51]:
# Feature 7: Title Sentiment
df['title_sentiment'] = df['title'].apply(lambda x: TextBlob(x).sentiment.polarity)
df['title_length'] = df['title'].apply(len)

In [52]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [53]:
# Feature 8: Parts of Speech (POS) Tags
def pos_counts(text):
    words = word_tokenize(text)
    pos_tags = nltk.pos_tag(words)
    pos_counts = {"nouns": 0, "verbs": 0, "adjectives": 0}
    for _, tag in pos_tags:
        if tag.startswith('N'):
            pos_counts["nouns"] += 1
        elif tag.startswith('V'):
            pos_counts["verbs"] += 1
        elif tag.startswith('J'):
            pos_counts["adjectives"] += 1
    return pd.Series(pos_counts)

df[['nouns_count', 'verbs_count', 'adjectives_count']] = df['review_text'].apply(pos_counts)

In [54]:
# Feature 9: Negations Count
negations = ["not", "no", "never", "none"]
df['negation_count'] = df['review_text'].apply(lambda x: sum([x.lower().count(neg) for neg in negations]))

In [55]:
# Feature 10: Pronouns Count
pronouns = ["i", "we", "you", "he", "she", "they"]
df['pronoun_count'] = df['review_text'].apply(lambda x: sum([x.lower().count(pronoun) for pronoun in pronouns]))

In [56]:
# Feature 11: Helpful Votes to Review Length Ratio
df['helpful_to_length_ratio'] = df['helpful_votes'] / (df['review_length'] + 1)  # Adding 1 to avoid division by zero

In [57]:
# Review the extracted features
df.head()

Unnamed: 0,rating,review_text,title,helpful_votes,review_length,review_word_count,review_sentiment,review_subjectivity,flesch_kincaid,after,...,keyword_excellent,rating_deviation,title_sentiment,title_length,nouns_count,verbs_count,adjectives_count,negation_count,pronoun_count,helpful_to_length_ratio
0,1,With in 2 months it’s not even two months now ...,Battery is very BAD,0,124,29,-0.188889,0.484722,103.671609,0.0,...,0,-3.085789,-0.91,19,8,7,1,3,8,0.0
1,1,The worst call quality from JBL and the right ...,"Genuine feedback guys, stay out of this !!!",0,159,32,-0.220982,0.767857,85.87,0.0,...,0,-3.085789,0.78125,43,8,3,4,3,11,0.0
2,1,I use this product 2 day’s but there is i foun...,Don’t purchase,0,263,54,0.025,0.5,70.9325,0.0,...,0,-3.085789,0.0,14,10,13,5,7,20,0.0
3,1,When i first received the product the right bu...,Received used product with missing accessories,0,256,49,-0.066071,0.429762,47.298112,0.0,...,0,-3.085789,-0.2,46,12,11,5,3,25,0.0
4,1,,Coustomer care,0,0,0,0.0,0.0,,0.0,...,0,-3.085789,0.0,14,0,0,0,0,0,0.0


In [58]:
df.drop(columns=['review_text', 'title'], inplace=True)

In [59]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [60]:
# Define bins and labels for helpfulness
bins = [0, 1, 5, float("inf")]
labels = ["low", "medium", "high"]
df['helpfulness_class'] = pd.cut(df['helpful_votes'], bins=bins, labels=labels)

# Fill any NaNs that might have been introduced in case of missing values in 'helpful_votes'
df['helpfulness_class'].fillna("low", inplace=True)

# Encode the categorical labels to numeric values
label_encoder = LabelEncoder()
df['helpfulness_class_encoded'] = label_encoder.fit_transform(df['helpfulness_class'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['helpfulness_class'].fillna("low", inplace=True)


In [61]:
# Define features and label
X = df.drop(columns=['helpful_votes', 'helpfulness_class', 'helpfulness_class_encoded'])
y = df['helpfulness_class_encoded']

# Identify column types
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
text_cols = ['review_text', 'title']  # Text columns to vectorize

# Preprocessing for numerical data: Simple scaling
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for text data: TF-IDF Vectorization
text_transformer = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=100))  # Adjust max_features based on memory/performance
])

# Combine preprocessor with column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
    ],
)

In [62]:
X_preprocessed = preprocessor.fit_transform(X)


In [63]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [67]:
# Step 3: Define and fit the classifier
classifier = XGBClassifier(n_estimators=100, max_depth=10, random_state=42)
classifier.fit(X_preprocessed, y)

In [68]:
X_train.shape

(1221, 120)

In [69]:
# Predict on the test data
y_pred = classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=labels)  # Use labels directly

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)

Accuracy: 0.05
Classification Report:
               precision    recall  f1-score   support

         low       0.06      1.00      0.10        16
      medium       0.00      0.00      0.00       272
        high       0.00      0.00      0.00        18

    accuracy                           0.05       306
   macro avg       0.02      0.33      0.03       306
weighted avg       0.00      0.05      0.01       306



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [72]:
import os
from google.cloud import storage

# Set up Google Cloud Storage client
client = storage.Client()

# Define local and GCS paths
local_model_path_v1 = "/tmp/model_v1.bst"
local_model_path_v2 = "/tmp/model_v2.bst"
gcs_model_path_v1 = "gs://amazon-reviews-project/experiment/model_v1/model.bst"
gcs_model_path_v2 = "gs://amazon-reviews-project/experiment/model_v2/model.bst"

# Save model locally
classifier.save_model(local_model_path_v1)
classifier.save_model(local_model_path_v2)

# Function to upload to Google Cloud Storage
def upload_to_gcs(local_path, gcs_path):
    bucket_name, blob_name = gcs_path[5:].split("/", 1)
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.upload_from_filename(local_path)
    print(f"Uploaded {local_path} to {gcs_path}")

# Upload models to Google Cloud Storage
upload_to_gcs(local_model_path_v1, gcs_model_path_v1)
upload_to_gcs(local_model_path_v2, gcs_model_path_v2)


Uploaded /tmp/model_v1.bst to gs://amazon-reviews-project/experiment/model_v1/model.bst
Uploaded /tmp/model_v2.bst to gs://amazon-reviews-project/experiment/model_v2/model.bst


In [73]:
import google.cloud.aiplatform as aip

In [91]:
DEPLOY_IMAGE = "us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1.7:latest"

In [92]:
model_v1 = aip.Model.upload(
    display_name="example_",
    artifact_uri="gs://amazon-reviews-project/experiment/model_v1",
    serving_container_image_uri=DEPLOY_IMAGE,
    is_default_version=True,
    version_aliases=["v3"],
    version_description="This is the first version of the model",
)

print(model_v1)

Creating Model
Create Model backing LRO: projects/62430491516/locations/us-central1/models/4657044171608031232/operations/3711990459833253888
Model created. Resource name: projects/62430491516/locations/us-central1/models/4657044171608031232@1
To use this Model in another session:
model = aiplatform.Model('projects/62430491516/locations/us-central1/models/4657044171608031232@1')
<google.cloud.aiplatform.models.Model object at 0x7eff993254b0> 
resource name: projects/62430491516/locations/us-central1/models/4657044171608031232


In [93]:
model_v2 = aip.Model.upload(
    display_name="example_",
    artifact_uri="gs://amazon-reviews-project/experiment/model_v2",
    serving_container_image_uri=DEPLOY_IMAGE,
    parent_model=model_v1.resource_name,
    is_default_version=True,
    version_aliases=["v4"],
    version_description="This is the second version of the model",
)

print(model_v2)

Creating Model
Create Model backing LRO: projects/62430491516/locations/us-central1/models/4657044171608031232/operations/4539245416385871872
Model created. Resource name: projects/62430491516/locations/us-central1/models/4657044171608031232@2
To use this Model in another session:
model = aiplatform.Model('projects/62430491516/locations/us-central1/models/4657044171608031232@2')
<google.cloud.aiplatform.models.Model object at 0x7eff99357fa0> 
resource name: projects/62430491516/locations/us-central1/models/4657044171608031232


In [94]:
PROJECT_ID = "airy-box-431604-j9"

In [95]:
endpoint = aip.Endpoint.create(
    display_name="example_",
    project=PROJECT_ID,
    location="us-central1",
)

print(endpoint)

Creating Endpoint
Create Endpoint backing LRO: projects/62430491516/locations/us-central1/endpoints/3836557808636002304/operations/9160783058998132736
Endpoint created. Resource name: projects/62430491516/locations/us-central1/endpoints/3836557808636002304
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/62430491516/locations/us-central1/endpoints/3836557808636002304')
<google.cloud.aiplatform.models.Endpoint object at 0x7eff993277c0> 
resource name: projects/62430491516/locations/us-central1/endpoints/3836557808636002304


In [96]:
DEPLOY_COMPUTE = "n1-standard-4"
print("Train machine type", DEPLOY_COMPUTE)

Train machine type n1-standard-4


In [97]:
versions = model_v2.versioning_registry.list_versions()
for version in versions:
    print(version)

Getting versions for projects/62430491516/locations/us-central1/models/4657044171608031232
VersionInfo(version_id='1', version_create_time=DatetimeWithNanoseconds(2024, 11, 4, 4, 17, 27, 914723, tzinfo=datetime.timezone.utc), version_update_time=DatetimeWithNanoseconds(2024, 11, 4, 4, 17, 35, 469811, tzinfo=datetime.timezone.utc), model_display_name='example_', model_resource_name='projects/62430491516/locations/us-central1/models/4657044171608031232', version_aliases=['v3'], version_description='This is the first version of the model')
VersionInfo(version_id='2', version_create_time=DatetimeWithNanoseconds(2024, 11, 4, 4, 17, 34, 535857, tzinfo=datetime.timezone.utc), version_update_time=DatetimeWithNanoseconds(2024, 11, 4, 4, 17, 35, 469811, tzinfo=datetime.timezone.utc), model_display_name='example_', model_resource_name='projects/62430491516/locations/us-central1/models/4657044171608031232', version_aliases=['v4', 'default'], version_description='This is the second version of the m

In [98]:
models = aip.Model.list(filter="display_name=example_")
print("Number of models:", len(models))
print("Version ID:", models[0].version_id)

model = models[0]

Number of models: 2
Version ID: 2


In [99]:
model

<google.cloud.aiplatform.models.Model object at 0x7eff993260e0> 
resource name: projects/62430491516/locations/us-central1/models/4657044171608031232

In [100]:
response = endpoint.deploy(
    model=model,
    deployed_model_display_name="example_",
    machine_type=DEPLOY_COMPUTE,
)

print(endpoint)

Deploying Model projects/62430491516/locations/us-central1/models/4657044171608031232 to Endpoint : projects/62430491516/locations/us-central1/endpoints/3836557808636002304


InvalidArgument: 400 Invalid image "us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1.7:latest" for deployment. Please use a Model with a valid image.