<a href="https://colab.research.google.com/github/mayeem-research/customer_review_ml_model./blob/main/customer_review_ml_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing necessary libraries

In [None]:
import numpy as np  # ✅ Import NumPy
import pandas as pd
from keras.callbacks import ModelCheckpoint
from keras.layers import Embedding, GlobalMaxPooling1D, Dense
from keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Embedding, GlobalMaxPooling1D, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Location of the review export on Google Drive. Reading it requires Drive to
# be mounted at /content/drive (the drive.mount cell) before this cell runs.
file_url = '/content/drive/MyDrive/Colab Notebooks/play_review.csv'

try:
    # Load the dataset; malformed CSV rows are skipped instead of aborting the read.
    df = pd.read_csv(file_url, on_bad_lines='skip')
    print("Dataset Sample:")
    print(df.head())

    # Everything below needs the raw review text.
    if 'review' not in df.columns:
        raise KeyError("The dataset must contain a 'review' column for text data.")

    # Drop rows with no review text. Left in place they would be stringified to
    # 'nan', labelled 'negative' by the heuristic below, and leak into the split.
    df = df.dropna(subset=['review']).reset_index(drop=True)

    # Add a simple 'sentiment' column if it doesn't exist.
    # Placeholder heuristic only: a review counts as positive when it contains
    # the substring 'good' or 'love' (case-insensitive), otherwise negative.
    if 'sentiment' not in df.columns:
        print("The 'sentiment' column is missing. Creating a sample 'sentiment' column.")
        review_text = df['review'].astype(str).str.lower()
        has_positive_word = review_text.str.contains('good|love', regex=True)
        df['sentiment'] = has_positive_word.map({True: 'positive', False: 'negative'})

    # Split the dataset into training and testing sets
    train_reviews, test_reviews, train_labels, test_labels = train_test_split(
        df['review'],
        df['sentiment'],
        test_size=0.2,  # 20% of the data will be used for testing
        random_state=42  # Seed for reproducibility
    )
    print("Data split successfully.")

except FileNotFoundError:
    # Typically means Drive is not mounted yet or the path is wrong.
    print(f"File not found: {file_url}. Please check the file path.")
except KeyError as e:
    print(f"KeyError: {e}")
except Exception as e:
    # NOTE(review): this keeps the notebook running but hides the traceback;
    # the split variables stay undefined when this branch is taken.
    print(f"An unexpected error occurred: {e}")


Dataset Sample:
                                           T75of src                reviewer  \
0  https://play-lh.googleusercontent.com/a-/ALV-U...           Manuel Mintah   
1  https://play-lh.googleusercontent.com/a/ACg8oc...   Eliasito Markmilliano   
2  https://play-lh.googleusercontent.com/a/ACg8oc...  Issahaku Mohammed-Awal   
3  https://play-lh.googleusercontent.com/a-/ALV-U...          Andy O. Appiah   
4  https://play-lh.googleusercontent.com/a-/ALV-U...   Michael Anyetei Adjei   

                 date                                             review  \
0   December 24, 2023  I have personally experienced the power of thi...   
1  September 22, 2023  This new GCB app has left quite an impression ...   
2    November 5, 2023  I think you deserve a 5 star. At first, I coul...   
3   December 14, 2023  I have installed this app many time in differe...   
4  September 18, 2023  This has to be one of the best banking apps ou...   

                                 AJTPZc       

## Pre-processing

In [None]:



# Toy corpus: four hand-written reviews with their labels (1 for the two
# favourable reviews, 0 for the two unfavourable ones), each repeated 84 times
# to give 336 training samples.
base_reviews = ["Great product!", "Terrible service.", "Loved it!", "Worst experience ever."]
base_labels = [1, 0, 1, 0]
train_reviews = base_reviews * 84
y_train = base_labels * 84

# Two held-out reviews to classify.
test_reviews = ["Amazing!", "Horrible."]

# Learn a unigram + bigram TF-IDF vocabulary from the training text only, then
# project both splits into that feature space.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(train_reviews)
X_test = vectorizer.transform(test_reviews)

# Sanity check: there must be exactly one feature row per label.
n_feature_rows = X_train.shape[0]
n_labels = len(y_train)
print(f"X_train samples: {n_feature_rows}")
print(f"y_train samples: {n_labels}")

# Fit the logistic-regression baseline on the TF-IDF features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Classify the held-out reviews. Both consist solely of words that never occur
# in the training corpus, so their TF-IDF rows are all zeros and the prediction
# rests on the model intercept alone; the recorded run printed [1 1].
predictions = model.predict(X_test)
print(predictions)


X_train samples: 336
y_train samples: 336
[1 1]


In [None]:
# Attach Google Drive to this Colab runtime so that files under /content/drive
# (such as the review CSV read by the data-loading cell) become readable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Model Development


In [None]:

# ✅ Reproducibility: seed Python, NumPy and the Keras backend so weight
# initialisation and batch shuffling are repeatable across runs.
SEED = 42
set_random_seed(SEED)

# ✅ Shared hyper-parameters (used by the tokenizer, the padding and the embedding)
VOCAB_SIZE = 10000  # maximum number of distinct tokens kept by the tokenizer
MAX_LEN = 200       # every review is padded / truncated to this many tokens

# ✅ Define train and test reviews (toy data: 4 phrases repeated)
train_reviews = ["Great product!", "Terrible service.", "Loved it!", "Worst experience ever."] * 84
test_reviews = ["Amazing!", "Horrible.", "Loved it.", "Never again!"] * 21

# ✅ Ensure labels match the review count
y_train = np.array([1, 0, 1, 0] * 84)  # 336 labels
y_test = np.array([1, 0, 1, 0] * 21)   # 84 labels

# ✅ Tokenization — the vocabulary is learned from the training text only.
# Words that appear only in the test reviews ("amazing", "horrible", "never",
# "again") are therefore all mapped to the single <OOV> token.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(train_reviews)

X_train_seq = tokenizer.texts_to_sequences(train_reviews)
X_test_seq = tokenizer.texts_to_sequences(test_reviews)

# ✅ Pad sequences (zeros appended after the tokens, long inputs cut at the end)
X_train_padded = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post', truncating='post')

# ✅ Check data shape
print(f"X_train shape: {X_train_padded.shape}, y_train shape: {y_train.shape}")  # Ensure they match

# ✅ Define the model: embedding -> max over the sequence axis -> sigmoid unit.
# `input_length` is intentionally not passed: it is deprecated in Keras 3 and
# the sequence length is inferred from the data on the first call to fit().
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=32),
    GlobalMaxPooling1D(),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# ✅ Checkpoint callback — keeps only the weights with the lowest val_loss so far
checkpoint = ModelCheckpoint('model.h5.keras', monitor='val_loss', save_best_only=True)

# ✅ Train the model.
# NOTE(review): the test split doubles as validation data here, so the
# checkpoint above is selected on the same data used for the final evaluation.
history = model.fit(X_train_padded, y_train,
                    epochs=10, batch_size=32,
                    validation_data=(X_test_padded, y_test),
                    callbacks=[checkpoint])


X_train shape: (336, 200), y_train shape: (336,)




Epoch 1/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 61ms/step - accuracy: 0.9188 - loss: 0.6696 - val_accuracy: 0.7500 - val_loss: 0.6719
Epoch 2/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 1.0000 - loss: 0.6059 - val_accuracy: 0.7500 - val_loss: 0.6564
Epoch 3/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 1.0000 - loss: 0.5460 - val_accuracy: 0.7500 - val_loss: 0.6410
Epoch 4/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 1.0000 - loss: 0.4839 - val_accuracy: 0.7500 - val_loss: 0.6253
Epoch 5/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 1.0000 - loss: 0.4237 - val_accuracy: 0.7500 - val_loss: 0.6096
Epoch 6/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 1.0000 - loss: 0.3641 - val_accuracy: 0.7500 - val_loss: 0.5940
Epoch 7/10
[1m11/11[0m [32m━━━━

## Model Evaluation


In [None]:
# Score the padded test sequences; the model emits one probability per review.
y_prob = model.predict(X_test_padded)  # ✅ Use Padded Test Data

# Threshold the probabilities at 0.5 to obtain hard 0/1 class labels.
y_pred = (y_prob > 0.5).astype(int)

# Debugging: a single unique value here means the model collapsed to one class.
print("Unique predictions in y_pred:", set(y_pred.flatten()))

# Performance Metrics.
# zero_division=0: when a metric is undefined (e.g. the model never predicts
# the positive class) report 0 rather than a perfect 1, so a degenerate model
# cannot look better than it is.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

print(f"Accuracy: {acc}, Precision: {prec}, Recall: {rec}, F1-score: {f1}")

# Persist the trained model in both the legacy HDF5 and the native Keras
# format. Done last so that a failed save cannot hide the metrics above.
model.save("sentiment_model.h5")
model.save('my_model.keras')


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step




Unique predictions in y_pred: {1}
Accuracy: 0.5, Precision: 0.5, Recall: 1.0, F1-score: 0.6666666666666666


Metrics


In [None]:
# Report each evaluation metric computed by the evaluation cell on its own line.
for metric_name, metric_value in (
    ('Accuracy', acc),
    ('Precision', prec),
    ('Recall', rec),
    ('F1 Score', f1),
):
    print(f'{metric_name}:', metric_value)


Accuracy: 0.5
Precision: 0.5
Recall: 1.0
F1 Score: 0.6666666666666666
