In [20]:
import pandas as pd
import numpy as np
import fasttext
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_class_weight
import re
import pickle

In [2]:
# Location of the FastText binary (presumably the pre-trained Bengali 300-d
# vectors, judging by the file name -- confirm against the download source).
FASTTEXT_MODEL_PATH = 'cc.bn.300.bin'
fasttext_model = fasttext.load_model(FASTTEXT_MODEL_PATH)

In [3]:
# Read the review dataset and show its (rows, columns) shape as the cell output.
REVIEWS_CSV_PATH = 'bangla_food_review.csv'
df = pd.read_csv(REVIEWS_CSV_PATH)
df.shape

(3339, 2)

In [4]:
# Patterns are compiled once here rather than on every call to preprocess().
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
    "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
    "\U0001F700-\U0001F77F"  # Alchemical Symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    # NOTE(review): this single range spans U+24C2 through U+1F251, so it also
    # strips every non-emoji character in between; narrow it if such text
    # must survive preprocessing.
    "\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE
)
_PUNCTUATION_PATTERN = re.compile(r'[#,।]')
_WHITESPACE_PATTERN = re.compile(r'\s+')


def preprocess(sentence):
    """Normalise one review before it is tokenised on whitespace.

    Steps, in order:
      1. remove characters matched by the emoji pattern,
      2. delete '#', ',' and the danda '।',
      3. collapse every whitespace run (including newlines) to one space,
      4. strip the ends and lower-case.

    Whitespace is collapsed *after* punctuation is deleted so that input such
    as "a , b" yields "a b" rather than "a  b" with a doubled space.

    Args:
        sentence: The raw review text. ``None`` and float NaN (how pandas
            represents a missing cell) are treated as an empty review; any
            other non-string value is converted with ``str()``.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    if not isinstance(sentence, str):
        # `x != x` is only true for NaN; guard with isinstance so exotic
        # objects are never asked to compare themselves.
        is_nan = isinstance(sentence, float) and sentence != sentence
        if sentence is None or is_nan:
            return ''
        sentence = str(sentence)

    sentence = _EMOJI_PATTERN.sub('', sentence)
    sentence = _PUNCTUATION_PATTERN.sub('', sentence)
    sentence = _WHITESPACE_PATTERN.sub(' ', sentence)
    return sentence.strip().lower()

In [5]:
# Add a cleaned copy of every review next to the raw text.
df['processed_text'] = df['Review'].map(preprocess)

In [6]:
# Convert text to FastText embeddings
def text_to_embedding(text, model, embedding_dim=300):
    """Average the vectors of the in-vocabulary words of ``text``.

    Args:
        text: Whitespace-separated text (already preprocessed by the caller).
        model: Object exposing ``get_word_vector(word)``. If it also exposes
            ``get_word_id(word)`` (as fastText models do; it returns -1 for
            unknown words) that is used for the vocabulary check. Otherwise
            the check falls back to ``word in model``.
        embedding_dim: Length of the zero vector returned when no word of
            ``text`` is in the vocabulary.

    Returns:
        A 1-D array: the element-wise mean of the word vectors, or
        ``np.zeros(embedding_dim)`` when there is nothing to average.
    """
    # fastText's Python wrapper answers `word in model` by searching its full
    # vocabulary list, which is linear in vocabulary size for every token.
    # get_word_id is a direct dictionary lookup, so prefer it when present.
    get_word_id = getattr(model, 'get_word_id', None)
    if callable(get_word_id):
        def in_vocabulary(word):
            return get_word_id(word) >= 0
    else:
        def in_vocabulary(word):
            return word in model

    embeddings = [
        model.get_word_vector(word)
        for word in text.split()
        if in_vocabulary(word)
    ]
    if embeddings:
        return np.mean(embeddings, axis=0)
    return np.zeros(embedding_dim)

In [7]:
# Stratified 80/20 split of the cleaned reviews; the fixed seed keeps it repeatable.
features = df["processed_text"]
labels = df["Label"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [8]:
# Embed each training review, printing a progress line after every 100 texts.
X_train_embeddings = []
for count, review in enumerate(X_train, start=1):
    vector = text_to_embedding(review, fasttext_model)
    X_train_embeddings.append(vector)
    if count % 100 == 0:
        print(f'Processed {count} texts')

# Stack the per-review vectors into one 2-D array for scikit-learn.
X_train_embeddings = np.array(X_train_embeddings)
y = y_train

Processed 100 texts
Processed 200 texts
Processed 300 texts
Processed 400 texts
Processed 500 texts
Processed 600 texts
Processed 700 texts
Processed 800 texts
Processed 900 texts
Processed 1000 texts
Processed 1100 texts
Processed 1200 texts
Processed 1300 texts
Processed 1400 texts
Processed 1500 texts
Processed 1600 texts
Processed 1700 texts
Processed 1800 texts
Processed 1900 texts
Processed 2000 texts
Processed 2100 texts
Processed 2200 texts
Processed 2300 texts
Processed 2400 texts
Processed 2500 texts
Processed 2600 texts


In [13]:
# Embed the held-out reviews with the same function used for the training set.
test_vectors = [text_to_embedding(review, fasttext_model) for review in X_test]
X_test_embeddings = np.array(test_vectors)

In [10]:
# Compute 'balanced' weights for the classes present in the training labels.
class_labels = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=class_labels, y=y_train)
# Key each weight by its actual class label. compute_class_weight returns the
# weights in the order of `classes`, so zipping the two keeps them aligned.
# Keying by positional index instead would only be right when the labels
# happen to be exactly 0..n-1.
class_weights_dict = dict(zip(class_labels.tolist(), class_weights))

In [14]:
# SVC with default hyperparameters apart from the per-class weights.
svc_params = {'class_weight': class_weights_dict}
model = SVC(**svc_params)
model.fit(X=X_train_embeddings, y=y_train)

In [15]:
# Predict a label for each held-out embedding; the bare name on the last line
# makes the notebook display the resulting array.
y_pred = model.predict(X=X_test_embeddings)
y_pred

array([0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,

In [16]:
# Put the held-out text, its true label and the predicted label side by side
# and write them to CSV. 'Review' holds the preprocessed text, since X_test
# was split from the 'processed_text' column.
result_columns = {
    'Review': X_test,
    'Actual_Label': y_test,
    'Predicted_Label': y_pred,
}
df_results = pd.DataFrame(result_columns)
df_results.to_csv('predictions.csv', index=False)

In [17]:
# Score the held-out predictions: overall accuracy, then the per-class report.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print(f'Accuracy: {test_accuracy}')
print(test_report)

Accuracy: 0.7844311377245509
              precision    recall  f1-score   support

           0       0.72      0.75      0.74       268
           1       0.83      0.81      0.82       400

    accuracy                           0.78       668
   macro avg       0.78      0.78      0.78       668
weighted avg       0.79      0.78      0.78       668



In [18]:
# Re-join the train and test portions so cross-validation sees every sample.
X_embeddings = np.concatenate([X_train_embeddings, X_test_embeddings])
y_ = np.concatenate([y_train, y_test])

# Stratified 5-fold cross-validation of the same estimator, scored by accuracy.
skf = StratifiedKFold(n_splits=5)
scores = cross_val_score(
    estimator=model,
    X=X_embeddings,
    y=y_,
    scoring='accuracy',
    cv=skf,
)

mean_accuracy = scores.mean()
print(f'Cross-validated accuracy scores: {scores}')
print(f'Mean accuracy: {mean_accuracy}')

Cross-validated accuracy scores: [0.79341317 0.77994012 0.77245509 0.81437126 0.7826087 ]
Mean accuracy: 0.7885576672741473


In [19]:
# Example text for prediction: one sample review mixing Bengali and English,
# defined inline so the prediction steps that follow can be run on a single input.
new_text = '''ঘুরে এলাম পুরান ঢাকার সবচেয়ে কালারফুল rooftop restaurant প্রোফাইল জোন স্মোকিং জোন কাপল জোন প্রাইভেট পার্টি জোন ফুল হল পার্টির জন্য এসি লাউঞ্জ রুফটপ ক্যান্ডেল লাইট পার্টি জোনসবকিছুই এক রেস্টুরেন্টে আপনি পেয়ে যাবেন তাছাড়া দুপুরে আর রাতে পার্টি বুক করলে ১৫% এর মতন ডিসকাউন্ট পাবেন পার্টি ছাড়াও ১২% ডিসকাউন্ট পাবেন যেকোন খাবারে মেম্বারশিপ কার্ডের জন্য the dining lounge এর সবচেয়ে ইউনিক আইটেম ডিপ ডিশ পিজ্জা ( বাংলাদেশের গর্ব ) এখানে এসে এই পিজ্জা না খেলে আপনার জীবনটাই বৃথা হবে এই পিজ্জা বাংলাদেশে তারাই এনেছে প্রথম বউকে নিয়ে তাই খুব সুন্দর সময় কাটালাম কাপল জোনে সাথে ভালো খাওয়া দাওয়া হলো tikka deep dish pizza12"" the dining lounge wari 2.5kg8/1 rankin street wari just opposite to shawpno."'''

# Push the sample review through the same steps as the training data:
# clean -> embed -> reshape to a one-row batch -> classify.
new_text_processed = preprocess(new_text)
new_text_embedding = text_to_embedding(new_text_processed, fasttext_model)
new_text_embedding = new_text_embedding.reshape(1, -1)  # predict() expects 2-D input

prediction = model.predict(new_text_embedding)

# Show the single predicted label.
predicted_label = prediction[0]
print(f'Prediction: {predicted_label}')

Prediction: 1


In [22]:
# Serialise the fitted classifier to disk with pickle. Only `model` is saved;
# the FastText vectors and the preprocessing code are not part of the file.
MODEL_OUTPUT_PATH = 'Restaurant_review_model.pkl'
with open(MODEL_OUTPUT_PATH, 'wb') as model_file:
    pickle.dump(model, model_file)

print("Model saved successfully as 'Restaurant_review_model.pkl' using pickle.")

Model saved successfully as 'Restaurant_review_model.pkl' using pickle.
