In [1]:
import re
import nltk
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np

In [2]:
ps = PorterStemmer()
stop_words = set(stopwords.words('english')) - {"not"}   # ensure 'not' is kept
NEGATION_TOKENS = {"not", "no", "never", "n't", "none", "nobody", "nothing", "neither",
                   "nowhere", "hardly", "scarcely", "barely"}

In [3]:
def mark_negation_tokens(text):
    tokens = re.findall(r"\w+|[^\w\s]", text.lower(), re.UNICODE)
    negated = False
    out = []
    for tok in tokens:
        if tok in {'.', '!', '?', ';', ':'}:
            negated = False
            out.append(tok)
        elif tok in NEGATION_TOKENS:
            negated = True
            out.append(tok)
        else:
            out.append("NOT_" + tok if negated else tok)
    return " ".join(out)

In [4]:
def preprocess_text_with_negation(text):
    text = re.sub('[^a-zA-Z0-9\s\.\!\?\,\;:\'`-]', ' ', str(text))
    text = mark_negation_tokens(text)
    tokens = []
    for tok in text.split():
        if tok.startswith("NOT_"):
            word = tok[4:]
            if word not in stop_words:
                tokens.append("NOT_" + ps.stem(word))
        else:
            if tok in NEGATION_TOKENS:
                tokens.append(tok)
            elif tok not in stop_words:
                tokens.append(ps.stem(tok))
    return " ".join(tokens)

In [5]:
def split_sentences(text):
    parts = re.split(r'(?<=[.!?])\s+|\n+', text.strip())
    return [p.strip() for p in parts if p.strip()]

In [6]:
aspects_keywords = {
    'food': ['food','taste','flavor','dish','meal','menu','tasty','delicious','bland','salty','sweet','cold','fresh'],
    'service': ['service','staff','waiter','waitress','server','host','manager','attendant','crew','staffed'],
    'speed': ['quick','slow','speed','time','wait','waited','delay','waiting'],
    'hygiene': ['hygiene','clean','dirty','sanitary','unclean','hygienic','cleanliness','smell','odor'],
    'ambience': ['ambience','ambiance','atmosphere','music','decor','lighting','noise','cozy'],
    'price': ['price','cost','expensive','cheap','value','worth','pricey','costly']
}

In [7]:
aspects_regex = {
    asp: re.compile(r'\b(' + r'|'.join(map(re.escape, kws)) + r')\b', flags=re.IGNORECASE)
    for asp, kws in aspects_keywords.items()
}

In [8]:
df = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)  # ensure file exists
df['Review'] = df['Review'].astype(str)
X_raw = df['Review']
y = df['Liked'].values

In [9]:
X = X_raw.map(preprocess_text_with_negation)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0, stratify=y)

In [11]:
cv = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = cv.fit_transform(X_train)
X_test_tfidf = cv.transform(X_test)

In [12]:
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [13]:
y_pred = model.predict(X_test_tfidf)
print("Acc:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))

Acc: 0.825

Classification report:
               precision    recall  f1-score   support

           0       0.86      0.78      0.82       100
           1       0.80      0.87      0.83       100

    accuracy                           0.82       200
   macro avg       0.83      0.82      0.82       200
weighted avg       0.83      0.82      0.82       200


Confusion matrix:
 [[78 22]
 [13 87]]


In [14]:
labels = ['Not liked', 'Liked']

In [15]:
def predict_text(text):
    """Return label_int, label_str, prob_of_label"""
    proc = preprocess_text_with_negation(text)
    vec = cv.transform([proc])  # sparse
    pred = int(model.predict(vec)[0])
    prob = None
    if hasattr(model, "predict_proba"):
        try:
            probs = model.predict_proba(vec)[0]
            prob = float(probs[pred])
        except Exception:
            prob = None
    return pred, labels[pred], prob

In [16]:
def analyze_with_aspects(text):
    """
    Returns:
      {
        'overall': {'label_int': int, 'label': str, 'probability': float or None},
        'aspects': {
            'food': [ { 'sentence': str, 'label_int': int, 'label': str, 'probability': float or None }, ... ],
            ...
        }
      }
    """
    overall_int, overall_label, overall_prob = predict_text(text)
    res = {
        'overall': {'label_int': overall_int, 'label': overall_label, 'probability': overall_prob},
        'aspects': {}
    }

    sentences = split_sentences(text)
    for s in sentences:
        # find which aspects this sentence mentions
        mentioned = [asp for asp, rx in aspects_regex.items() if rx.search(s)]
        if not mentioned:
            continue
        # predict sentence sentiment (use sentence-level preprocessing)
        s_int, s_label, s_prob = predict_text(s)
        for asp in mentioned:
            res['aspects'].setdefault(asp, []).append({
                'sentence': s,
                'label_int': s_int,
                'label': s_label,
                'probability': s_prob
            })
    return res

In [18]:
# simple_aspect_prediction.py snippet

labels = ['Not liked', 'Liked']

new_data = ["food is not good"]

for review in new_data:

    # Use previously defined helper
    result = analyze_with_aspects(review)

    # ---- Overall sentiment ----
    overall = result['overall']
    label = overall['label']
    prob = overall['probability']

    print(f"Prediction: {label}")
    if prob is not None:
        print(f"Confidence: {prob:.3f}")

    # ---- Extract aspects list ----
    aspects = list(result['aspects'].keys())   # just aspect names

    if aspects:
        print("Aspects:", aspects)
    else:
        print("Aspects: []")

    print("-" * 40)


Prediction: Not liked
Confidence: 0.868
Aspects: ['food']
----------------------------------------
