In [None]:
import os
import random
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Fetch the NLTK resources used below: the English stopword list and the
# tokenizer models that word_tokenize relies on.
for _resource in ('stopwords', 'punkt_tab'):
    nltk.download(_resource)

# --- آماده‌سازی داده‌ها ---
def load_documents(folder):
    """Read every file in *folder* and return one cleaned string per document.

    Each file is tokenized with NLTK's ``word_tokenize``. Tokens whose
    lowercase form is an English stopword are dropped, the characters in
    ``string.punctuation`` are stripped from the remaining tokens, and tokens
    left empty by that stripping are discarded. The surviving tokens are
    joined with single spaces.

    Args:
        folder: Path of the directory holding the documents (read as UTF-8).

    Returns:
        A list of cleaned document strings, ordered by file name.
    """
    docs = []
    stop_words = set(stopwords.words('english'))
    translator = str.maketrans('', '', punctuation)

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # document order (and anything seeded downstream) reproducible.
    for name in sorted(os.listdir(folder)):
        path = os.path.join(folder, name)
        # Skip sub-directories (e.g. a stray .ipynb_checkpoints); open()
        # would raise on them.
        if not os.path.isfile(path):
            continue
        with open(path, encoding='utf-8') as f:
            tokens = word_tokenize(f.read())
        stripped = (
            w.translate(translator) for w in tokens if w.lower() not in stop_words
        )
        # Pure-punctuation tokens become '' after stripping; drop them so the
        # joined text has no runs of blank separators.
        docs.append(' '.join(t for t in stripped if t))
    return docs

# --- Data loading ---
neg_docs = load_documents('data/neg')
pos_docs = load_documents('data/pos')

# --- Train/test split ---
# Use a dedicated, seeded RNG so the split (and therefore the reported
# accuracy) is the same on every run.
_rng = random.Random(42)
_rng.shuffle(neg_docs)
_rng.shuffle(pos_docs)

# Number of documents per class used for training; the rest go to the test set.
TRAIN_PER_CLASS = 800

neg_train, neg_test = neg_docs[:TRAIN_PER_CLASS], neg_docs[TRAIN_PER_CLASS:]
pos_train, pos_test = pos_docs[:TRAIN_PER_CLASS], pos_docs[TRAIN_PER_CLASS:]

# Labels: 0 = negative, 1 = positive. Sizes are derived from the actual slices
# rather than hard-coded, so labels stay aligned with the documents even when
# a class does not contain exactly 1000 files.
X_train = neg_train + pos_train
y_train = [0] * len(neg_train) + [1] * len(pos_train)

X_test = neg_test + pos_test
y_test = [0] * len(neg_test) + [1] * len(pos_test)

# --- TF-IDF vectorization and model training ---
vectorizer = TfidfVectorizer(max_features=10000)
# Fit the vocabulary on the training documents only, then reuse it for the
# test documents.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model = LogisticRegression()
model.fit(X_train_vec, y_train)

# --- Evaluation ---
y_pred = model.predict(X_test_vec)
print("Test Accuracy:", round(accuracy_score(y_test, y_pred)*100, 2), "%")

# --- Sample prediction ---
# NOTE: the sample is vectorized as raw text, whereas the training documents
# went through load_documents' stopword/punctuation cleanup.
sample = vectorizer.transform(["this product is terrible and I hate it"])
prediction = model.predict(sample)[0]

if prediction == 0:
    print("نتیجه شد 0 → نظر منفی است ❌")
else:
    print("نتیجه شد 1 → نظر مثبت است ✅")



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maleek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\maleek\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
