In [None]:
import nltk
# Tokenizer data used by word_tokenize / sent_tokenize.
# 'punkt' covers older NLTK releases; recent releases (3.9+) load the
# pickle-free 'punkt_tab' package instead, so fetch both to work on either.
nltk.download('punkt')
nltk.download('punkt_tab')

In [6]:
import math
import re

import pandas as pd
from nltk.stem import PorterStemmer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [7]:
# Read the labelled reviews (tab-separated, UTF-8) into a DataFrame.
df = pd.read_csv(
    'labeledTrainData.tsv',
    encoding='utf-8',
    sep='\t',
)

In [None]:
# Show the first row as a plain dict. Restricting to head(1) first avoids
# converting every row of the frame just to look at one record.
df.head(1).to_dict('records')[0]

In [9]:
# Pull the review strings and their sentiment labels out as parallel lists.
texts = df['review'].tolist()
sentiment = df['sentiment'].tolist()

In [10]:

def preprocess_text(text):
    """Tokenize ``text``, Porter-stem every token and re-join with spaces.

    Args:
        text: Raw input string.

    Returns:
        A single string made of the stemmed tokens separated by one space.
    """
    porter = PorterStemmer()
    return ' '.join(porter.stem(word) for word in word_tokenize(text))

In [11]:
from nltk.tokenize import word_tokenize
from collections import Counter


def space_tokenize(text):
    """Split ``text`` into a list of tokens.

    Despite the name, this does not split on whitespace: it delegates to
    NLTK's ``word_tokenize``.
    """
    tokens = word_tokenize(text)
    return tokens
    
    
def get_vocab(texts):
    """Build a vocabulary ordered from most to least frequent token.

    Every text is run through ``preprocess_text`` and then split with
    ``space_tokenize``; the resulting tokens are counted over all texts.

    Args:
        texts: Iterable of raw text strings.

    Returns:
        List of distinct tokens sorted by descending count. Tokens with equal
        counts keep the order in which they were first seen.
    """
    counts = Counter()
    for raw_text in texts:
        counts.update(space_tokenize(preprocess_text(raw_text)))
    # most_common() is a stable sort on the count, highest first.
    return [token for token, _ in counts.most_common()]

class BOWencoder:
    """Bag-of-words encoder: turns a text into a vector of token counts.

    Position ``i`` of an encoded vector holds how many times ``vocab[i]``
    occurs in the tokenized text. Tokens that are not in the vocabulary are
    ignored.

    Args:
        vocab: Iterable of vocabulary tokens. ``None`` (the default) is
            treated as an empty vocabulary, so every text encodes to ``[]``.
        tokenize: Callable mapping a string to an iterable of tokens.
    """

    def __init__(self, vocab=None, tokenize=space_tokenize):
        # Materialize once so generators work and len()/enumerate() agree;
        # None becomes an empty vocabulary instead of crashing below.
        self.vocab = [] if vocab is None else list(vocab)
        self.vocab2idx = self.get_vocab2idx(self.vocab)
        self.tokenize = tokenize

    def encode_single_text(self, text):
        """Return the count vector (a list of ints, one per vocab entry) for ``text``."""
        bow_vector = [0] * len(self.vocab)
        lookup = self.vocab2idx
        for token in self.tokenize(text):
            idx = lookup.get(token)
            if idx is not None:
                bow_vector[idx] += 1
        return bow_vector

    def encode_texts(self, texts):
        """Encode every text in ``texts``; returns a list of count vectors."""
        return [self.encode_single_text(text) for text in texts]

    def get_vocab2idx(self, vocab):
        """Map each vocabulary token to its position; ``None`` gives an empty map."""
        if vocab is None:
            return {}
        return {word: idx for idx, word in enumerate(vocab)}


In [12]:
class NaiveBayesClassifier:
    """Naive Bayes classifier over bag-of-words vectors.

    Each feature is treated as the binary event "this word occurs in the
    document". Only features that are present in an instance contribute to
    its score; absent features are ignored.

    Args:
        n_classes: Number of classes; labels must be ``0 .. n_classes - 1``.
        vocab: Optional vocabulary. When given, its length is the number of
            features; otherwise the number of features is taken from the
            first training vector in ``fit``.

    Attributes set by ``fit``:
        class_probs: ``{class_label: prior probability}``.
        conditional_probabilities: ``{class_label: {feature_idx: P(feature present | class)}}``.
    """

    def __init__(self, n_classes, vocab=None):
        self.n_classes = n_classes
        self.class_probs = {}
        self.conditional_probabilities = {}
        # Passed in explicitly instead of being read from a notebook global.
        self.vocab = vocab

    @staticmethod
    def _safe_log(p):
        """Natural log that maps non-positive probabilities to -inf."""
        return math.log(p) if p > 0 else -math.inf

    def fit(self, X, y):
        """Estimate class priors and per-class feature probabilities.

        Args:
            X: Sequence of feature vectors (token counts).
            y: Sequence of integer class labels, same length as ``X``.

        Raises:
            ValueError: If ``X`` and ``y`` differ in length or are empty.
        """
        if len(X) != len(y):
            raise ValueError("X and y must have the same length")
        if len(y) == 0:
            raise ValueError("cannot fit on an empty training set")

        n_features = len(self.vocab) if self.vocab is not None else len(X[0])

        for class_label in range(self.n_classes):
            class_instances = [row for row, label in zip(X, y) if label == class_label]
            n_docs = len(class_instances)
            self.class_probs[class_label] = n_docs / len(y)

            # Count documents that CONTAIN each feature (not raw occurrences),
            # so the smoothed ratio below is a real probability in (0, 1).
            doc_freq = [0] * n_features
            for row in class_instances:
                for feature_idx in range(n_features):
                    if row[feature_idx]:
                        doc_freq[feature_idx] += 1

            # Laplace smoothing for a binary event: (k + 1) / (n + 2).
            self.conditional_probabilities[class_label] = {
                feature_idx: (freq + 1) / (n_docs + 2)
                for feature_idx, freq in enumerate(doc_freq)
            }

    def predict(self, X):
        """Return the most probable class label for every instance in ``X``.

        Scores are accumulated as sums of log-probabilities; multiplying
        thousands of raw probabilities underflows to 0.0 and makes every
        class tie. On ties the lowest class label wins.
        """
        class_labels = range(self.n_classes)
        log_priors = {
            label: self._safe_log(self.class_probs[label]) for label in class_labels
        }
        log_likelihoods = {
            label: {
                idx: self._safe_log(p)
                for idx, p in self.conditional_probabilities[label].items()
            }
            for label in class_labels
        }

        predictions = []
        for instance in X:
            # Only features that occur in the instance contribute to the score.
            present = [idx for idx, value in enumerate(instance) if value]
            predicted_class = None
            best_score = -math.inf
            for label in class_labels:
                table = log_likelihoods[label]
                # Features unknown to the model contribute log(1) = 0.
                score = log_priors[label] + sum(table.get(idx, 0.0) for idx in present)
                if predicted_class is None or score > best_score:
                    predicted_class = label
                    best_score = score
            predictions.append(predicted_class)
        return predictions



In [13]:
# Hold out a test split; the fixed seed makes the split reproducible.
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    texts, sentiment, random_state=42
)

# The vocabulary comes from the training texts only (get_vocab stems them).
vocab = get_vocab(X_train_texts)
bow_encoder = BOWencoder(vocab=vocab)

# The vocabulary holds stemmed tokens, so texts must go through the same
# preprocessing before encoding; otherwise unstemmed words never match it.
X_train = bow_encoder.encode_texts([preprocess_text(text) for text in X_train_texts])
X_test = bow_encoder.encode_texts([preprocess_text(text) for text in X_test_texts])

In [14]:
# Train the classifier on the encoded training split and predict the test split.
model = NaiveBayesClassifier(n_classes=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 on the held-out reviews.
print(classification_report(y_test, y_pred))

In [None]:
# Print full reviews
def classify_and_print_description(text):
    """Classify one review and print it under a positive/negative heading.

    The review is preprocessed, encoded with the notebook-level
    ``bow_encoder`` and classified with the notebook-level ``model``.

    Args:
        text: Raw review text.
    """
    # NOTE(review): the headings mention the word 'view', but nothing here
    # checks that the review contains it - confirm whether a filter is intended.
    preprocessed_text = preprocess_text(text)
    text_vector = bow_encoder.encode_single_text(preprocessed_text)
    prediction = model.predict([text_vector])

    if prediction[0] == 1:
        print("Положительное окружение слова 'view':")
    else:
        print("Отрицательное окружение слова 'view':")

    # Print the review that was actually classified, not the first
    # positive/negative row of the dataset.
    print(text)

# Classify every review in `texts` and print the result for each one.
for text in texts:
    classify_and_print_description(text)


In [None]:
# Print individual sentences
# (nltk import and 'punkt' download repeat what the first cell already does)
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def classify_and_print_description(text):
    """Classify and print every sentence of ``text`` that contains the word 'view'.

    The text is split into sentences; each sentence containing 'view' as a
    whole word (case-insensitive) is preprocessed, encoded with the
    notebook-level ``bow_encoder``, classified with the notebook-level
    ``model`` and printed under a positive or negative heading.

    Args:
        text: Raw review text.
    """
    for sentence in sent_tokenize(text):
        # Whole-word match: a plain substring test would also fire on
        # "review", "viewer", "interview", ...
        if not re.search(r"\bview\b", sentence, flags=re.IGNORECASE):
            continue

        preprocessed_sentence = preprocess_text(sentence)
        sentence_vector = bow_encoder.encode_single_text(preprocessed_sentence)
        prediction = model.predict([sentence_vector])

        if prediction[0] == 1:
            print(f"Положительное окружение слова 'view': {sentence}")
        else:
            print(f"Отрицательное окружение слова 'view': {sentence}")
            

# Run the sentence-level classifier defined above over every review in `texts`.
for text in texts:
    classify_and_print_description(text)
