In [30]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/zinuret/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [31]:
from nltk.stem import PorterStemmer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [32]:
# load data
df = pd.read_csv('labeledTrainData.tsv', sep='\t', encoding='utf-8')

In [33]:
df.to_dict('records')[0]

{'id': '5814_8',
 'sentiment': 1,
 'review': "With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />Th

In [34]:
texts = df.review.tolist()
sentiment = df.sentiment.tolist()

In [35]:
from nltk.tokenize import word_tokenize
from collections import Counter


def space_tokenize(text):
    return word_tokenize(text)
    
    
def get_vocab(texts):
    preprocessed_texts = [preprocess_text(text) for text in texts]
    tokens = [token for text in preprocessed_texts for token in space_tokenize(text)]
    word_counts = Counter(tokens)
    sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
    vocab = [word for word, _ in sorted_words]
    return vocab

class BOWencoder:
    def __init__(self, vocab=None, tokenize=space_tokenize):
        self.vocab = vocab
        self.vocab2idx = self.get_vocab2idx(vocab)
        self.tokenize = tokenize
        
    def encode_single_text(self, text):
        tokens = self.tokenize(text)
        bow_vector = [0] * len(self.vocab)
        for token in tokens:
            if token in self.vocab2idx:
                idx = self.vocab2idx[token]
                bow_vector[idx] += 1          
        return bow_vector
        
    def encode_texts(self, texts):
        encoded_texts = [self.encode_single_text(text) for text in texts]
        return encoded_texts
        
    def get_vocab2idx(self, vocab):
        vocab2idx = {word: idx for idx, word in enumerate(vocab)}
        return vocab2idx


In [36]:
class NaiveBayesClassifier:
    def __init__(self, n_classes):
        self.n_classes = n_classes
        self.class_probs = {}
        self.conditional_probabilities = {}
        self.vocab = vocab
    
    def fit(self, X, y):
        for class_label in range(self.n_classes):
            class_indices = [i for i, label in enumerate(y) if label == class_label]
            class_instances = [X[i] for i in class_indices]
            self.class_probs[class_label] = len(class_instances) / len(y)
    
            conditional_probs = {}  # Словарь для хранения условных вероятностей
            for feature_idx in range(len(self.vocab)):
                feature_prob = (sum(instance[feature_idx] for instance in class_instances) + 1) / (len(class_instances) + 2)  # Пример сглаживания Лапласа
                conditional_probs[feature_idx] = feature_prob
            self.conditional_probabilities[class_label] = conditional_probs

        
    def predict(self, X):
        predictions = []
        for instance in X:
            max_prob = -1
            predicted_class = None
            for class_label in range(self.n_classes):
                class_prob = self.class_probs[class_label]
                conditional_probs = self.conditional_probabilities[class_label]
                instance_prob = 1.0  # Инициализируйте вероятность экземпляра
                for feature_idx, feature_value in enumerate(instance):
                    conditional_prob = conditional_probs.get(feature_idx, 1.0) if feature_value else 1.0
                    instance_prob *= conditional_prob
                class_prob *= instance_prob
                if class_prob > max_prob:
                    max_prob = class_prob
                    predicted_class = class_label
            predictions.append(predicted_class)
        return predictions



In [37]:
X_train, X_test, y_train, y_test = train_test_split(texts, sentiment)
vocab = get_vocab(X_train)
bow_encoder = BOWencoder(vocab=vocab)
X_train = bow_encoder.encode_texts(X_train)
X_test = bow_encoder.encode_texts(X_test)

In [38]:
model = NaiveBayesClassifier(n_classes=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [39]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.89      0.82      0.85      3347
           1       0.81      0.88      0.84      2903

    accuracy                           0.85      6250
   macro avg       0.85      0.85      0.85      6250
weighted avg       0.85      0.85      0.85      6250


без стемминга
precision    recall  f1-score   support

           0       0.89      0.78      0.83      3606
           1       0.74      0.87      0.80      2644

    accuracy                           0.82      6250
   macro avg       0.81      0.82      0.81      6250
weighted avg       0.83      0.82      0.82      6250

In [40]:
 # Подключаем стеммер

def preprocess_text(text):
    tokens = word_tokenize(text)
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    preprocessed_text = ' '.join(stemmed_tokens)
    return preprocessed_text
