In [1]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub; in a notebook this renders an
# interactive login widget.
# NOTE(review): the dataset loaded later looks like a public one, so this
# interactive step may be unnecessary for a clean "Run All" — confirm.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
from datasets import load_dataset

In [3]:
# Load the `stanfordnlp/imdb` dataset; its `train` and `test` splits are
# converted to DataFrames in a later cell.
ds = load_dataset("stanfordnlp/imdb")

In [4]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
# Convert both dataset splits to pandas DataFrames and report their shapes.
train, test = (ds[split].to_pandas() for split in ('train', 'test'))
print(train.shape, test.shape)

(25000, 2) (25000, 2)


In [6]:
# Preview the first ten training rows (review text and its label).
train.head(10)

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0
5,I would put this at the top of my list of film...,0
6,Whoever wrote the screenplay for this movie ob...,0
7,"When I first saw a glimpse of this movie, I qu...",0
8,"Who are these ""They""- the actors? the filmmake...",0
9,This is said to be a personal film for Peter B...,0


### Модель 1: Naive Bayes + CountVectorizer

In [8]:
# Targets: the dataset's `label` column, used unchanged.
y_train, y_test = train['label'], test['label']

# Bag-of-words counts. The vocabulary is fitted on the training reviews only
# and then reused to encode the test reviews.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train['text'])
X_test = vectorizer.transform(test['text'])

In [9]:
# Number of entries in the vocabulary learned by the fitted vectorizer.
len(vectorizer.vocabulary_)

74849

In [10]:
# Shape of the training feature matrix: (documents, vocabulary entries).
X_train.shape

(25000, 74849)

In [11]:
# Shape of the training targets; should have one entry per training document.
y_train.shape

(25000,)

In [12]:
def randomized_cv(model, param_grid, x_train, y_train, n_iter=10, cv=5, random_state=42):
    """Tune ``model`` with a randomized hyper-parameter search.

    Runs ``RandomizedSearchCV`` scored by accuracy, prints the best
    cross-validated score, and returns the best estimator found.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style estimator to tune.
    param_grid : dict
        Mapping of parameter name to the candidate values (or distributions)
        the search samples from.
    x_train, y_train
        Training features and targets handed to ``fit``.
    n_iter : int, default 10
        Number of parameter settings sampled by the search.
    cv : int, default 5
        Number of cross-validation folds.
    random_state : int or None, default 42
        Seed for the candidate sampling. A fixed seed makes the selected
        candidates, and therefore the printed score, repeatable across runs;
        pass ``None`` to get a different sample on every call.

    Returns
    -------
    estimator
        The search's ``best_estimator_``.
    """
    search = RandomizedSearchCV(
        model,
        param_grid,
        n_iter=n_iter,
        cv=cv,
        scoring='accuracy',
        random_state=random_state,
    )
    search.fit(x_train, y_train)
    print('model {} best accuracy score is {}'.format(model.__class__.__name__, search.best_score_))
    return search.best_estimator_

In [13]:
# Candidate values for MultinomialNB's `alpha` (additive smoothing) parameter.
param_grid = dict(
    alpha=[0.001, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.5, 2, 5],
)
model = MultinomialNB()
best_model = randomized_cv(
    model=model,
    param_grid=param_grid,
    x_train=X_train,
    y_train=y_train,
)

model MultinomialNB best accuracy score is 0.7990400000000001


In [14]:
# Score the tuned estimator on the held-out test features and print the
# per-class precision / recall / F1 report.
y_pred = best_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.88      0.83     12500
           1       0.86      0.76      0.81     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000



### Модель 2: Doc2Vec з gensim

In [16]:
from nltk.tokenize import RegexpTokenizer

# A token is either a run of word characters or a run of characters that are
# neither word characters nor whitespace, so punctuation is kept as tokens.
tokenizer = RegexpTokenizer(r'\w+|[^\w\s]+')


def _lowercase_tokens(text):
    """Lower-case ``text`` and split it with the notebook-level ``tokenizer``."""
    return tokenizer.tokenize(text.lower())


# Add the same `tokenized` column to both splits.
for frame in (train, test):
    frame['tokenized'] = frame['text'].apply(_lowercase_tokens)

In [17]:
# Preview the token lists produced for the first few training reviews.
train['tokenized'].head()

0    [i, rented, i, am, curious, -, yellow, from, m...
1    [", i, am, curious, :, yellow, ", is, a, risib...
2    [if, only, to, avoid, making, this, type, of, ...
3    [this, film, was, probably, inspired, by, goda...
4    [oh, ,, brother, ..., after, hearing, about, t...
Name: tokenized, dtype: object

In [18]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# One TaggedDocument per training review, tagged with the review's index label
# rendered as a string. Iterating the `tokenized` column directly yields the
# same (index, tokens) pairs as DataFrame.iterrows() without building a full
# Series for every row.
tagged_data = [
    TaggedDocument(words=tokens, tags=[str(index)])
    for index, tokens in train['tokenized'].items()
]

In [20]:
# Train Doc2Vec on the tagged training reviews: 50-dimensional vectors, a
# context window of 2, 20 epochs, and min_count=1 so rare tokens are kept.
# NOTE(review): this rebinds `model`, which previously held the MultinomialNB
# estimator — confirm nothing later expects the old object.
# NOTE(review): gensim trains with several worker threads by default, so the
# learned vectors (and the scores below) may vary between runs — confirm
# whether workers=1 is wanted for reproducibility.
model = Doc2Vec(tagged_data, vector_size=50, window=2, min_count=1, epochs=20)

In [21]:
# Infer a document vector for every training review from its token list.
# NOTE(review): gensim documents infer_vector as possibly returning different
# vectors on repeated calls, so re-running this cell may change the features
# — confirm.
train['doc_vector'] = train['tokenized'].apply(lambda x: model.infer_vector(x))

In [22]:
# Infer a document vector for every test review from its token list, using
# the model trained on the training reviews only.
test['doc_vector'] = test['tokenized'].apply(lambda x: model.infer_vector(x))

In [23]:
# Features are the inferred document vectors; targets are the dataset labels.
X_train, X_test = list(train['doc_vector']), list(test['doc_vector'])
y_train, y_test = train['label'], test['label']

# fit() returns the estimator itself, so construction and training can chain.
clf = LogisticRegression().fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report overall accuracy first, then the per-class breakdown.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))

Accuracy: 76.70%
              precision    recall  f1-score   support

           0       0.74      0.82      0.78     12500
           1       0.80      0.71      0.75     12500

    accuracy                           0.77     25000
   macro avg       0.77      0.77      0.77     25000
weighted avg       0.77      0.77      0.77     25000

