In [1]:
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
def fetch_reviews(path):
  """Read every file in a directory and return the contents as a list of strings.

  Args:
    path: Directory that holds the review files. A trailing path separator
      is optional.

  Returns:
    A list with the full text of each regular file found directly inside
    `path`, in the order `os.listdir` reports them.

  Raises:
    OSError: If `path` cannot be listed or a file cannot be opened.
    UnicodeDecodeError: If a file is not valid UTF-8.
  """
  data = []
  for file_name in os.listdir(path):
    # os.path.join instead of `path + file`, so the directory can be passed
    # with or without a trailing separator.
    file_path = os.path.join(path, file_name)
    # Sub-directories cannot be opened as text files; skip them.
    if not os.path.isfile(file_path):
      continue
    with open(file_path, "r", encoding='utf8') as f:
      data.append(f.read())
  return data

In [3]:
# Load the four review folders; positive reviews get label 1, negative reviews label 0.
df_train_pos = pd.DataFrame({'review': fetch_reviews('aclImdb/train/pos/'), 'label': 1})
df_train_neg = pd.DataFrame({'review': fetch_reviews('aclImdb/train/neg/'), 'label': 0})

df_test_pos = pd.DataFrame({'review': fetch_reviews('aclImdb/test/pos/'), 'label': 1})
df_test_neg = pd.DataFrame({'review': fetch_reviews('aclImdb/test/neg/'), 'label': 0})

# All four parts are pooled into one frame with a fresh 0..n-1 index.
df = pd.concat([df_train_pos, df_train_neg, df_test_pos, df_test_neg], ignore_index=True)

# NLTK resources used by the tokenizer, the stop-word filter and the lemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# A set gives constant-time membership checks; the stop-word test runs once per
# token, so a list would be scanned linearly for every token in the corpus.
stop_words = set(stopwords.words('english'))
# Keep 'not' in the text: it is removed from the stop-word collection here so
# that negations survive preprocessing.
stop_words.discard('not')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def data_preprocessing(review):
  """Clean one raw review and return it as a space-separated string of lemmas.

  Steps, in order: replace HTML-like tags with a space, collapse every run of
  non-alphanumeric characters into a single space, lowercase, tokenize with
  NLTK, drop tokens found in `stop_words`, lemmatize the rest with
  `lemmatizer`, and join the result with single spaces.

  Args:
    review: Raw review text (str).

  Returns:
    The cleaned review as a single string.
  """
  # Substitute a space rather than an empty string: removing a tag outright
  # would glue the neighbouring words together ("good<br />movie" -> "goodmovie").
  review = re.sub(r'<.*?>', ' ', review)
  review = re.sub('[^A-Za-z0-9]+', ' ', review)
  review = review.lower()
  tokens = nltk.word_tokenize(review)
  # Filter stop words and lemmatize in a single pass over the tokens.
  lemmas = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
  return ' '.join(lemmas)

In [5]:
# Clean every review; the function is passed directly, no wrapping lambda needed.
df['preprocessed_review'] = df['review'].apply(data_preprocessing)

# Separate the target from the features without mutating a frame in place.
y = df['label'].values
data = df.drop(columns=['label'])

# A fixed random_state makes the stratified 70/30 split, and therefore every
# downstream result, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.3, stratify=y, random_state=42
)

# Use a single vectorizer: fit it on the training reviews only and reuse the
# same fitted instance for the test reviews.
vectorizer = TfidfVectorizer(min_df=10)
X_train_review_tfidf = vectorizer.fit_transform(X_train['preprocessed_review'])
X_test_review_tfidf = vectorizer.transform(X_test['preprocessed_review'])

In [6]:
# L2-regularised logistic regression trained on the TF-IDF features of the
# training reviews.
clf = LogisticRegression(penalty='l2')
clf.fit(X=X_train_review_tfidf, y=y_train)

In [7]:
# New comment to classify
new_review = ["This is the terrible movie"]

# Run the same cleaning that was applied to the training reviews; without it
# the text reaching the vectorizer is not lemmatized or stripped like the
# training data was.
new_review_preprocessed = [data_preprocessing(text) for text in new_review]

# Convert to numeric features with the same fitted vectorizer
new_review_vectorized = vectorizer.transform(new_review_preprocessed)

# Prediction
prediction = clf.predict(new_review_vectorized)

# Print the result as a text label (1 = positive, otherwise negative)
if prediction[0] == 1:
    print("Positive comment")
else:
    print("Negative comment")

Negative comment


In [8]:
import pickle

In [9]:
# Экспортируем модель и векторайзер
# Persist the trained classifier and the fitted vectorizer, one pickle file each.
artifacts = {'model.pkl': clf, 'vectorizer.pkl': vectorizer}
for filename, artifact in artifacts.items():
    with open(filename, 'wb') as out_file:
        pickle.dump(artifact, out_file)

print("Модель и векторайзер успешно сохранены!")

Модель и векторайзер успешно сохранены!
