In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from bs4 import BeautifulSoup
import re

import nltk
nltk.download('stopwords')
from gensim.models import Word2Vec
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from copy import deepcopy
import pandas as pd

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm.auto import tqdm
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import os
from pathlib import Path

from collections import Counter
from typing import List
import string

import seaborn as sns
sns.set(palette='summer')

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [5]:
!unzip aclImdb2.zip

Archive:  aclImdb2.zip
replace __MACOSX/._aclImdb — копия 2? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [6]:
data_dir = Path('/content/aclImdb — копия 2')

In [7]:
def read_imdb(data_dir, is_train):
    data, labels = [], []
    for label in ('pos', 'neg'):
        folder_name = os.path.join(data_dir, 'train' if is_train else 'test', label)
        for file in os.listdir(folder_name):
            with open(os.path.join(folder_name, file), 'rb') as f:
                name = os.path.basename(file)
                new_label = name[-6:-4]
                review = f.read().decode('utf-8').replace('\n', '')
                data.append(review)
                if new_label.isnumeric():
                    labels.append(int(new_label))
                else:
                    labels.append(int(new_label[-1]))
    return data, labels

train_data = read_imdb(data_dir, is_train=True)
print('trainings:', len(train_data[0]))
for x, y in zip(train_data[0][:3], train_data[1][:3]):
    print('label:', f'{y},', 'text:', x[:])

trainings: 25000
label: 7, text: This review owes its existence entirely to a review. We take a weekly TV magazine to see what is coming up, and duly decide what we will watch. Obligingly, there are brief reviews of most of the films scheduled to be shown on the five major terrestrial channels. In addition to the prose, each film is allocated a 1-5 star rating. 5 means Don't Miss (superior to 4 for Excellent!), down to 1 standing for Poor. We have learned from vast experience that, with few exceptions, stars are awarded for gross taste, foul language, offensive content, promiscuity, horror, blood & guts, and especially killing off the hero/heroine just when everyone was about to live happily ever after. (If that isn't done, the movie is denigrated as being 'predictable' - the worst insult imaginable!)<br /><br />Brave New Girl was given only 1 star, thereby suggesting it was a candidate worthy of our time and attention. This was confirmed by the reviewer's description of the movie as b

In [8]:
test_data = read_imdb(data_dir, is_train=False)
print('len:', len(test_data[0]))
for x, y in zip(test_data[0][:3], test_data[1][:3]):
    print('label:', f'{y},', 'text:', x[:])

len: 25000
label: 10, text: This amusing, sometimes poignant look at the Hollywood detective genre of the 1940's and 1950's stars Robert Sacci as an unnamed former cop who retires, uses his life savings to pay for plastic surgery to transform his image into that of his idol, Humphrey Bogart, then sets up shop as a private eye under the name "Sam Marlowe". Robert Sacchi, incidentally, is one of the rare few Bogart impersonators who got the lisp exactly right; more to the point, the body and facial language are there. For awhile, "Sam"'s only client is his landlady, who wants him to find her undersized boyfriend, and his only conversational foil is his secretary, simply called "Dutchess" (Misty Rowe), who in his own words, "looked like Marilyn Monroe and made about as much sense as Gracie Allen", and has a passion for banana splits. Then he encounters Elsa (Olivia Hussey), the plain, sweet, virginal daughter of a retired props-master who has been murdered for no discernible reason. In th

In [9]:
def read_unsup(data_dir):
    data = []
    folder_name = os.path.join(data_dir, 'train', 'unsup')
    for file in os.listdir(folder_name):
        with open(os.path.join(folder_name, file), 'rb') as f:
            review = f.read().decode('utf-8').replace('\n', '')
            data.append(review)
    return data

In [10]:
unsup_data = read_unsup(data_dir)
print('length:', len(unsup_data))
for x in unsup_data[:3]:
    print('text:', x[:])

length: 50000
text: Maybe other people appreciate this, but if you are a fan of Zucker Abram Zucker type films (airplane etc.) or Ben Stiller movies, you won't find much to laugh about in this film. I didn't find anything save for (spoiler) the finger cutting off scene at the end, which is a twist on the way movies usually do things. Even that was not laugh out loud funny, more funny in the sense of "We are in on QT's joke about the film industry". Big Deal.<br /><br />This movie is worth it's 5.9 rating, i.e. it is below average. Don't bother. It must have gotten that rating for the elements of drama that it had, it seems that great comedies such as Austin Powers series etc. can't push more than mid 7s without having an element of "social significance" or drama as Dr Strangelove might have.
text: this isn't crouching tiger, the medevil matrix, or anything else it is being touted as. i guess not being a idiot frat boy i dont have the appreciation for this film that many others do. if y

In [11]:
def text_cleaning(data):
    clean_text = []
    for text in data:
        text = BeautifulSoup(text, "lxml").get_text()   #remove html tags
        text = re.sub("[^a-zA-Z]", " ", text)
        text = text.lower().split()
        stop_word_list = set(stopwords.words("english"))
        text = [word for word in text if not word in stop_word_list]
        clean_text.append((" ".join(text)))
    return clean_text

In [12]:
clean_text_train = text_cleaning(train_data[0])
train_label = train_data[1]

  text = BeautifulSoup(text, "lxml").get_text()   #remove html tags


In [13]:
clean_text_unsup = text_cleaning(unsup_data)
nolabel = [-1 for _ in range(len(clean_text_unsup))]

  text = BeautifulSoup(text, "lxml").get_text()   #remove html tags


In [14]:
clean_text_test = text_cleaning(test_data[0])
test_label = test_data[1]

  text = BeautifulSoup(text, "lxml").get_text()   #remove html tags


In [15]:
clean_text_train[0]

'review owes existence entirely review take weekly tv magazine see coming duly decide watch obligingly brief reviews films scheduled shown five major terrestrial channels addition prose film allocated star rating means miss superior excellent standing poor learned vast experience exceptions stars awarded gross taste foul language offensive content promiscuity horror blood guts especially killing hero heroine everyone live happily ever done movie denigrated predictable worst insult imaginable brave new girl given star thereby suggesting candidate worthy time attention confirmed reviewer description movie truly awful tale stupid stupid watched wife glad tv magazine reviewer stated movie reworking war peace agree reading imdb reviews title day two later urge pick pen speak add halfpennyworth pronounced harf pen uth emphasis first syllable uninitiated became overwhelming take movie well matter taste like attractive characters believable relationships interesting situations courtesy respect

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, max_features = 5000)
train_data_features = (vectorizer.fit_transform(clean_text_train)).toarray()

test_data_features = (vectorizer.transform(clean_text_test)).toarray()

print(train_data_features.shape)
train_data_features

(25000, 5000)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
vectorizer_unsup = TfidfVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, max_features = 5000)
unsup_data_features = (vectorizer_unsup.fit_transform(clean_text_unsup)).toarray()

In [18]:
train_mixed = np.concatenate((train_data_features, unsup_data_features[:25000]))
label_train_mixed = np.concatenate((train_label, nolabel[:25000]))

In [19]:
!pip install catboost



In [20]:
import catboost

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.semi_supervised import LabelPropagation
model = LabelPropagation()


model.fit(train_mixed, label_train_mixed)
tran_labels = model.transduction_

In [None]:
boosting_model = catboost.CatBoostClassifier(iterations=1600, learning_rate=0.03, depth=2, l2_leaf_reg=1, loss_function='MultiClass')

boosting_model.fit(train_mixed, tran_labels)

preds = boosting_model.predict(clean_text_test)

In [None]:
score = accuracy_score(test_label, preds)
print('Точность измерений: %.3f' % (score * 100))

In [None]:
test_auc = roc_auc_score(test_label, preds)
print(f'test auc: {test_auc}')

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
print(classification_report(test_label, preds, zero_division=0))

              precision    recall  f1-score   support

           1       0.47      0.83      0.60      5022
           2       0.25      0.00      0.01      2302
           3       0.25      0.02      0.03      2541
           4       0.32      0.26      0.29      2635
           7       0.34      0.17      0.23      2307
           8       0.27      0.16      0.20      2850
           9       0.67      0.00      0.00      2344
          10       0.39      0.86      0.54      4999

    accuracy                           0.40     25000
   macro avg       0.37      0.29      0.24     25000
weighted avg       0.38      0.40      0.31     25000

