In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm

import time

import nltk
nltk.download('stopwords')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import wordpunct_tokenize
from nltk.stem import PorterStemmer

from dataclasses import dataclass

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import log_loss, accuracy_score, top_k_accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the train and test tables from the ./data directory (train first, then test).
train_data, test_data = map(pd.read_csv, ("./data/train.csv", "./data/test.csv"))

In [None]:
# Report the (rows, columns) shape of both tables, one per line.
print(
    f"Train data shape: {train_data.shape}",
    f"Test data shape: {test_data.shape}",
    sep="\n",
)

Train data shape: (68370, 3)
Test data shape: (33506, 2)


In [None]:
# Show the training rows whose "lib" value is missing.
print(train_data.loc[train_data["lib"].isna()])

                                                title  lib  id
68369  How to convert some character into five digit   NaN NaN


In [None]:
# Discard every training row that has a missing value in any column.
train_data = train_data.dropna(axis=0, how="any")

In [None]:
# libs, counts = np.unique(train_data["lib"].tolist(), return_counts=True)
# plt.figure(figsize=(16, 10))
# plt.bar(libs, counts)
# Here we look at the histogram of label counts

In [None]:
# 0. Data preprocessing

In [None]:
# Pull the title / label columns out of the frames as standalone NumPy arrays.
data_x = train_data["title"].to_numpy(copy=True)
data_y = train_data["lib"].to_numpy(copy=True)
data_test_x = test_data["title"].to_numpy(copy=True)

In [None]:
# Sorted distinct label values, and a lookup from each label to its position in that array.
labels = np.unique(data_y)
n_classes = len(labels)
label_to_idx = dict(zip(labels, range(n_classes)))
print("Labels:")
print(labels, n_classes, '\n')

Labels:
['collections' 'csv' 'datetime' 'django' 'flask' 'functools' 'itertools'
 'json' 'math' 'matplotlib' 'numpy' 'os' 'pandas' 'random' 're' 'requests'
 'scipy' 'selenium' 'sklearn' 'subprocess' 'sys' 'tensorflow' 'time'
 'urllib'] 24 



In [None]:
def preprocess(titles_x, labels_y):
    """Normalise titles and encode their labels as integer class indices.

    Each title is lower-cased, split with ``wordpunct_tokenize``, every token is
    Porter-stemmed, tokens found in the ignore set (NLTK English stop words plus a
    few punctuation marks) are dropped, the rest are joined with single spaces and
    any remaining "&" is replaced by "and". Each label is mapped through the
    module-level ``label_to_idx`` dictionary.

    Args:
        titles_x: iterable of title strings.
        labels_y: iterable of labels, paired positionally with ``titles_x``.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: normalised titles and label indices.

    Raises:
        KeyError: if a label is not a key of ``label_to_idx``.

    NOTE(review): the ignore-set check runs on *stemmed* tokens, so a stop word whose
    stem differs from the word itself is probably not filtered out (e.g. Porter
    stemming is expected to turn "this" into "thi" - confirm). If the order is ever
    changed, change ``preprocess_features`` identically so train and test features
    stay aligned.
    """
    porter = PorterStemmer()
    ignored = set(stopwords.words('english')) | set(["'", "\"", ",", ".", "?", "!"])

    def normalise(raw_title):
        # Stem first, then filter - same order as the test-time preprocessing.
        stems = [porter.stem(piece) for piece in wordpunct_tokenize(raw_title.lower())]
        return " ".join(stem for stem in stems if stem not in ignored).replace("&", "and")

    texts, targets = [], []
    for raw_title, lib_name in zip(titles_x, labels_y):
        texts.append(normalise(raw_title))
        targets.append(label_to_idx[lib_name])
    return np.array(texts), np.array(targets)

In [None]:
# Replace the raw titles/labels with their normalised / index-encoded counterparts.
# NOTE(review): this rebinds data_x and data_y, so re-running this cell without first
# re-running the column-extraction cell feeds already-encoded labels back into preprocess.
data_x, data_y = preprocess(data_x, data_y)

  0%|          | 0/68369 [00:00<?, ?it/s]

In [None]:
# Peek at the first five (normalised title, class index) pairs.
print([(text, target) for text, target in zip(data_x[:5], data_y[:5])])

[('way sort string alphabet order capit letter first', 5), ('maintain histori soft delet row tabl', 3), ('wave string analysi', 10), ('regular express find word part larger phrase', 14), ('textblob - loop articl calcul polar and subject score', 12)]


In [None]:
# For this task punctuation does not matter, and neither does emotional tone (which can be expressed through "?" and "!")

In [None]:
# Hold out a validation set (default split sizes). The split is random, so random_state is
# pinned: without it every kernel restart yields a different split and the tuned
# hyper-parameters and validation metrics below are not reproducible.
# NOTE(review): consider stratify=data_y if the label distribution is skewed - it requires
# every class to have at least two samples, which is not verifiable from this cell.
X_train, X_val, y_train, y_val = train_test_split(data_x, data_y, random_state=42)
print("X_train shape: ", X_train.shape, "y_train shape: ", y_train.shape)

X_train shape:  (51276,) y_train shape:  (51276,)


In [None]:
# 1. Baseline
# Let it be a simple logistic regression on *tf-idf* features

In [None]:
# Baseline model: token counts -> tf-idf weighting -> logistic regression.
pipeline = Pipeline(
    steps=[
        ('vectorize', CountVectorizer()),
        ('transform', TfidfTransformer()),
        ('logregression', LogisticRegression(max_iter=1000)),
    ]
)

# Candidate values for the C parameter of the 'logregression' step.
param_grid = {
    "logregression__C": [
        0.01,
        0.025,
        0.05,
        0.1,
        1,
        10,
    ],
}

# Cross-validated search over the grid; verbose=3 logs the score and time of every fit.
search_cv = GridSearchCV(estimator=pipeline, param_grid=param_grid, verbose=3)
search_cv.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END .............logregression__C=0.01;, score=0.369 total time=   8.5s
[CV 2/5] END .............logregression__C=0.01;, score=0.368 total time=   7.9s
[CV 3/5] END .............logregression__C=0.01;, score=0.365 total time=   5.8s
[CV 4/5] END .............logregression__C=0.01;, score=0.362 total time=  11.6s
[CV 5/5] END .............logregression__C=0.01;, score=0.364 total time=   8.8s
[CV 1/5] END ............logregression__C=0.025;, score=0.477 total time=   8.1s
[CV 2/5] END ............logregression__C=0.025;, score=0.475 total time=   9.3s
[CV 3/5] END ............logregression__C=0.025;, score=0.473 total time=  12.8s
[CV 4/5] END ............logregression__C=0.025;, score=0.470 total time=   9.1s
[CV 5/5] END ............logregression__C=0.025;, score=0.474 total time=   8.2s
[CV 1/5] END .............logregression__C=0.05;, score=0.528 total time=  12.0s
[CV 2/5] END .............logregression__C=0.05;,

In [None]:
# Take the refit best pipeline and score the validation titles with it:
# per-class probabilities first, then the hard class predictions.
best_pipeline = search_cv.best_estimator_
y_pred_probas = best_pipeline.predict_proba(X_val)
y_pred = best_pipeline.predict(X_val)
print("First predictions: ", y_pred[:5])

First predictions:  [17 21 10  9 12]


In [None]:
# Show which grid point the cross-validated search selected.
print("Best hyperparams: ", search_cv.best_params_)

Best hyperparams:  {'logregression__C': 1}


In [None]:
# Validation metrics for the best pipeline.
# The columns of y_pred_probas follow best_pipeline.classes_. By default log_loss and
# top_k_accuracy_score infer the class set from y_val instead, which fails (or misaligns
# columns) whenever the random split leaves a class out of y_val. Passing the classifier's
# own class ordering makes both metrics robust; results are unchanged when every class is
# present in y_val.
print(f"Log loss: {log_loss(y_val, y_pred_probas, labels=best_pipeline.classes_)}")
print(f"Top-1 accuracy: {accuracy_score(y_val, y_pred)}")
print(f"Top-3 accuracy: {top_k_accuracy_score(y_val, y_pred_probas, k=3, labels=best_pipeline.classes_)}")

Log loss: 1.4350003708571442
Top-1 accuracy: 0.6050430000585035
Top-3 accuracy: 0.8165330837184812


In [None]:
# Time to predict on the test set

In [None]:
def preprocess_features(titles_x):
    """Normalise titles only (no labels), for data that has no targets.

    Each title is lower-cased, split with ``wordpunct_tokenize``, every token is
    Porter-stemmed, tokens found in the ignore set (NLTK English stop words plus a
    few punctuation marks) are dropped, the rest are joined with single spaces and
    any remaining "&" is replaced by "and".

    Args:
        titles_x: iterable of title strings.

    Returns:
        NumPy array of normalised titles, in input order.

    NOTE(review): this repeats the title normalisation done in ``preprocess``; the two
    must stay in sync, otherwise train and test features diverge. The ignore-set check
    runs on *stemmed* tokens, so stop words whose stem differs from the word itself are
    probably not removed - confirm, and change both functions together if so.
    """
    porter = PorterStemmer()
    ignored = set(stopwords.words('english')) | set(["'", "\"", ",", ".", "?", "!"])

    def normalise(raw_title):
        # Stem first, then filter - same order as the training-time preprocessing.
        stems = [porter.stem(piece) for piece in wordpunct_tokenize(raw_title.lower())]
        return " ".join(stem for stem in stems if stem not in ignored).replace("&", "and")

    return np.array([normalise(raw_title) for raw_title in titles_x])

In [None]:
# Normalise the test titles with the label-free preprocessing function.
# NOTE(review): this rebinds data_test_x, so re-running the cell preprocesses
# already-normalised text a second time.
data_test_x = preprocess_features(data_test_x)

In [None]:
# Predict a class index for every normalised test title with the tuned pipeline.
test_predictions = best_pipeline.predict(data_test_x)

In [None]:
# Map predicted class indices back to label names by indexing the labels array directly.
test_data["lib"] = labels[test_predictions]

In [None]:
# Write the predictions to disk. index=False keeps pandas' auto-generated row index out of
# the file; by default to_csv would emit it as an extra unnamed leading column.
# NOTE(review): confirm the exact column set the submission format expects.
test_data.to_csv("./data/submission.csv", index=False)

In [None]:
# I also tried using a pretrained RoBERTa-base from Hugging Face as a feature extractor, but,
# apparently because the architecture of my model was not good enough, it gave worse quality