In [None]:
!pip install dill

In [None]:
import pandas as pd
import numpy as np
import dill
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.metrics import f1_score
#working with text
from sklearn.feature_extraction.text import TfidfVectorizer
#normalizing data
from sklearn.preprocessing import StandardScaler
#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.metrics import precision_score,recall_score
#imputer
from sklearn.impute import SimpleImputer
import sklearn.datasets

In [None]:
# Download the training data from Google Drive and store it as train.csv in
# the current working directory (-O sets the output file name).
!wget -O train.csv "https://drive.google.com/uc?id=1oPtTtVbkSEdiNwjcSHeEkv_C9g0yfLVO&export=download"

In [None]:
# Load the training data, replace every missing value with a single space and
# shuffle the rows. The shuffle is seeded: without a fixed random_state the row
# order changes on every run, so the (seeded) train/test split below would
# still select different rows each time.
df = pd.read_csv('train.csv', encoding='utf-8').fillna(' ').sample(frac=1, random_state=42)
df.head(3)

In [None]:
# Names of the label columns; they are combined into a single `y` column in
# the next cell.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [None]:
# Add a combined column `y`: the row-wise maximum over the columns listed in
# class_names, then show how often each value occurs.
# NOTE(review): `y` is not referenced by the later cells shown in this
# notebook; the split below uses the `toxic` column as the target.
df['y'] = df.loc[:, class_names].max(axis='columns').values
df['y'].value_counts()

In [None]:
# Colab-only: mount Google Drive at /content/drive so that later cells can
# read and write files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Hold out 33% of the rows. The whole frame is passed as X (the pipeline picks
# the column it needs via ColumnSelector) and the `toxic` column is the target.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df['toxic'],
    test_size=0.33,
    random_state=42,
)

# Persist the training part (row index is not written) ...
X_train.to_csv("/content/drive/MyDrive/X_train.csv", index=None)
y_train.to_csv("/content/drive/MyDrive/y_train.csv", index=None)
# ... and the held-out part, which the evaluation cells reload later.
X_test.to_csv("/content/drive/MyDrive/X_test.csv", index=None)
y_test.to_csv("/content/drive/MyDrive/y_test.csv", index=None)

In [None]:
# Column names available as model inputs, and the name of the target column.
# NOTE: `features` is rebound to a FeatureUnion in the pipeline cell below, so
# this list is no longer reachable under that name after that cell has run.
features = [
    'id',
    'comment_text',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
target = 'toxic'

In [None]:
class TextImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in one column of a DataFrame with a default text.

    Parameters
    ----------
    col : hashable
        Label of the column to impute.
    default_text : str, default=''
        Value written wherever the column holds a missing value.
    """

    def __init__(self, col, default_text=''):
        self.col = col
        self.default_text = default_text

    def fit(self, X, y=None):
        """Do nothing (the transformer is stateless) and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values in ``self.col`` filled.

        The input is copied before the column is assigned, so the caller's
        DataFrame is left untouched. Assigning into ``X`` directly would
        silently modify the frame passed to ``fit``/``predict`` and can raise
        SettingWithCopyWarning when that frame is a slice of another one.
        """
        X = X.copy()
        X[self.col] = X[self.col].fillna(self.default_text)
        return X

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that returns ``X[key]`` for the configured key."""

    def __init__(self, key):
        # Stored under the same name as the constructor argument.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Index ``X`` with ``self.key`` and return the result."""
        selected = X[self.key]
        return selected

In [None]:
# Text branch: fill missing comments, pick the text column, vectorise it with
# TF-IDF.
text_pipeline = Pipeline([
    ('imputer', TextImputer(col='comment_text', default_text='')),
    ('selector', ColumnSelector(key='comment_text')),
    ('tfidf', TfidfVectorizer(min_df=10, max_df=0.9)),
])

# Union of all feature branches (currently only the text branch).
# NOTE: this rebinds `features`, which an earlier cell defines as a list of
# column names.
features = FeatureUnion([
    ('text_pipeline', text_pipeline),
])

# Full model: feature extraction followed by a logistic-regression classifier,
# fitted on the training split.
pipeline = Pipeline([
    ('features', features),
    ('classifier', LogisticRegression()),
])
pipeline.fit(X_train, y_train)

In [None]:
# Display the (name, estimator) pairs that make up the pipeline.
pipeline.steps

In [None]:
# Serialise the fitted pipeline to a file on Google Drive using dill.
with open('/content/drive/MyDrive/logreg_pipeline.dill', mode='wb') as out_strm:
    dill.dump(pipeline, out_strm)

# Проверка работоспособности и качества пайплайна

In [None]:
# NOTE(review): this writes to a private dill table, mapping the type name
# 'ClassType' to `type`. Presumably a workaround for loading pickles that
# reference the Python 2 old-style class type -- confirm it is still needed
# with the installed dill version.
dill._dill._reverse_typemap['ClassType'] = type

In [None]:
# Reload the held-out split from the CSV files written to Drive by the split
# cell. This rebinds X_test and y_test; both are now DataFrames as returned by
# pd.read_csv.
X_test = pd.read_csv("/content/drive/MyDrive/X_test.csv")
y_test = pd.read_csv("/content/drive/MyDrive/y_test.csv")

In [None]:
# Load the pipeline back from the file written by the save cell.
# dill.load, like pickle.load, can execute arbitrary code contained in the
# file, so only load files that come from a trusted source.
with open('/content/drive/MyDrive/logreg_pipeline.dill', 'rb') as in_strm:
    pipeline = dill.load(in_strm)

In [None]:
# Keep the second column of predict_proba (the score for the class listed
# second in the classifier's classes_ -- presumably the toxic class; confirm)
# and write the scores to test_predictions.csv in the working directory.
predictions = pipeline.predict_proba(X_test)[:, 1]
pd.DataFrame({'preds': predictions}).to_csv("test_predictions.csv", index=None)

In [None]:
# ROC AUC of the predicted scores against the reloaded held-out labels.
roc_auc_score(y_test, predictions)