# Домашнее задание №2

### Подготовка виртуального окружения Python

In [37]:
import sys
import subprocess
# Install the notebook's dependencies with the same interpreter that runs this
# kernel; check_call raises CalledProcessError if pip exits with a non-zero code.
pip_command = [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]
subprocess.check_call(pip_command)


0

### Генерация тренировочного датасета

In [38]:
import time
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.tokenize import RegexpTokenizer

In [39]:
class ExtractMethods:
    """String identifiers of the supported feature-extraction methods."""

    # TF-IDF weighted representations
    TF_IDF = 'tf-idf'
    TF_IDF_NGRAM = 'tf-idf_ngram'

    # Raw count ("bag of ...") representations
    BAG_OF_WORDS = 'bag_of_words'
    BAG_OF_CHAR = 'bag_of_characters'


class FeatureExtractor:
  def __init__(self, method, *args, **kwargs):
    self.args = args
    self.kwargs = kwargs
    self.notes = {}
    self.vectorizer = self._get_vectorizer(method)

  def _get_vectorizer(self, method):
    token = RegexpTokenizer(r'[a-zA-Z0-9]+')
    match method:
      case ExtractMethods.TF_IDF:
          return TfidfVectorizer(tokenizer=token.tokenize, lowercase=True, **self.kwargs)
      case ExtractMethods.TF_IDF_NGRAM:
          return TfidfVectorizer(lowercase=True, analyzer='char', ngram_range=(1, 3), **self.kwargs)
      case ExtractMethods.BAG_OF_WORDS:
          return CountVectorizer(analyzer='word', tokenizer=token.tokenize, lowercase=True, **self.kwargs)
      case ExtractMethods.BAG_OF_CHAR:
          return CountVectorizer(analyzer='char', lowercase=True, **self.kwargs)
      case _:
          raise ValueError(f"Unknown feature extraction method: {method}")

  def fit_extract(self, x_train):
      return self.vectorizer.fit_transform(x_train)

  def extract_features(self, x):
      return self.vectorizer.transform(x)    

In [40]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD

In [41]:
class FeatureSelecter:
    """Scale features, keep those chosen by a Lasso model, then reduce with SVD.

    The three steps are fitted in ``fit_transform`` and replayed unchanged in
    ``transform``.
    """

    def __init__(self, n_components, random_state=10):
        """Configure the scaler, the Lasso-based selector and the SVD reducer.

        Args:
            n_components: Requested number of SVD output components. It is
                capped at the number of columns that survive Lasso selection.
            random_state: Seed shared by the Lasso model and the SVD so that
                repeated runs produce the same projection.
        """
        self._n_components = n_components
        # with_mean=False scales without centering, which keeps sparse input sparse.
        self._scaler = StandardScaler(with_mean=False)
        self._lasso_selecter = SelectFromModel(
            Lasso(alpha=0.001, random_state=random_state)
        )
        self._pca = TruncatedSVD(n_components=n_components, random_state=random_state)

    def _to_dense(self, X):
        """Return ``X.toarray()`` when ``X`` offers it, otherwise ``X`` itself."""
        return X.toarray() if hasattr(X, 'toarray') else X

    def fit_transform(self, x_train, y_train):
        """Fit all three steps on the training data and return the reduced matrix.

        Raises:
            ValueError: If the Lasso selector keeps no feature at all.
        """
        scaled = self._to_dense(self._scaler.fit(x_train).transform(x_train))
        self._lasso_selecter.fit(scaled, y_train)
        selected = self._lasso_selecter.transform(scaled)

        n_selected = selected.shape[1]
        if n_selected == 0:
            raise ValueError(
                "Lasso feature selection kept no features; "
                "nothing is left to decompose."
            )
        # Callers size n_components from the feature count *before* selection;
        # the SVD cannot produce more components than it receives columns.
        self._pca.set_params(n_components=min(self._n_components, n_selected))

        # fit + transform (not fit_transform) so the training projection is
        # computed exactly the way transform() computes it for new data.
        self._pca.fit(selected)
        return self._pca.transform(selected)

    def transform(self, x):
        """Apply the fitted scaler, selector and SVD to new data."""
        scaled = self._to_dense(self._scaler.transform(x))
        selected = self._lasso_selecter.transform(scaled)
        return self._pca.transform(selected)

In [None]:
import xgboost as xgb
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid, RadiusNeighborsClassifier
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
class Models:
    """String identifiers of the classifiers that ``ModelFabric`` can build."""

    # Gradient boosting
    XGBOOST = 'XGBoost'

    # Support vector machines
    SVC = 'SVC'
    NU_SVC = 'NuSVC'
    LINEAR_SVC = 'LinearSVC'

    # Nearest neighbours
    KNEIGHBORS = 'KNeighbors'

    # Single tree and tree ensembles
    DECISION_TREE = 'DecisionTree'
    RANDOM_FOREST = 'RandomForest'
    EXTRA_TREES = 'ExtraTrees'

    # Generic ensemble wrappers
    ADABOOST = 'AdaBoost'
    BAGGING = 'Bagging'


class ModelFabric:
    @staticmethod
    def create_model(model_name, *args, **kwargs):
        match model_name:
            case Models.XGBOOST:
                return xgb.XGBClassifier(*args, **kwargs)
            case Models.SVC:
                return SVC(gamma=2, C=1, *args, **kwargs)
            case Models.NU_SVC:
                return NuSVC(*args, **kwargs)
            case Models.KNEIGHBORS:
                return KNeighborsClassifier(3, *args, **kwargs)
            case Models.DECISION_TREE:
                return DecisionTreeClassifier(max_depth=5, *args, **kwargs)
            case Models.RANDOM_FOREST:
                return RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, *args, **kwargs)
            case Models.ADABOOST:
                return AdaBoostClassifier( *args, **kwargs)
            case Models.BAGGING:
                return BaggingClassifier(*args, **kwargs)
            case Models.EXTRA_TREES:
                return ExtraTreesClassifier(*args, **kwargs)
            case Models.LINEAR_SVC:
                return LinearSVC(*args, **kwargs)
            case _:
                raise ValueError(f"Unsupported model name: {model_name}")

In [44]:
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.metrics import accuracy_score, confusion_matrix

def plot_confusion_matrix(y_expect, y_pred):
    """Draw the confusion matrix of ``y_pred`` against ``y_expect`` as a heatmap.

    Both axes are labelled 'Normal' / 'SQL-Injection'; the y axis is titled
    'Actual' and the x axis 'Prediction'. The figure is shown immediately.

    Args:
        y_expect: True labels.
        y_pred: Predicted labels.
    """
    class_names = ['Normal', 'SQL-Injection']
    matrix = confusion_matrix(y_expect, y_pred)

    sns.heatmap(
        matrix,
        annot=True,  # write the count inside every cell
        fmt='g',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel('Prediction', fontsize=13)
    plt.ylabel('Actual', fontsize=13)
    plt.title('Confusion Matrix', fontsize=17)
    plt.show()

In [45]:
# Assignment parameters. They are defined before the dataset is loaded because
# TRAIN_FILE is needed by the read_csv call below.
STUDENT_NAME = "<Введите_Фамилию_Латиницей>"  # e.g. "Ivanov"
TRAIN_FILE = f"{STUDENT_NAME}_dataset.tsv"
VALIDATE_FILE = "validate.tsv"
SUBMISSION_FILE = f"submission_{STUDENT_NAME}.csv"

# Fail early if the training dataset is missing; the error message tells how to
# generate it with create_task_dataset.py.
from pathlib import Path
if not Path(TRAIN_FILE).exists():
    raise FileNotFoundError(
        f"Файл {TRAIN_FILE} не найден. Сначала выполните: python3 create_task_dataset.py --student_name {STUDENT_NAME} --file malicious.tsv"
    )

# Load the tab-separated dataset and hold out 30% of it as a test set,
# stratified by label so both parts keep the same class proportions.
source_df = pd.read_csv(TRAIN_FILE, sep='\t', engine='python')
df_train, df_test = train_test_split(source_df, test_size=0.3, stratify=source_df['label'], random_state=42)
x_train, y_train = df_train['payload'].values, df_train['label'].values
x_test, y_test = df_test['payload'].values, df_test['label'].values


In [None]:
# Experiments: compare feature-extraction methods and models
from sklearn.model_selection import cross_val_score, StratifiedKFold
import numpy as np

# FeatureExtractor presets lowercase=True for every method by itself, so it is
# not repeated here; the dict is reserved for genuinely extra vectorizer options.
extract_setups = [
    (ExtractMethods.TF_IDF, {}),
    (ExtractMethods.TF_IDF_NGRAM, {}),
    (ExtractMethods.BAG_OF_WORDS, {}),
]

model_setups = [
    Models.LINEAR_SVC,
    Models.RANDOM_FOREST,
    Models.XGBOOST,
]

results = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for extract_method, kwargs in extract_setups:
    fe = FeatureExtractor(extract_method, **kwargs)
    Xtr = fe.fit_extract(x_train)
    # At most 100 SVD components, always fewer than the number of features
    # (but never fewer than one).
    n_features = Xtr.shape[1]
    selector = FeatureSelecter(n_components=max(1, min(100, n_features - 1)))
    # NOTE(review): the vectorizer and the label-supervised selector are fitted
    # on all of x_train before cross-validation, so every validation fold has
    # already influenced the features. The CV scores below are therefore
    # optimistic; treat them as a ranking aid and rely on the hold-out test set.
    Xtr_sel = selector.fit_transform(Xtr, y_train)

    for model_name in model_setups:
        model = ModelFabric.create_model(model_name)
        scores = cross_val_score(model, Xtr_sel, y_train, cv=skf, scoring="accuracy", n_jobs=None)
        results.append({
            "extract": extract_method,
            "model": model_name,
            "cv_mean": float(np.mean(scores)),
            "cv_std": float(np.std(scores)),
        })

# Best mean accuracy first; ties are broken by the smaller spread.
pd.DataFrame(results).sort_values(["cv_mean", "cv_std"], ascending=[False, True])


In [None]:
# Train the best extractor/model pair on the train split and evaluate it on the
# test split, including a confusion matrix.
# The best pair can be picked manually from the table above. Default: TF-IDF + LinearSVC.
best_extract_method = ExtractMethods.TF_IDF
# FeatureExtractor presets lowercase=True by itself, so no extra options are needed.
best_extract_kwargs = {}
best_model_name = Models.LINEAR_SVC

fe_best = FeatureExtractor(best_extract_method, **best_extract_kwargs)
Xtr_full = fe_best.fit_extract(x_train)
# At most 100 SVD components, always fewer than the number of features
# (but never fewer than one).
selector_best = FeatureSelecter(n_components=max(1, min(100, Xtr_full.shape[1] - 1)))
Xtr_sel_full = selector_best.fit_transform(Xtr_full, y_train)

model_best = ModelFabric.create_model(best_model_name)
model_best.fit(Xtr_sel_full, y_train)

# Evaluation on the hold-out test split: only transform, never refit.
Xte = fe_best.extract_features(x_test)
Xte_sel = selector_best.transform(Xte)
y_pred = model_best.predict(Xte_sel)

acc = accuracy_score(y_test, y_pred)
print(f"Test accuracy: {acc:.4f}")
plot_confusion_matrix(y_test, y_pred)


In [None]:
# Retrain on the whole training dataset (source_df) and predict labels for validate.tsv
all_payloads = source_df['payload'].values
all_labels = source_df['label'].values

fe_final = FeatureExtractor(best_extract_method, **best_extract_kwargs)
X_all = fe_final.fit_extract(all_payloads)
n_all_features = X_all.shape[1]
if n_all_features > 1:
    final_n_components = min(100, n_all_features - 1)
else:
    final_n_components = 1
selector_final = FeatureSelecter(n_components=final_n_components)
X_all_sel = selector_final.fit_transform(X_all, all_labels)

final_model = ModelFabric.create_model(best_model_name)
final_model.fit(X_all_sel, all_labels)

# Prediction: the validation payloads go through the already fitted steps.
validate_df = pd.read_csv(VALIDATE_FILE, sep='\t', engine='python')
X_val = fe_final.extract_features(validate_df['payload'].values)
X_val_sel = selector_final.transform(X_val)
val_pred = final_model.predict(X_val_sel)

# Save the submission file with the columns ID and TARGET.
submission = pd.DataFrame({'ID': validate_df['ID'], 'TARGET': val_pred})
submission.to_csv(SUBMISSION_FILE, index=False)
print(f"Saved submission to: {SUBMISSION_FILE}")


In [None]:
# Basic data exploration
display(source_df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
source_df.info()
print(source_df.describe(include='all'))
print("Пропуски по столбцам:\n", source_df.isna().sum())
