# Необходимые библиотеки

In [0]:
import numpy as np
import pandas as pd
from google.colab import files
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from scipy.stats import kurtosis
from scipy.stats import skew
from scipy.stats import shapiro
from scipy.stats import normaltest
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score,auc, f1_score, confusion_matrix,precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
%matplotlib inline


## Вспомогательные функции

In [0]:
## Просмотр данных
def _describe_by_dtype(data, dtypes):
  """Return ``data.describe(include=dtypes)``, or a notice string if it fails.

  ``DataFrame.describe`` raises ``ValueError`` when the frame has no column of
  the requested dtype; the overview should keep going in that case.
  """
  try:
    return data.describe(include=dtypes)
  except ValueError:
    return 'Нет признаков данного типа'


def prosmotr(data):
  """Print an exploratory overview of a pandas DataFrame.

  The report contains, in order: the first rows, the shape, the column names,
  ``DataFrame.info()``, the per-column count of missing values, the number of
  columns per dtype, the lists of ``object`` / ``float64`` / ``int64`` columns
  and ``describe()`` summaries for ``int64``, ``float64``, ``object`` and
  ``bool`` columns.

  Side effect: the pandas display options ``display.max_columns``,
  ``display.max_rows`` and ``display.precision`` are changed globally.

  Args:
    data: the DataFrame to inspect.

  Returns:
    None. Everything is written to standard output.
  """
  sep = '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'

  # Table size limits for printed frames.
  pd.set_option('display.max_columns', 100)
  pd.set_option('display.max_rows', 100)
  # Number of digits after the decimal point. The full option key is used
  # because the short pattern 'precision' can match more than one option.
  pd.set_option('display.precision', 2)

  print('~~~~Содержание данных~~~~\n', data.head())
  print(sep)
  print('~~~Размеры данных~~~\n', data.shape)
  print(sep)
  print('~~~Названия колонок~~~\n', data.columns)
  print(sep)
  print('~~~Информация о данных~~~\n')
  # info() writes its report itself and returns None, so it is not wrapped
  # in print() (that would emit an extra "None" line).
  data.info()
  print(sep)
  print('~~~Наличие пропусков в данных~~~\n', data.isna().sum())
  print(sep)
  print('~~~Количество типов в данных~~~')
  print(data.dtypes.value_counts())
  print(sep)

  # Names of the categorical (object) columns.
  kateg = list(data.select_dtypes(include=['object']).columns)
  print('~~~Категориальные данные~~~~')
  print(kateg)
  print(sep)
  # Names of the float64 columns.
  chislov_float = list(data.select_dtypes(include=['float64']))
  print('~~~Числове данные float~~~~')
  print(chislov_float)
  print(sep)
  # Names of the int64 columns.
  chislov_int = list(data.select_dtypes(include=['int64']))
  print('~~~Числове данные int~~~~')
  print(chislov_int)
  print(sep)

  print('~~~Основные статистические характеристики данных по каждому числовому признаку (типы int64)~~~\n', _describe_by_dtype(data, ['int64']))
  print(sep)
  print(sep)
  print('~~~Основные статистические характеристики данных по каждому числовому признаку (типы float64)~~~\n', _describe_by_dtype(data, ['float64']))
  print(sep)
  print('~~~Cтатистика по нечисловым признакам object ~~~\n', _describe_by_dtype(data, ['object']))
  print(sep)
  print(sep)
  print('~~~Cтатистика по нечисловым признакам bool ~~~\n', _describe_by_dtype(data, ['bool']))
  print(sep)

In [0]:
## Анализ данных
def analyze(data):
  """Print distribution statistics and normality tests for every column.

  For each column of ``data`` the function prints the title-cased column
  label, then the mean, variance, skewness, kurtosis and the results of the
  Shapiro-Wilk test (``scipy.stats.shapiro``) and ``scipy.stats.normaltest``.

  Args:
    data: a DataFrame; every column is passed to the NumPy / SciPy routines
      as-is, so the columns are expected to be numeric.

  Returns:
    None. Everything is written to standard output.
  """
  for column in data.columns:
    values = data[column]
    # Column labels are not guaranteed to be strings (a frame built from a
    # NumPy array has integer labels), so convert before title-casing.
    print(str(column).title())
    print('~~~~~~~~~~~~~~~~~~~~~~~~~\n')
    print("mean : ", np.mean(values))
    print("var  : ", np.var(values))
    print("skew : ", skew(values))
    print("kurt : ", kurtosis(values))
    print("shapiro : ", shapiro(values))
    print("normaltest : ", normaltest(values))
    print('~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('~~~~~~~~~~~~~~~~~~~~~~~~~\n')

In [0]:
## метрика

from sklearn.metrics import accuracy_score,auc, f1_score, confusion_matrix,precision_score, recall_score, roc_auc_score, roc_curve

def value_of_metrics(y_true, y_pred, y_score=None):
    """Print classification metrics and plot the ROC curve.

    Accuracy, recall, precision, F1 and the confusion matrix are computed
    from the hard predictions ``y_pred``. ROC AUC and the ROC curve are
    computed from ``y_score`` when it is given, otherwise from ``y_pred``
    (the original behaviour).

    Args:
        y_true: ground-truth labels.
        y_pred: predicted class labels.
        y_score: optional continuous scores for the positive class, e.g.
            ``model.predict_proba(X)[:, 1]`` or ``model.decision_function(X)``.
            With hard labels the ROC curve has only one interior threshold,
            so pass scores when the model can provide them.

    Returns:
        None. Metrics are printed and the plot is shown with ``plt.show()``.
    """
    # Fall back to the hard labels so existing two-argument calls keep working.
    ranking = y_pred if y_score is None else y_score

    print('Accuracy: ', accuracy_score(y_true, y_pred))
    print('Recall: ', recall_score(y_true, y_pred))
    print('Precision: ', precision_score(y_true, y_pred))
    print('F1: ', f1_score(y_true, y_pred))
    print('Roc_AUC: ', roc_auc_score(y_true, ranking))
    print('Confusion Matrix: ')
    print(pd.DataFrame(confusion_matrix(y_true, y_pred)))

    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(y_true, ranking)
    roc_auc = auc(fpr, tpr)
    plt.title('My DataSet')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    # Diagonal reference line: a classifier with no discriminative power.
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

In [0]:
# Split a DataFrame into the feature matrix X and the target column y
def unpack_X_and_y(data, col):
  """Split ``data`` into features and target.

  Args:
    data: source DataFrame.
    col: label of the target column.

  Returns:
    A tuple ``(X, y)``: ``data`` without the column ``col``, and the
    column ``col`` itself.
  """
  target = data[col]
  features = data.drop(columns=[col])
  return features, target

In [0]:
# Rank the features by chi-squared score and return the names of the best ones
def best_features(X, y, k=10):
  """Return the names of the ``k`` features with the highest chi-squared score.

  Args:
    X: feature DataFrame; ``X.columns`` supplies the returned names. The
      chi-squared scorer requires non-negative feature values.
    y: target labels.
    k: how many feature names to return (default 10, as before). If ``X`` has
      fewer than ``k`` columns, all of them are returned.

  Returns:
    A list of column labels ordered from the highest score to the lowest.
  """
  # Score every feature with k='all': the selector is used only for its
  # scores_, and a fixed k=10 is invalid when X has fewer than 10 columns.
  selector = SelectKBest(score_func=chi2, k='all')
  selector.fit(X, y)
  scores = pd.Series(selector.scores_, index=X.columns)
  return list(scores.nlargest(min(k, len(scores))).index)

## По алгоритмам

In [0]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV


def fit_model(model, X, y, parameters):
  """Grid-search ``model`` over ``parameters`` and report the best result.

  The search uses 5-fold ``StratifiedKFold`` cross-validation and accuracy
  as the selection metric.

  Args:
    model: the estimator to tune.
    X: training features.
    y: training labels.
    parameters: parameter grid passed to ``GridSearchCV`` as ``param_grid``.

  Returns:
    The fitted ``GridSearchCV`` object.
  """
  searcher = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5),
    verbose=1,
  )
  searcher.fit(X, y)

  # Report the winning configuration.
  best_score = searcher.best_score_
  best_params = searcher.best_params_
  print('Best score: {}'.format(best_score))
  print('Best parameters: {}'.format(best_params))

  return searcher

## KNN

In [0]:
from sklearn.neighbors import KNeighborsClassifier

def KNN(train, targets):
  """Tune a ``KNeighborsClassifier`` with ``fit_model``.

  The grid covers the number of neighbours, the distance metric and the
  neighbour-search algorithm.

  Args:
    train: training features.
    targets: training labels.

  Returns:
    Whatever ``fit_model`` returns for the k-NN estimator.
  """
  search_space = dict(
    n_neighbors=[2, 5, 10, 15, 20, 25],
    metric=['chebyshev', 'manhattan', 'euclidean', 'minkowski'],
    algorithm=['ball_tree', 'kd_tree', 'brute'],
  )
  return fit_model(KNeighborsClassifier(), train, targets, search_space)

## BAYES

In [0]:
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB, ComplementNB


def Bern(train, targets):
  """Tune a ``BernoulliNB`` classifier with ``fit_model``.

  Args:
    train: training features.
    targets: training labels.

  Returns:
    Whatever ``fit_model`` returns for the Bernoulli naive Bayes estimator.
  """
  parameter_grid = {
                 'alpha': [0.001, 0.01, 0.1, 0.2, 0.5, 1.0],
                 'binarize': [0.0, 0.2, 0.5],
                 # Real booleans: the strings 'True' and 'False' are both
                 # truthy, so fit_prior=False was never actually tried (and
                 # type-checking scikit-learn versions may reject strings).
                 'fit_prior': [True, False]
                 }
  bernoulli = BernoulliNB()
  trained_bernoulli = fit_model(bernoulli, train, targets, parameter_grid)
  return trained_bernoulli

def Gaus(train, targets):
  """Tune a ``GaussianNB`` classifier with ``fit_model``.

  Only ``var_smoothing`` is searched.

  Args:
    train: training features.
    targets: training labels.

  Returns:
    Whatever ``fit_model`` returns for the Gaussian naive Bayes estimator.
  """
  smoothing_values = [1e-09, 1e-10, 1e-11, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
  search_space = {'var_smoothing': smoothing_values}
  return fit_model(GaussianNB(), train, targets, search_space)

def Multi_nb(train, targets):
  """Tune a ``MultinomialNB`` classifier with ``fit_model``.

  Args:
    train: training features.
    targets: training labels.

  Returns:
    Whatever ``fit_model`` returns for the multinomial naive Bayes estimator.
  """
  multi_nb = MultinomialNB()
  parameter_grid = {
                 'alpha': [0.001, 0.01, 0.1, 0.2, 0.5, 1.0],
                 # Real booleans: the strings 'True' and 'False' are both
                 # truthy, so fit_prior=False was never actually tried (and
                 # type-checking scikit-learn versions may reject strings).
                 'fit_prior': [True, False]
                 }
  trained_multi_nb = fit_model(multi_nb, train, targets, parameter_grid)
  return trained_multi_nb

def Complement_nb(train, targets):
  """Tune a ``ComplementNB`` classifier with ``fit_model``.

  Args:
    train: training features.
    targets: training labels.

  Returns:
    Whatever ``fit_model`` returns for the complement naive Bayes estimator.
  """
  complement_nb = ComplementNB()
  parameter_grid = {
                 'alpha': [0.001, 0.01, 0.1, 0.2, 0.5, 1.0],
                 # Real booleans for both flags: the strings 'True' and
                 # 'False' are both truthy, so the False setting was never
                 # actually tried (and type-checking scikit-learn versions
                 # may reject strings).
                 'fit_prior': [True, False],
                 'norm': [True, False]
                 }
  trained_complement_nb = fit_model(complement_nb, train, targets, parameter_grid)
  return trained_complement_nb


## SVM

In [0]:
from sklearn import svm

def SVM(train, targets):
    """Tune a support-vector classifier (``svm.SVC``) with ``fit_model``.

    The grid covers the regularisation strength ``C``, the kernel, the
    shrinking heuristic, the decision-function shape and the iteration cap.

    Args:
        train: training features.
        targets: training labels.

    Returns:
        Whatever ``fit_model`` returns for the SVC estimator.
    """
    search_space = dict(
        C=[1.0, 1.5, 2.0, 5.0],
        kernel=['linear', 'poly', 'rbf', 'sigmoid'],
        shrinking=[True, False],
        decision_function_shape=["ovr", "ovo"],
        max_iter=[-1, 500, 1000],
    )
    return fit_model(svm.SVC(), train, targets, search_space)
 

## RANDOM FOREST

In [0]:
from sklearn.ensemble import RandomForestClassifier

def Random(X, y, scoring='recall', cv=5, random_state=None):
  """Tune a ``RandomForestClassifier`` with an exhaustive grid search.

  Unlike the other helpers in this file, this one builds its own
  ``GridSearchCV`` (all cores, ``n_jobs=-1``) instead of calling
  ``fit_model``, and selects by ``scoring`` (recall by default).

  Args:
    X: training features.
    y: training labels.
    scoring: metric passed to ``GridSearchCV``.
    cv: cross-validation setting passed to ``GridSearchCV``.
    random_state: optional seed for the forest. The default ``None`` keeps
      the original, non-reproducible behaviour.

  Returns:
    The fitted ``GridSearchCV`` object.
  """
  # NOTE(review): the original literal listed "min_samples_split" twice, so
  # the second entry silently replaced the first. The duplicate is removed.
  # It was possibly meant to be "min_samples_leaf" - confirm with the author
  # before widening the grid.
  parameter_grid = {
                  "n_estimators": [10, 20, 30, 50, 100],
                  "criterion": ["gini","entropy"],
                  "max_depth": [2, 3, 4, 5, None],
                  "min_samples_split": [2, 3, 4, 5]
                 }
  classifier = RandomForestClassifier(random_state=random_state)

  grid = GridSearchCV(estimator=classifier,
                         param_grid=parameter_grid,
                         scoring=scoring,
                         cv=cv,
                         n_jobs=-1)

  grid.fit(X, y)

  return grid

## Варианты по алгоритмам с применением поиска лучших параметров и метрик

In [0]:
## SVM: grid-search a support-vector classifier with the SVM() helper.
## X_train / y_train are not defined in this part of the notebook.
svm_train = SVM(X_train, y_train)

In [0]:
# Print the metrics and plot the ROC curve for the tuned SVM
# (X_test / y_test: presumably the held-out split, defined elsewhere).
value_of_metrics(y_test, svm_train.predict(X_test))

In [0]:
## KNN: grid-search a k-nearest-neighbours classifier with the KNN() helper.
knn_trained=KNN(X_train, y_train)

In [0]:
# Print the metrics and plot the ROC curve for the tuned k-NN model.
value_of_metrics(y_test, knn_trained.predict(X_test))

In [0]:
## Bernoulli naive Bayes: grid-search with the Bern() helper.
bern_train = Bern(X_train, y_train)

In [0]:
# Print the metrics and plot the ROC curve for the tuned Bernoulli NB model.
value_of_metrics(y_test,bern_train.predict(X_test))

In [0]:
## Gaussian naive Bayes: grid-search with the Gaus() helper.
gaus_train = Gaus(X_train, y_train)

In [0]:
# Print the metrics and plot the ROC curve for the tuned Gaussian NB model.
value_of_metrics(y_test,gaus_train.predict(X_test))

In [0]:
## Multinomial naive Bayes: grid-search with the Multi_nb() helper.
multu_train = Multi_nb(X_train, y_train)

In [0]:
# Print the metrics and plot the ROC curve for the tuned multinomial NB model.
value_of_metrics(y_test,multu_train.predict(X_test))

In [0]:
## Random forest: grid-search with the Random() helper, which selects by
## recall by default (the other helpers select by accuracy).
random_train = Random(X_train, y_train)

In [0]:
# Print the metrics and plot the ROC curve for the tuned random forest.
value_of_metrics(y_test,random_train.predict(X_test))