In [12]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
import keras
import tensorflow as tf
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, f1_score
from sklearn import metrics
import os
from datetime import datetime

In [None]:
class BuildingModel:
    """Quick auto-training of a fixed-architecture dense network for binary detection.

    Typical use is two calls: ``preprocess_data`` reads the CSV, keeps the features whose
    correlation with the target is strong enough and makes a stratified train/test split;
    ``train_and_save_models`` then trains one network per (batch size, epoch count)
    combination and saves every model whose F1 score on the test set exceeds ``f1_lim``.
    """

    # Features whose |correlation| with the target reaches this value are discarded;
    # this also removes the target column itself (self-correlation of 1.0).
    MAX_ABS_CORRELATION = 0.99
    # Sigmoid outputs above this value are labelled as the positive class.
    DECISION_THRESHOLD = 0.5
    # Hyper-parameter grid: batch size is 2 ** exponent, epoch count is 20 * multiplier.
    BATCH_EXPONENTS = range(2, 5)
    EPOCH_MULTIPLIERS = range(1, 40)
    EPOCHS_PER_STEP = 20

    def __init__(self, data, threshold, test_size, train_test_rand_seed, f1_lim,columns_to_drop,target_name,what_to_detect):
        """Store the configuration; nothing is read or trained here.

        Args:
            data: Path (or anything ``pd.read_csv`` accepts) of the input CSV.
            threshold: Minimum absolute correlation with the target for a feature to be kept.
            test_size: ``test_size`` forwarded to ``train_test_split``.
            train_test_rand_seed: ``random_state`` forwarded to ``train_test_split``.
            f1_lim: A trained model is saved only when its test F1 score is above this value.
            columns_to_drop: Column labels removed from the CSV before any analysis.
            target_name: Name of the target column.
            what_to_detect: Human-readable class name used in the confusion-matrix labels/title.
        """
        self.data = data
        self.threshold = threshold
        self.test_size = test_size
        self.train_test_rand_seed = train_test_rand_seed
        self.f1_lim = f1_lim
        self.columns_to_drop = columns_to_drop
        self.target_name = target_name
        self.what_to_detect = what_to_detect

    def _select_feature_columns(self, df):
        """Return the labels of columns whose |correlation| with the target is in (threshold, 0.99).

        Labels are returned in the column order of ``df``. Columns with an undefined (NaN)
        correlation are never selected because both comparisons are False for NaN.
        """
        target_corr = df.corr()[self.target_name]
        strength = target_corr.abs()
        keep = (strength > self.threshold) & (strength < self.MAX_ABS_CORRELATION)
        # Boolean-mask selection instead of integer positions: positional `series[i]`
        # on a label-indexed Series is deprecated in pandas.
        return target_corr[keep].index.tolist()

    def preprocess_data(self):
        """Load the CSV, select correlated features and split into train and test sets.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)`` as produced by
            ``train_test_split`` with stratification on the target column.
        """
        df = pd.read_csv(self.data)
        df = df.drop(self.columns_to_drop, axis=1)
        target = df[self.target_name]
        selected = self._select_feature_columns(df)
        features = df.loc[:, df.columns.intersection(selected)]
        X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=self.test_size, stratify=target,
                                                            random_state=self.train_test_rand_seed)
        return X_train, X_test, y_train, y_test

    def get_current_time(self):
        """Return the current local time as ``HH-MM-SS`` (used in output folder names)."""
        return datetime.now().strftime("%H-%M-%S")

    def _build_network(self, n_features):
        """Build and compile the fixed dense architecture for ``n_features`` inputs."""
        model_nn = Sequential()
        model_nn.add(Dense(300, activation='relu', input_dim=n_features))
        for units in (250, 100, 50, 125, 375, 400, 500):
            model_nn.add(Dense(units, activation='relu'))
        model_nn.add(Dense(1, activation='sigmoid'))
        model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model_nn

    def _save_roc_curve(self, y_test, scores, folder_path):
        """Save the ROC curve (computed from the predicted probabilities) as ROC_Curve.png."""
        # The curve needs continuous scores; thresholded 0/1 labels give a single point.
        fpr, tpr, _ = metrics.roc_curve(y_test, scores)
        auc = metrics.auc(fpr, tpr)
        fig, ax = plt.subplots()
        ax.plot(fpr, tpr, label='ROC curve (AUC = %0.2f)' % auc)
        ax.plot([0, 1], [0, 1], 'k--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        # x holds the false positive rate and y the true positive rate.
        ax.set_xlabel('False positive rate')
        ax.set_ylabel('True positive rate')
        ax.set_title('ROC curve')
        ax.legend(loc="lower right")
        fig.savefig(os.path.join(folder_path, 'ROC_Curve.png'))
        # Close explicitly: many figures are created inside the training loop.
        plt.close(fig)

    def _save_confusion_matrix(self, y_test, predicted, folder_path):
        """Save the confusion matrix of the thresholded predictions as Confusion_Matrix.png."""
        conf_matrix = confusion_matrix(y_test, predicted)
        cm_display = ConfusionMatrixDisplay(conf_matrix, display_labels=[f'NON-{self.what_to_detect}', f'{self.what_to_detect}'])
        fig, ax = plt.subplots()
        cm_display.plot(cmap='viridis', ax=ax)
        ax.set(title=f'Confusion Matrix for the {self.what_to_detect} Detection Model')
        for text in ax.texts:
            text.set_fontsize(30)
        fig.savefig(os.path.join(folder_path, 'Confusion_Matrix.png'))
        plt.close(fig)

    def train_and_save_models(self, X_train, X_test, y_train, y_test):
        """Train one network per grid point and save the ones that beat ``f1_lim``.

        For every batch size ``2 ** j`` (``j`` in ``BATCH_EXPONENTS``) and epoch count
        ``20 * i`` (``i`` in ``EPOCH_MULTIPLIERS``) a fresh network is trained and scored
        on the test set. When the F1 score is above ``self.f1_lim`` a new folder is created
        in the current working directory containing the model (``.h5``), the four data
        splits as CSV, the ROC curve and the confusion matrix.

        Raises:
            ValueError: If ``X_train`` has no feature columns (e.g. no column passed the
                correlation threshold), since no network can be built for zero inputs.
        """
        n_features = X_train.shape[1]
        if n_features == 0:
            raise ValueError(
                "No feature columns to train on; lower the correlation threshold "
                f"(currently {self.threshold})."
            )

        for exponent in self.BATCH_EXPONENTS:
            batch_size = 2 ** exponent
            for multiplier in self.EPOCH_MULTIPLIERS:
                epochs = multiplier * self.EPOCHS_PER_STEP
                # Release state left by previously built models so memory does not
                # grow across the many iterations of this loop.
                keras.backend.clear_session()
                model_nn = self._build_network(n_features)
                model_nn.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)

                # First (only) output column holds the sigmoid probability.
                scores = np.asarray(model_nn.predict(X_test))[:, 0]
                predicted = (scores > self.DECISION_THRESHOLD).astype(int)
                f1 = f1_score(y_test, predicted)
                if not f1 > self.f1_lim:
                    continue

                folder_name = f'model_time_{self.get_current_time()}__f1_{f1}'
                folder_path = os.path.join(os.getcwd(), folder_name)
                # exist_ok: two models can share the same timestamp and F1 score.
                os.makedirs(folder_path, exist_ok=True)
                model_nn.save(os.path.join(folder_path,
                                           f'model_ep_{epochs}_bt_{batch_size}_f1_{f1}_threshold_{self.threshold}_'
                                           f'columns_{n_features}_random_state_{self.train_test_rand_seed}.h5'))
                print("model saved")
                X_train.to_csv(os.path.join(folder_path, 'X_train.csv'))
                X_test.to_csv(os.path.join(folder_path, 'X_test.csv'))
                y_train.to_csv(os.path.join(folder_path, 'y_train.csv'))
                y_test.to_csv(os.path.join(folder_path, 'y_test.csv'))
                self._save_roc_curve(y_test, scores, folder_path)
                self._save_confusion_matrix(y_test, predicted, folder_path)

# Configure the auto-training run: features need |correlation| > 0.3 with 'Target',
# 30% of the rows are held out for testing, and any model with F1 above 0.01 is saved.
model = BuildingModel(
    data="dataframe.csv",
    threshold=0.3,
    test_size=0.3,
    train_test_rand_seed=2,
    f1_lim=0.01,
    columns_to_drop=['Column0', 'Column1'],
    target_name='Target',
    what_to_detect='Target',
)

# Load and split the data, then train and save the models.
X_train, X_test, y_train, y_test = model.preprocess_data()
model.train_and_save_models(X_train, X_test, y_train, y_test)

In [9]:
import pandas as pd
import numpy as np
# Size of the synthetic dataset and the seed that makes it reproducible.
N_ROWS = 1000
N_FEATURES = 3495
SEED = 0
rng = np.random.default_rng(SEED)

# setting column names: Column0 ... Column3494
column_names = [f'Column{i}' for i in range(N_FEATURES)]
# generating binary target values (0 or 1), one per row
target = rng.integers(0, 2, size=(N_ROWS, 1))
# generating feature data: random integers 0..99 scaled down by 1000
data = rng.integers(0, 100, size=(N_ROWS, N_FEATURES)) / 1000
# building the dataframe
df = pd.DataFrame(data, columns=column_names)
df['Target'] = target
# saving in .csv format; index=False keeps the row index from being written
# as an extra unnamed column that would later be read back as a feature
df.to_csv('dataframe.csv', index=False)