In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.dummy import DummyClassifier
from imblearn.metrics import classification_report_imbalanced

In [5]:
#Load datasets
def load_data():
    X_train_ = pd.read_csv('train_data.csv', header=None)
    y_labels = pd.read_csv('train_labels.csv', header=None)
    X_test_ = pd.read_csv('test_data.csv', header=None)
    return[X_train_, y_labels, X_test_]
X, y, X_test_data = load_data()

In [6]:
#Function with MinMaxScaler for feature scaling
def scale_data(df1, df2):
    
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(df1)
    X_test_scaled = scaler.fit_transform(df2)

    return [X_scaled, X_test_scaled]

In [7]:
#Run scaling features function
X_scaled, X_test_scaled = scale_data(X, X_test_data)

In [8]:
#Function with PCA
def pca_data(df1, df2):
    
    print('Dimension of training data before PCA:', X_scaled.shape)

    pca = PCA(n_components=0.95, random_state=42)
    X_pca = pca.fit_transform(df1)
    x_test_to_save = pca.fit_transform(df2)

    print('Dimension of training data after PCA:', X_pca.shape)
    
    return [X_pca, x_test_to_save]

In [9]:
#Run PCA and check dimension of data
X_pca, x_test_to_save = pca_data(X_scaled, X_test_scaled)

Dimension of training data before PCA: (3750, 10000)
Dimension of training data after PCA: (3750, 3070)


In [10]:
#Function for oversampling training data
def data_sampling(df1, df2):

    print('Dimension of training data before sampling:', X_pca.shape)
    print('Dimension of training labels before sampling:', y.shape)
    
    over_sampler = RandomOverSampler(sampling_strategy=0.2)

    x_to_save, y_to_save = over_sampler.fit_resample(df1, df2)

    print('Dimension of training data after sampling:', x_to_save.shape)
    print('Dimension of training labels after sampling:', X_pca.shape)
    
    return [x_to_save, y_to_save]

In [11]:
#Run oversampling and check dimension of data
x_to_save, y_to_save = data_sampling(X_pca, y)

Dimension of training data before sampling: (3750, 3070)
Dimension of training labels before sampling: (3750, 1)
Dimension of training data after sampling: (4050, 3070)
Dimension of training labels after sampling: (3750, 3070)


In [12]:
#Function for saving numpy arrays
def save_data(x_train, x_test, y_train):

    np.save('model_x_train', x_train)
    np.save('model_x_test', x_test)
    np.save('model_y_train', y_train)

In [13]:
#Save preprocessed data
save_data(x_to_save, x_test_to_save, y_to_save)

In [14]:
#Split training data
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.33, random_state=42)

In [15]:
#Fit-predict with dummy classifier and print results with classification report
dummy_clf = DummyClassifier(strategy="stratified")
dummy_clf.fit(X_train, y_train)
DummyClassifier(strategy='stratified')
y_pred = dummy_clf.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

         -1       0.11      0.11      0.90      0.11      0.32      0.09       123
          1       0.90      0.90      0.11      0.90      0.32      0.11      1115

avg / total       0.82      0.82      0.19      0.82      0.32      0.11      1238



<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=ae6d556f-9829-4201-b0d0-4c143a3266cf' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>