In [None]:
# Install the third-party packages this notebook needs into the current runtime.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
# NOTE(review): `tensorflow-gpu` is believed to be a retired PyPI package (GPU support
# ships with `tensorflow` itself) — confirm that line still installs cleanly.
!pip install catboost
!pip install --upgrade tensorflow
!pip install keras-tuner
!pip install tensorflow-gpu
!pip install tensorflow_addons

In [None]:
# Print the NVIDIA driver/CUDA versions and the state of the GPU attached to this runtime.
!nvidia-smi

Fri Dec 22 17:44:28 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.utils.metaestimators import if_delegate_has_method
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from scipy.stats import randint as sp_randint
from scipy.stats import uniform
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.base import BaseEstimator, ClassifierMixin
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from sklearn.decomposition import PCA
from tensorflow.keras.callbacks import EarlyStopping
#from tensorflow.keras.layers import Attention
from kerastuner.tuners import RandomSearch
from kerastuner import Objective
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.mixed_precision import set_global_policy
import tensorflow_addons as tfa
from tensorflow.keras.metrics import AUC, Precision, Metric
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Dropout, Flatten, BatchNormalization, Activation, GlobalAveragePooling1D
from tensorflow.keras.regularizers import l2

In [None]:
def load_data(filepath):
    """Read a CSV source into a pandas DataFrame.

    Args:
        filepath: Anything ``pd.read_csv`` accepts (local path, URL or
            file-like object).

    Returns:
        pandas.DataFrame: The parsed table, using ``read_csv`` defaults.
    """
    frame = pd.read_csv(filepath)
    return frame

In [None]:
def preprocess_data(df, is_training_data=True):
    """Return a cleaned copy of ``df`` ready for feature extraction.

    ``Installation_zone`` is replaced by the integer formed from the digits at
    the end of each value. For training data the ``Consumer_number`` column is
    dropped as well.

    The input frame is never modified: the original implementation mutated it
    in place, so re-running the calling cell failed (``.str`` on an already
    converted integer column, and dropping an already removed column).

    Args:
        df: Frame containing a string ``Installation_zone`` column and, when
            ``is_training_data`` is true, a ``Consumer_number`` column.
        is_training_data: When true, ``Consumer_number`` is removed from the
            result.

    Returns:
        pandas.DataFrame: A new frame; ``df`` itself is left untouched.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If a zone value does not end in digits (the integer cast
            fails on the resulting missing value).
    """
    processed = df.copy()
    # expand=False yields a Series directly instead of a one-column DataFrame.
    zone_digits = processed['Installation_zone'].str.extract(r'(\d+)$', expand=False)
    processed['Installation_zone'] = zone_digits.astype(int)
    if is_training_data:
        processed = processed.drop('Consumer_number', axis=1)
    return processed

In [None]:
def identify_outliers(df, group_col, target_col):
    """Flag rows whose ``target_col`` lies outside its group's 1.5*IQR fences.

    For every group of ``group_col`` the 25th and 75th percentiles of
    ``target_col`` are computed; a row is an outlier when its value is strictly
    below ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR`` of its own
    group.

    The fences are broadcast back to the rows with a group-wise transform, so
    the check is a single vectorised comparison rather than a Python-level
    ``apply`` over every row. This also removes the local variable that
    shadowed the builtin ``type``.

    Args:
        df: Frame to annotate. It is modified in place: a boolean ``Outlier``
            column is added (or overwritten).
        group_col: Label of the single column that defines the groups.
        target_col: Label of the numeric column to test.

    Returns:
        pandas.DataFrame: The same ``df`` object, now with the ``Outlier``
        column.
    """
    grouped = df.groupby(group_col)[target_col]
    # Per-row copies of each row's own group quartiles.
    q1 = grouped.transform('quantile', q=0.25)
    q3 = grouped.transform('quantile', q=0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    values = df[target_col]
    df['Outlier'] = (values < lower_bound) | (values > upper_bound)
    return df

In [None]:
def balance_classes(X, y):
    """Oversample the dataset with SMOTE and return the resampled pair.

    The sampler uses a fixed seed (42), the ``'auto'`` sampling strategy and a
    single nearest neighbour when synthesising points.

    Args:
        X: Feature matrix accepted by ``SMOTE.fit_resample``.
        y: Class labels aligned with ``X``.

    Returns:
        tuple: ``(X_resampled, y_resampled)`` as produced by ``fit_resample``.
    """
    # NOTE(review): recent imbalanced-learn releases appear to deprecate SMOTE's
    # `n_jobs` argument — confirm against the installed version.
    oversampler = SMOTE(
        random_state=42,
        sampling_strategy='auto',
        n_jobs=-1,
        k_neighbors=1,
    )
    X_resampled, y_resampled = oversampler.fit_resample(X, y)
    return X_resampled, y_resampled

In [None]:
def split_data(X, y, test_size=0.2, val_size=0.1, random_state=42):
    """Split the data into stratified train / validation / test partitions.

    The test set is carved out first; the validation set is then taken from
    what remains. ``val_size`` is therefore a fraction of the non-test data,
    not of the full dataset (with the defaults: 72% train, 8% validation,
    20% test).

    Args:
        X: Feature matrix.
        y: Labels aligned with ``X``; used for stratification in both splits.
        test_size: Fraction of the full data held out for testing.
        val_size: Fraction of the remaining data held out for validation.
        random_state: Seed passed to both splits so they are reproducible.

    Returns:
        tuple: ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=val_size, random_state=random_state, stratify=y_rest
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
def scale_data(X_train, X_test):
    """Standardise features using statistics learned from the training set only.

    A ``StandardScaler`` is fitted on ``X_train`` and then applied to both
    inputs. The fitted scaler is returned so the same transformation can be
    reused on other data (for example a validation or competition set).

    Args:
        X_train: Training feature matrix used to fit the scaler.
        X_test: Feature matrix transformed with the training statistics.

    Returns:
        tuple: ``(X_train_scaled, X_test_scaled, scaler)``.
    """
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, scaler

In [None]:
def train_model(model, X_train, y_train):
    """Fit ``model`` on the given data and hand the same object back.

    Args:
        model: Any object exposing ``fit(X, y)``.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The ``model`` instance that was passed in (whatever ``fit`` returns is
        ignored).
    """
    estimator = model
    estimator.fit(X_train, y_train)
    return estimator

In [None]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Args:
        model: Fitted object exposing ``predict(X)``.
        X_test: Features to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        tuple: ``(accuracy, report)`` where ``accuracy`` comes from
        ``accuracy_score`` and ``report`` from ``classification_report``.
    """
    predicted = model.predict(X_test)
    return (
        accuracy_score(y_test, predicted),
        classification_report(y_test, predicted),
    )

In [None]:
def f1_score_macro(y_true, y_pred):
    """Macro-averaged F1 over one batch, usable as a Keras metric.

    Accepts both label encodings:

    * sparse integer labels of shape ``(batch,)`` or ``(batch, 1)`` — what the
      model in this notebook receives, since it is compiled with
      ``sparse_categorical_crossentropy``;
    * one-hot labels of shape ``(batch, n_classes)``.

    The original implementation always took ``argmax`` of ``y_true``. For
    sparse labels that collapses every true label to class 0, so the reported
    F1 was meaningless. The number of classes is now read from ``y_pred``
    instead of relying on a notebook-level ``num_classes`` global (which is
    only used as a fallback when the static shape is unknown).

    Note that Keras averages this per-batch value over batches, which is not
    identical to the macro F1 of the whole dataset.

    Args:
        y_true: Ground-truth labels (sparse or one-hot, see above).
        y_pred: Per-class scores of shape ``(batch, n_classes)``.

    Returns:
        Scalar float tensor: mean of the per-class F1 scores for the batch.
    """
    y_true = tf.convert_to_tensor(y_true)
    y_pred = tf.convert_to_tensor(y_pred)

    n_classes = y_pred.shape[-1]
    if n_classes is None:
        # Static shape unavailable: fall back to the notebook-level constant.
        n_classes = num_classes

    predicted_labels = tf.reshape(tf.argmax(y_pred, axis=-1), [-1])

    true_rank = y_true.shape.rank
    looks_one_hot = (
        true_rank is not None
        and true_rank > 1
        and n_classes > 1
        and y_true.shape[-1] == n_classes
    )
    if looks_one_hot:
        true_labels = tf.argmax(y_true, axis=-1)
    else:
        # Sparse labels may arrive as floats of shape (batch, 1); flatten and cast.
        true_labels = tf.cast(tf.reshape(y_true, [-1]), tf.int64)
    true_labels = tf.reshape(true_labels, [-1])

    # One indicator column per class lets all per-class counts be taken at once.
    true_indicator = tf.one_hot(true_labels, n_classes, dtype=tf.float32)
    pred_indicator = tf.one_hot(predicted_labels, n_classes, dtype=tf.float32)

    true_positives = tf.reduce_sum(true_indicator * pred_indicator, axis=0)
    predicted_positives = tf.reduce_sum(pred_indicator, axis=0)
    actual_positives = tf.reduce_sum(true_indicator, axis=0)

    eps = K.epsilon()  # avoids division by zero for classes absent from the batch
    precision = true_positives / (predicted_positives + eps)
    recall = true_positives / (actual_positives + eps)
    per_class_f1 = 2 * (precision * recall) / (precision + recall + eps)

    return tf.reduce_mean(per_class_f1)


In [None]:
class PipelineWithFitParams(Pipeline):
    """Pipeline variant with hand-written ``fit``/``predict`` that forward per-step fit kwargs.

    NOTE(review): this relies on ``self._check_fit_params`` and
    ``if_delegate_has_method``, which look like private / legacy scikit-learn
    helpers — confirm they exist in the installed version.
    """

    def fit(self, X, y=None, **fit_params):
        """Fit every step in order and return ``self``.

        Args:
            X: Training data passed to the first step.
            y: Optional targets passed to every step's ``fit``.
            **fit_params: Extra keyword arguments; presumably in the
                ``<step>__<param>`` form understood by ``_check_fit_params``
                — verify against scikit-learn.
        """
        # Presumably groups the kwargs into one dict per step name — TODO confirm.
        fit_params_steps = self._check_fit_params(**fit_params)
        # Intermediate steps: fit, then transform X for the next step.
        for name, estimator in self.steps[:-1]:
            if estimator is not None:
                if fit_params_steps[name]:
                    estimator.fit(X, y, **fit_params_steps[name])
                else:
                    estimator.fit(X, y)
            # Steps without a ``transform`` method (including None) leave X unchanged.
            if hasattr(estimator, 'transform'):
                X = estimator.transform(X)
        # Final step: fit only, forwarding its own kwargs when any were given.
        if fit_params_steps[self.steps[-1][0]]:
            self.steps[-1][1].fit(X, y, **fit_params_steps[self.steps[-1][0]])
        else:
            self.steps[-1][1].fit(X, y)
        return self

    @if_delegate_has_method(delegate='_final_estimator')
    def predict(self, X, **predict_params):
        """Transform ``X`` through every non-None intermediate step, then predict with the last step.

        Args:
            X: Data to predict on.
            **predict_params: Forwarded unchanged to the final step's ``predict``.

        Returns:
            Whatever the final step's ``predict`` returns.
        """
        for name, transform in self.steps[:-1]:
            if transform is not None:
                X = transform.transform(X)
        return self.steps[-1][1].predict(X, **predict_params)

In [None]:
class Attention(Layer):
    """Learned weighted sum over axis 1 of the input.

    A score is computed for every position along axis 1, squashed with
    ``tanh``, normalised with a softmax over that axis, and used to weight the
    input before summing axis 1 away.

    NOTE(review): the weight shapes use ``input_shape[1]`` and
    ``input_shape[-1]``, so the input is presumably ``(batch, steps,
    features)`` — confirm with the caller.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the projection vector ``W`` and the per-position bias ``b``."""
        feature_dim = input_shape[-1]
        step_count = input_shape[1]
        self.W = self.add_weight(
            name='attention_weight',
            shape=(feature_dim, 1),
            initializer='random_normal',
            trainable=True,
        )
        self.b = self.add_weight(
            name='attention_bias',
            shape=(step_count, 1),
            initializer='zeros',
            trainable=True,
        )
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` along axis 1."""
        scores = K.tanh(K.dot(x, self.W) + self.b)
        weights = K.softmax(scores, axis=1)
        weighted = x * weights
        return K.sum(weighted, axis=1)

    def compute_output_shape(self, input_shape):
        """Output keeps the first and last dimensions of the input."""
        batch_dim, feature_dim = input_shape[0], input_shape[-1]
        return (batch_dim, feature_dim)

In [None]:
class KerasRNNClassifier(BaseEstimator, ClassifierMixin):
    """scikit-learn style wrapper around a Conv1D + LSTM Keras classifier.

    The Keras model is built and compiled in the constructor and exposed as
    ``self.model``.

    Args:
        input_shape: Shape of one sample as expected by the first Conv1D layer
            (the notebook passes ``(n_features, 1)``).
        num_classes: Size of the softmax output layer.
        epochs: Upper bound on training epochs; early stopping normally ends
            training sooner.
        batch_size: Mini-batch size used during training.
    """

    def __init__(self, input_shape, num_classes, epochs=10000, batch_size=4098):
        # Every constructor argument is stored under its own name: BaseEstimator's
        # get_params() (used by repr() and sklearn.base.clone) reads them back via
        # getattr and raises AttributeError when one is missing.
        self.input_shape = input_shape
        self.num_classes = num_classes
        self.epochs = epochs
        self.batch_size = batch_size

        self.model = Sequential()

        self.model.add(Conv1D(filters=64, kernel_size=2, padding='same', activation='relu', input_shape=input_shape))
        self.model.add(MaxPooling1D(pool_size=1))

        self.model.add(LSTM(512, return_sequences=True))

        self.model.add(GlobalAveragePooling1D())
        self.model.add(Dropout(0.7))
        self.model.add(Flatten())
        self.model.add(Dense(256, activation='relu'))
        self.model.add(Dense(num_classes, activation='softmax'))

        # Sparse loss: targets are integer class ids, not one-hot vectors.
        self.model.compile(loss='sparse_categorical_crossentropy',
                           optimizer=Adam(learning_rate=1e-5),
                           metrics=[f1_score_macro])

    def fit(self, X, y, validation_data=None):
        """Train the network with early stopping.

        Args:
            X: Training samples shaped to match ``input_shape``.
            y: Integer class ids aligned with ``X``.
            validation_data: Optional ``(X_val, y_val)`` tuple forwarded to
                Keras.

        Returns:
            KerasRNNClassifier: ``self``, following the scikit-learn convention.
        """
        # 'val_loss' only exists when validation data is supplied; without it the
        # callback could never trigger and training would run for all epochs.
        monitored = 'val_loss' if validation_data is not None else 'loss'
        early_stopping = EarlyStopping(monitor=monitored, patience=100)
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size,
                       validation_data=validation_data, callbacks=[early_stopping])
        return self

    def predict(self, X):
        """Return the index of the highest-scoring class for every sample."""
        predictions = self.model.predict(X)
        return np.argmax(predictions, axis=1)

In [None]:
# Ask TensorFlow which physical GPUs it can see and print the list
# (the printed label is Portuguese for "Available GPUs").
gpus = tf.config.list_physical_devices('GPU')
print("GPUs Disponíveis:", gpus)

GPUs Disponíveis: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
#tf.config.set_visible_devices(gpus[0], 'GPU')

In [None]:
# Load the raw training set straight from the project's GitHub repository.
df = load_data("https://raw.githubusercontent.com/luhenr/ML4DS/main/train.csv")

In [None]:
# Training-mode preprocessing: integer zone ids, `Consumer_number` dropped.
df_train = preprocess_data(df)

In [None]:
# Disabled experiment kept as a bare string literal: season indicator columns
# derived from `Month`. Nothing inside executes; the cell only echoes the string.
''' df_train['Spring'] = df_train['Month'].apply(lambda x: 1 if x in [3, 4, 5] else 0)
df_train['Summer'] = df_train['Month'].apply(lambda x: 1 if x in [6, 7, 8] else 0)
df_train['Autumn'] = df_train['Month'].apply(lambda x: 1 if x in [9, 10, 11] else 0)
df_train['Winter'] = df_train['Month'].apply(lambda x: 1 if x in [12, 1, 2] else 0)

df_train.drop(['Year', 'Month'], axis=1, inplace=True) '''

" df_train['Spring'] = df_train['Month'].apply(lambda x: 1 if x in [3, 4, 5] else 0)\ndf_train['Summer'] = df_train['Month'].apply(lambda x: 1 if x in [6, 7, 8] else 0)\ndf_train['Autumn'] = df_train['Month'].apply(lambda x: 1 if x in [9, 10, 11] else 0)\ndf_train['Winter'] = df_train['Month'].apply(lambda x: 1 if x in [12, 1, 2] else 0)\n\ndf_train.drop(['Year', 'Month'], axis=1, inplace=True) "

In [None]:
# Preview the first rows of the preprocessed training frame.
df_train.head()

Unnamed: 0,Year,Month,Consumer_type,Consumption,Installation_zone
0,2013,1,domestic,0,1
1,2013,1,industrial,5,2
2,2013,1,domestic,6,2
3,2013,1,domestic,1,2
4,2013,1,domestic,13,2


In [None]:
# (rows, columns) of the preprocessed training frame.
df_train.shape

(329975, 5)

In [None]:
# Features are every column except the label; the label is `Consumer_type`.
X = df_train.drop(['Consumer_type'], axis=1)
y = df_train['Consumer_type']

In [None]:
# Oversample with SMOTE, then carve out stratified train / validation / test sets.
# NOTE(review): SMOTE runs before the split, so synthetic points interpolated from
# the full dataset can land in the validation and test sets. This looks like data
# leakage that would inflate the reported scores — consider resampling only the
# training split.
X_sm, y_sm = balance_classes(X, y)
X_train, X_val, X_test, y_train, y_val, y_test = split_data(X_sm, y_sm)

#pca = PCA(n_components=0.95)
#X_train_pca = pca.fit_transform(X_train)
#X_test_pca = pca.transform(X_test)

label_encoder = LabelEncoder()

# The scaler is fitted on the training split only and reused for validation/test.
X_train_scaled, X_test_scaled, scaler = scale_data(X_train, X_test)
X_val_scaled = scaler.transform(X_val)

# Class names -> integer ids; the encoder is fitted on the training labels only.
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)
y_test_encoded = label_encoder.transform(y_test)




In [None]:
# Shape of the scaled training matrix.
display(X_train_scaled.shape)

(1190281, 4)

In [None]:
# Disabled drafts of the `pipelines` dict kept as a bare string literal; nothing
# inside executes.
''' pipelines = {
    'LightGBM': Pipeline([('classifier', LGBMClassifier(verbose=0, random_state=42, n_jobs=-1))]),
    'CatBoost': Pipeline([('classifier', CatBoostClassifier(verbose=0, random_state=42))]),
    'rf': Pipeline([('classifier', RandomForestClassifier(random_state=42, n_jobs=-1))]),
    'knn': Pipeline([('classifier', KNeighborsClassifier(n_jobs=-1))]),
    'dt': Pipeline([('classifier', DecisionTreeClassifier(random_state=42))]),
}

pipelines = {
    'LightGBM': Pipeline([('classifier', LGBMClassifier(verbose=0, random_state=42, n_jobs=-1))]),
    'CatBoost': Pipeline([('classifier', CatBoostClassifier(verbose=0, random_state=42))]),
    'rf': Pipeline([('classifier', RandomForestClassifier(random_state=42, n_jobs=-1))]),
    'knn': Pipeline([('classifier', KNeighborsClassifier(n_jobs=-1))]),
    'dt': Pipeline([('classifier', DecisionTreeClassifier(random_state=42))]),
    'RNN': PipelineWithFitParams([('classifier', rnn_model)])
} '''

In [None]:
# Add a trailing axis of length 1 so each sample becomes (n_features, 1), the
# per-sample shape handed to the Conv1D/LSTM model below.
X_train_scaled_rnn = np.expand_dims(X_train_scaled, axis=2)
X_test_scaled_rnn = np.expand_dims(X_test_scaled, axis=2)
X_val_scaled_rnn = np.expand_dims(X_val_scaled, axis=2)

# Per-sample input shape and number of distinct encoded labels in the training set.
input_shape = (X_train_scaled_rnn.shape[1], 1)
num_classes = len(np.unique(y_train_encoded))

rnn_model = KerasRNNClassifier(input_shape=input_shape, num_classes=num_classes)

In [None]:
# Shapes of the 3-D train / validation / test tensors fed to the RNN.
display(X_train_scaled_rnn.shape, X_val_scaled_rnn.shape, X_test_scaled_rnn.shape)

(1190281, 4, 1)

(132254, 4, 1)

(330634, 4, 1)

In [None]:
# Per-sample input shape and class count used to build the RNN.
print(input_shape, num_classes)

(4, 1) 7


In [None]:
# Candidate models keyed by display name. Each pipeline has a single 'classifier'
# step; only the RNN uses the custom pipeline class, because its fit call passes
# `classifier__validation_data`.
pipelines = {
    'RNN': PipelineWithFitParams([('classifier', rnn_model)]),
    'LightGBM': Pipeline([('classifier', LGBMClassifier(verbose=0, random_state=42, n_jobs=-1))]),
    'CatBoost': Pipeline([('classifier', CatBoostClassifier(verbose=0, random_state=42))]),
    'rf': Pipeline([('classifier', RandomForestClassifier(random_state=42, n_jobs=-1))]),
    'knn': Pipeline([('classifier', KNeighborsClassifier(n_jobs=-1))]),
    'dt': Pipeline([('classifier', DecisionTreeClassifier(random_state=42))]),
}

In [None]:
# Initial values for the best-model trackers reported after the comparison loop.
best_f1_score = 0
best_model_name = None
best_model_pipeline = None

In [None]:
# Fit every candidate, score it with macro F1 on the test split, and remember
# the best one. The original loop never updated the `best_*` trackers, so the
# summary always printed "None ... 0" and `best_model_pipeline` stayed None.
for model_name, pipeline in pipelines.items():
    if model_name == 'RNN':
        # The RNN takes 3-D inputs and needs validation data for early stopping.
        validation_data = (X_val_scaled_rnn, y_val_encoded)
        pipeline.fit(X_train_scaled_rnn, y_train_encoded, classifier__validation_data=validation_data)
        y_pred_rnn = pipeline.predict(X_test_scaled_rnn)
        f1_rnn = f1_score(y_test_encoded, y_pred_rnn, average='macro')
        print(f"RNN: Average F1-Score = {f1_rnn}")
        # `rnn_model` is the scikit-learn style wrapper and has no `save` method;
        # the Keras network it wraps is exposed as `.model`.
        rnn_model.model.save('rnn_model.h5')
        model_f1 = f1_rnn
    else:
        pipeline.fit(X_train_scaled, y_train_encoded)
        y_pred = pipeline.predict(X_test_scaled)
        f1 = f1_score(y_test_encoded, y_pred, average='macro')
        print(f"{model_name}: Average F1-Score = {f1}")
        model_f1 = f1

    # Keep the highest-scoring pipeline seen so far.
    if model_f1 > best_f1_score:
        best_f1_score = model_f1
        best_model_name = model_name
        best_model_pipeline = pipeline

print(f"Best Model: {best_model_name} with F1-Score = {best_f1_score}")

In [None]:
# Refit and re-score the selected pipeline on the test split.
# Fail with a clear message instead of an AttributeError on None when no model
# has been selected yet.
if best_model_pipeline is None:
    raise RuntimeError("No best model selected: set best_model_pipeline before running this cell.")

if best_model_name == 'RNN':
    # The RNN needs the 3-D tensors and validation data for early stopping.
    best_model_pipeline.fit(X_train_scaled_rnn, y_train_encoded,
                            classifier__validation_data=(X_val_scaled_rnn, y_val_encoded))
    y_pred = best_model_pipeline.predict(X_test_scaled_rnn)
else:
    best_model_pipeline.fit(X_train_scaled, y_train_encoded)
    y_pred = best_model_pipeline.predict(X_test_scaled)

f1 = f1_score(y_test_encoded, y_pred, average='macro')
print(f"{best_model_pipeline}: Average F1-Score with best model = {f1}")

Pipeline(steps=[('classifier',
                 DecisionTreeClassifier(max_depth=18, min_samples_leaf=2,
                                        min_samples_split=3,
                                        random_state=42))]): Average F1-Score with new Hiperparameters = 0.7472522449081197


In [None]:
def prepare_competition_data(filepath, preprocess_func, scaler):
    """Load the competition file and turn it into a scaled feature matrix.

    Args:
        filepath: CSV location handed to ``load_data``.
        preprocess_func: Callable invoked as
            ``preprocess_func(df, is_training_data=False)``; its result must
            still contain ``Consumer_number``.
        scaler: Fitted transformer exposing ``transform``.

    Returns:
        tuple: ``(X_competition_scaled, consumer_numbers)`` — the scaled
        features and a copy of the ``Consumer_number`` column taken before
        preprocessing.
    """
    competition_raw = load_data(filepath)
    # Keep the identifiers aside; they are removed from the features below.
    consumer_ids = competition_raw['Consumer_number'].copy()
    features = preprocess_func(competition_raw, is_training_data=False)
    features.drop('Consumer_number', axis=1, inplace=True)
    return scaler.transform(features), consumer_ids

In [None]:
def predict_and_export(model, X_competition_scaled, consumer_numbers, filename):
    """Predict with ``model`` and write the results as a two-column CSV.

    The raw output of ``model.predict`` is written as-is (no label decoding).

    Args:
        model: Object exposing ``predict(X)``.
        X_competition_scaled: Features to predict on.
        consumer_numbers: Identifiers aligned with the rows of the features.
        filename: Path or buffer passed to ``DataFrame.to_csv``.

    Returns:
        None. The CSV has columns ``Consumer_number`` and ``Consumer_type`` and
        no index column.
    """
    predicted_types = model.predict(X_competition_scaled)
    columns = {'Consumer_number': consumer_numbers, 'Consumer_type': predicted_types}
    pd.DataFrame(columns).to_csv(filename, index=False)

In [None]:
# Build the competition feature matrix with the scaler fitted on the training data.
X_competition, consumer_numbers = prepare_competition_data("https://raw.githubusercontent.com/luhenr/ML4DS/main/competition.csv", preprocess_data, scaler)
# `best_model` was never defined anywhere in the notebook; the selected model
# lives in `best_model_pipeline`. The RNN additionally needs a trailing channel axis.
X_competition_input = np.expand_dims(X_competition, axis=2) if best_model_name == 'RNN' else X_competition
competition_predictions = best_model_pipeline.predict(X_competition_input)
# Map the integer class ids back to the original `Consumer_type` labels.
competition_predictions = label_encoder.inverse_transform(competition_predictions)

In [None]:
# Write the submission file: one row per consumer with its decoded predicted type.
results_df = pd.DataFrame({'Consumer_number': consumer_numbers, 'Consumer_type': competition_predictions})
results_df.to_csv("competition_predictions.csv", index=False)