In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import Hinge

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
import string

from nltk.corpus import stopwords

import numpy as np
import pandas as pd
import keras_tuner as kt
from kerastuner import HyperParameters
import torch
from kerastuner.tuners import RandomSearch

2023-05-12 22:26:53.543696: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from kerastuner import HyperParameters
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Define SVM model
def svm_model(hp):
    model = tf.keras.Sequential()
    model.add(Dense(units=hp.Int('units', 64, 256, step=32), activation='relu', kernel_regularizer=regularizers.l2(hp.Float('l2', 0.001, 0.01, step=0.001))))
    model.add(Dense(units=1, activation='linear'))
    model.compile(optimizer=Adam(hp.Choice('learning_rate', values=[1e-3, 1e-4])), loss=Hinge(), metrics=['accuracy'])
    return model

def clean_text(text):
    ''' 
    Some basic text cleaning phases this function should be applied to the text column
    of the data
    '''
    # lower text
    text = text.lower()
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = set(stopwords.words('english'))
    stop = [w for w in stop if w not in ['not', 'no']]
    text = [x for x in text if (x not in stop)]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    # tokenize the text
    #text = text.split()
    return(text)

def apply_cleaning(X_train):
    '''
    With this fonction we apply the clean_text function to the text column of the data
    and it gives as output the cleaned data for each row of the text column
    '''
    X_train = X_train.apply(lambda x: clean_text(x))
    return(X_train)

In [3]:
train = pd.read_csv('train_data.csv', nrows=1000)
X_train, X_test, y_train, y_test = train_test_split(train["sentences"], train["labels"], test_size = 0.10, random_state = 42)
del train
X_train = apply_cleaning(X_train)
X_test = apply_cleaning(X_test)

In [4]:
# Convert text data to TF-IDF features
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [5]:
# Convert data to TensorFlow Dataset format
train_ds = tf.data.Dataset.from_tensor_slices((X_train_tfidf.toarray(), y_train))
test_ds = tf.data.Dataset.from_tensor_slices((X_test_tfidf.toarray(), y_test))


2023-05-12 22:27:08.960107: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:966] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-12 22:27:08.966257: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:966] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-12 22:27:08.966335: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:966] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-12 22:27:08.968447: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow wi

In [6]:
# Define hyperparameter search space
hp = HyperParameters()
hp.Int('tfidf_max_features', 3000, 7000, sampling=None)
hp.Choice('svc_C', [0.1, 1.0, 10.0])
hp.Choice('svc_kernel', ['linear', 'rbf'])

# Define the tuner
tuner = RandomSearch(
    svm_model(hp),
    objective='val_accuracy',
    max_trials=10,
    hyperparameters=hp,
    seed=42,
    directory='my_dir',
    project_name='svm_tuning')

# Perform hyperparameter tuning
tuner.search(train_ds.batch(32), validation_data=test_ds.batch(32), epochs=3)

# Get best hyperparameters and train SVM model on full dataset
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
model = svm_model(best_hps)
model.fit(X_train_tfidf.toarray(), y_train) 

INFO:tensorflow:Reloading Oracle from existing project my_dir/svm_tuning/oracle.json


ValueError: `sampling` can only be set on an `Int` when `step=1`.

: 

In [9]:
# Get all the trial summaries as a list of dictionaries
trial_summaries = tuner.results_summary()

# Convert the list of dictionaries to a pandas dataframe
df = pd.DataFrame.from_records(trial_summaries)

# Display the dataframe
display(df)


NameError: name 'tuner' is not defined