In [None]:
import pandas as pd
import os

# Directory holding the CSV exports and the list of files to read from it.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where that folder exists — consider making it configurable.
file_path = 'D:\\1 code AI'
file_names = ['new_data_v1.csv']

# Resolve each file name against the data directory, read every CSV into its
# own frame, then stack the frames into one table with a fresh 0..n-1 index.
csv_paths = [os.path.join(file_path, name) for name in file_names]
datasets = [pd.read_csv(csv_path) for csv_path in csv_paths]
dataset = pd.concat(datasets, ignore_index=True)

# Show how many rows carry each distinct value of the 'Label' column.
label_counts = dataset['Label'].value_counts()
print("Label counts:", label_counts, sep="\n")

# Column names, dtypes, non-null counts and memory usage of the combined frame.
dataset.info()


In [None]:
from sklearn.model_selection import train_test_split

# Columns that must not appear in the feature matrix: the target column
# itself plus the year/month/day/hour columns.
columns_drop = ["Label", "year", "month", "day", "hour"]
# Keep only the names that are actually present so that a dataset lacking
# some of these columns does not make `drop` raise a KeyError.
existing_columns_to_drop = [col for col in columns_drop if col in dataset.columns]

# Feature matrix (everything except the dropped columns) and target vector.
X = dataset.drop(columns=existing_columns_to_drop)
y = dataset["Label"]

print(X.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None".
X.info()
print(y.head())
print(y.value_counts())

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y so both
# partitions keep the label proportions of the full data, and the fixed seed
# makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Number of rows that ended up in each partition.
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Per-label row counts for each partition, to confirm the stratification.
for split_title, split_labels in (
    ("Training set label distribution:", y_train),
    ("Test set label distribution:", y_test),
):
    print(split_title)
    print(split_labels.value_counts())


In [None]:
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold, RandomizedSearchCV, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Search space for the randomized search: each entry lists the candidate
# values for the RandomForestClassifier argument of the same name.
param_dist = dict(
    n_estimators=np.arange(50, 601, 10),
    max_depth=[None, 10, 50, 100],
    min_samples_split=np.arange(2, 21),
    min_samples_leaf=np.arange(1, 21),
    max_features=['sqrt', 'log2', None],
    class_weight=['balanced', 'balanced_subsample'],
)

# Cross-validation splitter: a single (non-repeated) stratified 5-fold split,
# shuffled with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Base estimator whose hyperparameters are being tuned.
rf = RandomForestClassifier(random_state=42)

# Randomized search: 100 sampled parameter settings, each evaluated with the
# F1 score on the folds produced by `skf`, using all available cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=100,
    scoring='f1',
    cv=skf,
    random_state=42,
    n_jobs=-1,
    verbose=10,
)

# Run the search on the training partition only.
rf_random.fit(X_train, y_train)

# Estimator refit with the best-scoring parameter setting found by the search.
best_rf = rf_random.best_estimator_
print("Best Parameters from Randomized Search:", rf_random.best_params_)

# score() evaluates the refit best estimator with the search's scoring
# ('f1'), here on the held-out test partition.
best_score = rf_random.score(X_test, y_test)
print("Best score on test set:", best_score)


In [None]:
from sklearn.metrics import classification_report

# Predict labels for the held-out test partition with the tuned model.
y_pred = best_rf.predict(X_test)

# Text summary of the per-class metrics comparing predictions to true labels.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)