
<a href="https://colab.research.google.com/github/petuch03/data-science-things/blob/master/ml_with_python/binary_classification_problem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://www.kaggle.com/competitions/nup-ml-1-2023-competition/leaderboard" target="_parent"><img src="https://www.kaggle.com/static/images/site-logo.svg" alt="Kaggle Competition"/></a>

In [76]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('train.csv')

for column in data.columns:
    if data[column].isna().sum() > 0:
        mean_value = data[column].mean()
        data[column].fillna(mean_value, inplace=True)

# Separate features and target
X = data.drop('target', axis=1)
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = RandomForestClassifier()
model.fit(X_train, y_train)

# Hyperparameters grid
param_grid = {
    'n_estimators': [100, 150, 200, 250, 300, 350, 400],  # Number of trees
    'max_features': ['sqrt'],  # Number of features to consider at every split
    'max_depth': [10, 20, 30, 40],  # Maximum number of levels in tree
    'random_state': [30, 35, 40, 45, 50, 55, 60]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
# best_model = model

# Print best parameters of the best_model
print("Best Parameters:", grid_search.best_params_)

# feature_importances = best_model.feature_importances_
# feature_names = X.columns
# feature_importances_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
# print(feature_importances_df.sort_values(by='Importance', ascending=False))

predictions = best_model.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

Fitting 5 folds for each of 196 candidates, totalling 980 fits
Best Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 200, 'random_state': 30}
Accuracy: 0.9


In [77]:
new_data = pd.read_csv('test.csv')

for column in new_data.columns:
    if new_data[column].isna().sum() > 0:
        mean_value = new_data[column].mean()
        new_data[column].fillna(mean_value, inplace=True)

new_data_scaled = scaler.transform(new_data.drop('Id', axis=1))
new_predictions = best_model.predict(new_data_scaled)

submission = pd.DataFrame({'target': new_predictions, 'Id': new_data['Id']})
submission.to_csv('submission.csv', index=False)