In [20]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from skopt import BayesSearchCV
from xgboost import XGBClassifier



In [21]:
# Path of the CSV file loaded by this notebook.
# Set the BILLIONAIRES_DATA_PATH environment variable to point at the file on
# another machine; when it is unset, the original hard-coded location is used,
# so existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    "BILLIONAIRES_DATA_PATH",
    "/home/matv864/it/AI_work/data/billionaires_ready.csv",
)

In [22]:
# Read the CSV located at DATA_PATH into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [23]:
# Target vector: the 'category' column of the loaded frame.
y_reg = df['category']
# Feature matrix: every column except the target.
X_reg = df.drop('category', axis=1)
# Display the distinct target labels as the cell output.
y_reg.unique()

array(['Fashion & Retail', 'Automotive', 'Technology',
       'Finance & Investments', 'Media & Entertainment', 'Telecom',
       'Diversified', 'Food & Beverage', 'Logistics',
       'Gambling & Casinos', 'Manufacturing', 'Metals & Mining', 'Energy',
       'Healthcare', 'Service', 'Real Estate',
       'Construction & Engineering', 'Sports'], dtype=object)

In [24]:
# Encode the string category labels as integer class ids.
# The encoder is fitted on the untouched `df['category']` column rather than on
# `y_reg` itself: with the self-referential form, re-running this cell would fit
# the encoder on already-encoded integers and replace the category names stored
# in `le.classes_` with numbers. Reading from `df` keeps the cell idempotent.
le = LabelEncoder()
y_reg = le.fit_transform(df['category'])

In [25]:
# Split into train and test: 30% of the rows are held out for testing,
# with a fixed seed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_reg,
    y_reg,
    random_state=42,
    test_size=0.3,
)

In [26]:
# Candidate hyperparameter values for the random forest.
rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)
# Seeded random forest; only random_state is set explicitly.
rf = RandomForestClassifier(random_state=42)

# Candidate hyperparameter values for XGBoost.
xgb_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    learning_rate=[0.01, 0.1, 0.2],
)
# Seeded XGBoost classifier evaluated with multiclass log-loss.
xgb = XGBClassifier(eval_metric='mlogloss', random_state=42)

# Без подбора гиперпараметров

In [27]:
# Baseline: fit each untuned model on the training split and report its
# accuracy on the held-out test split.
rf_pred = rf.fit(X_train, y_train).predict(X_test)
rf_baseline_accuracy = accuracy_score(y_test, rf_pred)
print("Accuracy RandomForest:", rf_baseline_accuracy)

xgb_pred = xgb.fit(X_train, y_train).predict(X_test)
xgb_baseline_accuracy = accuracy_score(y_test, xgb_pred)
print("Accuracy XGBoost:", xgb_baseline_accuracy)

Accuracy RandomForest: 0.24027777777777778
Accuracy XGBoost: 0.2222222222222222


# Поиск по решётке

In [28]:
# Exhaustive search over every combination in rf_params with 5-fold CV.
# The score printed below is the search's best_score_, obtained from the
# training split only; the test split is not used here.
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)
print("Лучшие параметры для RandomForest:", rf_grid.best_params_)
print("Лучшая accuracy:", rf_grid.best_score_)

Лучшие параметры для RandomForest: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 300}
Лучшая accuracy: 0.27609275053304905


In [29]:
# Exhaustive search over every combination in xgb_params with 5-fold CV.
# The score printed below is the search's best_score_, obtained from the
# training split only; the test split is not used here.
xgb_grid = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
xgb_grid.fit(X_train, y_train)
print("Лучшие параметры для XGBoost:", xgb_grid.best_params_)
print("Лучшая accuracy:", xgb_grid.best_score_)

Лучшие параметры для XGBoost: {'learning_rate': 0.01, 'max_depth': None, 'n_estimators': 200}
Лучшая accuracy: 0.2534363894811656


# Случайный поиск

In [30]:
# Randomized search: 10 candidates sampled from rf_params, scored with 5-fold CV.
# random_state is fixed so the same candidates are sampled on every run;
# without it the reported best parameters and score change between executions.
rf_random = RandomizedSearchCV(
    rf,
    rf_params,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
rf_random.fit(X_train, y_train)
print("Лучшие параметры для RandomForest:", rf_random.best_params_)
print("Лучшая accuracy:", rf_random.best_score_)


Лучшие параметры для RandomForest: {'n_estimators': 300, 'min_samples_split': 10, 'max_depth': 20}
Лучшая accuracy: 0.27609275053304905


In [31]:
# Randomized search: 10 candidates sampled from xgb_params, scored with 5-fold CV.
# random_state is fixed so the same candidates are sampled on every run;
# without it the reported best parameters and score change between executions.
xgb_random = RandomizedSearchCV(
    xgb,
    xgb_params,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
xgb_random.fit(X_train, y_train)
print("Лучшие параметры для XGBoost:", xgb_random.best_params_)
print("Лучшая accuracy:", xgb_random.best_score_)

Лучшие параметры для XGBoost: {'n_estimators': 100, 'max_depth': None, 'learning_rate': 0.01}
Лучшая accuracy: 0.24865671641791046


# Байесовская оптимизация

In [32]:
# Bayesian search over rf_params: 10 iterations, each scored with 5-fold CV.
# random_state is fixed so the sequence of evaluated candidates is repeatable;
# without it the search gives different results on every run.
rf_bayes = BayesSearchCV(
    rf,
    rf_params,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
rf_bayes.fit(X_train, y_train)
print("Лучшие параметры для RandomForest:", rf_bayes.best_params_)
print("Лучшая accuracy:", rf_bayes.best_score_)

Лучшие параметры для RandomForest: OrderedDict({'max_depth': None, 'min_samples_split': 10, 'n_estimators': 300})
Лучшая accuracy: 0.27549573560767593


In [33]:
# Bayesian search over xgb_params: 10 iterations, each scored with 5-fold CV.
# random_state is fixed so the sequence of evaluated candidates is repeatable;
# without it the search gives different results on every run.
xgb_bayes = BayesSearchCV(
    xgb,
    xgb_params,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
xgb_bayes.fit(X_train, y_train)
print("Лучшие параметры для XGBoost:", xgb_bayes.best_params_)
print("Лучшая accuracy:", xgb_bayes.best_score_)

Лучшие параметры для XGBoost: OrderedDict({'learning_rate': 0.01, 'max_depth': None, 'n_estimators': 300})
Лучшая accuracy: 0.2498560767590618
