In [30]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.stats import chi2_contingency

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import jaccard_score

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
import tensorflow as tf
print(tf.test.is_gpu_available())
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
True
Num GPUs Available:  1


In [3]:
from fastparquet import ParquetFile

# Parquet File okuma
pf = ParquetFile("train_final.parquet")
pf_test = ParquetFile("test_final.parquet")
pf_sub = ParquetFile("submission_sample_final.parquet")
# Bir pandas dataFrame'ine dönüştürme
df_train = pf.to_pandas()
df_test = pf_test.to_pandas()
df_sub = pf_sub.to_pandas()

#Csv olarak kaydetme
#dataFrame.to_csv("train_final.csv", index = False)

In [4]:
df_train.target

0        menu2, menu4, menu5
1        menu7, menu8, menu4
2        menu2, menu8, menu4
3        menu6, menu2, menu1
4        menu6, menu2, menu8
                ...         
94044    menu2, menu8, menu4
94045    menu9, menu2, menu5
94046    menu6, menu2, menu4
94047    menu6, menu8, menu4
94048    menu6, menu2, menu8
Name: target, Length: 94049, dtype: object

In [5]:
len(df_train.target.unique())

112

In [6]:
len(df_train.id.unique())

80478

In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94049 entries, 0 to 94048
Data columns (total 58 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           94049 non-null  object 
 1   month        94049 non-null  int64  
 2   n_seconds_1  94049 non-null  float64
 3   n_seconds_2  94049 non-null  float64
 4   n_seconds_3  94049 non-null  float64
 5   carrier      94049 non-null  object 
 6   devicebrand  94049 non-null  object 
 7   feature_0    94049 non-null  float64
 8   feature_1    94049 non-null  float64
 9   feature_2    94049 non-null  float64
 10  feature_3    94049 non-null  float64
 11  feature_4    94049 non-null  float64
 12  feature_5    94049 non-null  float64
 13  feature_6    94049 non-null  float64
 14  feature_7    94049 non-null  float64
 15  feature_8    94049 non-null  float64
 16  feature_9    94049 non-null  float64
 17  feature_10   94049 non-null  float64
 18  feature_11   94049 non-null  float64
 19  feat

In [8]:
len(df_train.carrier.unique()),len(df_train.devicebrand.unique())

(556, 64)

In [9]:
len(df_test.carrier.unique()),len(df_test.devicebrand.unique())

(169, 42)

In [10]:
df_train['corr_48_49'] = df_train['feature_48'] + df_train['feature_49']
df_test['corr_48_49'] = df_test['feature_48'] + df_test['feature_49']

df_train['log+1'] = (df_train['feature_48']+1).transform(np.log)
df_test['log+1'] = (df_train['feature_48']+1).transform(np.log)

In [11]:
from sklearn.preprocessing import MinMaxScaler

# Min-Max ölçeklendirme nesnesini oluşturun
scaler48 = MinMaxScaler(feature_range=(-5, 5))
scaler49 = MinMaxScaler(feature_range=(-5, 5))
scalercorr = MinMaxScaler(feature_range=(-5, 5))
scaler_log = MinMaxScaler(feature_range=(-5, 5))

df_train['feature_48'] = scaler48.fit_transform(df_train[['feature_48']])
df_train['feature_49'] = scaler49.fit_transform(df_train[['feature_49']])
df_train['corr_48_49'] = scalercorr.fit_transform(df_train[['corr_48_49']])
df_train['log+1'] = scaler_log.fit_transform(df_train[['log+1']])


df_test['feature_48'] = scaler48.transform(df_test[['feature_48']])
df_test['feature_49'] = scaler49.transform(df_test[['feature_49']])
df_test['corr_48_49'] = scalercorr.transform(df_test[['corr_48_49']])
df_test['log+1'] = scaler_log.transform(df_test[['log+1']])

In [12]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94049 entries, 0 to 94048
Data columns (total 60 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           94049 non-null  object 
 1   month        94049 non-null  int64  
 2   n_seconds_1  94049 non-null  float64
 3   n_seconds_2  94049 non-null  float64
 4   n_seconds_3  94049 non-null  float64
 5   carrier      94049 non-null  object 
 6   devicebrand  94049 non-null  object 
 7   feature_0    94049 non-null  float64
 8   feature_1    94049 non-null  float64
 9   feature_2    94049 non-null  float64
 10  feature_3    94049 non-null  float64
 11  feature_4    94049 non-null  float64
 12  feature_5    94049 non-null  float64
 13  feature_6    94049 non-null  float64
 14  feature_7    94049 non-null  float64
 15  feature_8    94049 non-null  float64
 16  feature_9    94049 non-null  float64
 17  feature_10   94049 non-null  float64
 18  feature_11   94049 non-null  float64
 19  feat

In [13]:
target_column = 'encoded_target'
numerical_columns = df_train.select_dtypes(exclude=['object']).columns.tolist()
categorical_columns = df_train.select_dtypes(include=['object']).columns.tolist()

In [15]:
"""for feature_column in numerical_columns:
    contingency_table = pd.crosstab(df_train['target'], df_train[feature_column])
    chi2, p, _, _ = chi2_contingency(contingency_table)

    # Elde edilen p-değeri ile ilişkinin anlamlılığını değerlendirin
    if p < 0.05:
        print(f"{feature_column} kategorik hedef ile anlamlı bir ilişkiye sahip.")
    else:
        print(f"{feature_column} kategorik hedef ile anlamlı bir ilişkiye sahip değil.")"""

'for feature_column in numerical_columns:\n    contingency_table = pd.crosstab(df_train[\'target\'], df_train[feature_column])\n    chi2, p, _, _ = chi2_contingency(contingency_table)\n\n    # Elde edilen p-değeri ile ilişkinin anlamlılığını değerlendirin\n    if p < 0.05:\n        print(f"{feature_column} kategorik hedef ile anlamlı bir ilişkiye sahip.")\n    else:\n        print(f"{feature_column} kategorik hedef ile anlamlı bir ilişkiye sahip değil.")'

In [14]:
correlations = {}
for feature_column in numerical_columns:
    correlation = df_train['month'].corr(df_train[feature_column])
    correlations[feature_column] = correlation
    sorted_correlations = sorted(correlations.items(), key=lambda x: abs(x[1]), reverse=True)

# En yüksek korelasyona sahip sütunları gösterin
top_correlations = sorted_correlations[:4]  # İlk 10 sütunu alabilirsiniz
print("Kategorik Hedef ile En Yüksek Korelasyona Sahip Sütunlar:")
for column, correlation in top_correlations:
    print(f"{column}: {correlation}")



Kategorik Hedef ile En Yüksek Korelasyona Sahip Sütunlar:
month: 1.0
feature_45: -0.3570091567816251
feature_28: 0.24762530966690724
feature_34: -0.2356685640655781


In [15]:
def get_redundant_pairs(df):
    '''Get diagonal and lower triangular pairs of correlation matrix'''
    pairs_to_drop = set()
    cols = df.columns
    for i in range(0, df.shape[1]):
        for j in range(0, i+1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

def get_top_abs_correlations(df, n=5):
    au_corr = df.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(df)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[0:n]

In [17]:
print("Top Absolute Correlations")
top_abs_corr = get_top_abs_correlations(df_train.drop(columns=['id','carrier','devicebrand','target']), 12)
print(top_abs_corr)

Top Absolute Correlations
feature_48   log+1          0.943166
feature_49   corr_48_49     0.936150
feature_48   corr_48_49     0.836542
corr_48_49   log+1          0.769881
n_seconds_2  n_seconds_3    0.618358
feature_48   feature_49     0.590486
n_seconds_1  n_seconds_2    0.583850
feature_20   feature_21     0.581451
feature_9    feature_21     0.560666
feature_11   feature_44     0.541293
feature_49   log+1          0.528766
feature_8    feature_34     0.520911
dtype: float64


In [18]:
carrier_counts = df_train['carrier'].value_counts()
# Frekansı 400'den küçük olanları "low" olarak işaretleyin
low_carrier_mask = carrier_counts <= 400#400-20
low_carriers = carrier_counts[low_carrier_mask].index.tolist()
df_train.loc[df_train['carrier'].isin(low_carriers), 'carrier'] = 'other'

carrier_counts = df_test['carrier'].value_counts()
# Frekansı 400'den küçük olanları "low" olarak işaretleyin
low_carrier_mask = carrier_counts < 50#50-3
low_carriers = carrier_counts[low_carrier_mask].index.tolist()
df_test.loc[df_test['carrier'].isin(low_carriers), 'carrier'] = 'other'

In [19]:
df_train['carrier'] = df_train['carrier'].str.replace(' ', '_')
#df_train['carrier'] = df_train['carrier'].str.replace('TR', '')

df_test['carrier'] = df_test['carrier'].str.replace(' ', '_')
#df_test['carrier'] = df_test['carrier'].str.replace('TR', '')

In [20]:
devicebrand_counts = df_train['devicebrand'].value_counts()
# Frekansı 400'den küçük olanları "low" olarak işaretleyin
devicebrand_mask = devicebrand_counts <= 400#400
devicebrand = devicebrand_counts[devicebrand_mask].index.tolist()
df_train.loc[df_train['devicebrand'].isin(devicebrand), 'devicebrand'] = 'other'

devicebrand_counts = df_test['devicebrand'].value_counts()
# Frekansı 400'den küçük olanları "low" olarak işaretleyin
devicebrand_mask = devicebrand_counts < 50#50
devicebrand = devicebrand_counts[devicebrand_mask].index.tolist()
df_test.loc[df_test['devicebrand'].isin(devicebrand), 'devicebrand'] = 'other'

In [21]:
# Verilen string
def get_binary_target(menu_str):
    
    # Virgülle ayrılmış menüleri bir liste olarak elde etmek için split() kullanabiliriz.
    menu_list = menu_str.split(",")
    #print(menu_list)
    
    # Sadece sayıları içeren bir liste oluşturmak için menü isimlerinden "menü" kısmını kaldıralım.
    numbers_list = [menu.strip().replace("menu", "") for menu in menu_list]

    # Elde edilen sayılar listesi
    #print(numbers_list)

    binary_list = [format(int(number), '04b') for number in numbers_list]

    # Elde edilen binary sayılar listesi
    #print(binary_list)

    binary_string = ''.join(binary_list)

    # Elde edilen binary dize
    #print(binary_string) 
    return binary_string
    
df_train['encoded_target'] = df_train['target'].apply(get_binary_target)    

In [23]:
df_train = pd.get_dummies(df_train, columns=['carrier', 'devicebrand'], drop_first=True)
df_test = pd.get_dummies(df_test, columns=['carrier', 'devicebrand'], drop_first=True)

In [24]:
# Bağımsız değişkenleri ve hedef değişkeni ayırın
X = df_train.drop(['id','target','encoded_target'], axis=1)

In [31]:
"""from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score


# Eğitim ve test verilerini ayırın
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# XGBoost modelini oluşturun
xgb_model = XGBClassifier()

# GridSearchCV için hiperparametrelerin olası değerlerini belirtin
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# GridSearchCV ile modeli eğitim verilerine uyarlayın
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='accuracy', cv=3)
grid_search.fit(X_train, y_train)

# En iyi modeli bulun
best_model = grid_search.best_estimator_

# Test verilerini kullanarak modelin performansını değerlendirin
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("En iyi modelin doğruluk skoru:", accuracy)

# En iyi modelin hiperparametrelerini görüntüleyin
print("En iyi modelin hiperparametreleri:", grid_search.best_params_)"""

'from sklearn.model_selection import train_test_split, GridSearchCV\nfrom xgboost import XGBClassifier\nfrom sklearn.metrics import accuracy_score\n\n\n# Eğitim ve test verilerini ayırın\nX_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)\n\n# XGBoost modelini oluşturun\nxgb_model = XGBClassifier()\n\n# GridSearchCV için hiperparametrelerin olası değerlerini belirtin\nparam_grid = {\n    \'n_estimators\': [100, 200, 300],\n    \'max_depth\': [3, 4, 5],\n    \'learning_rate\': [0.1, 0.01, 0.001],\n    \'subsample\': [0.8, 0.9, 1.0],\n    \'colsample_bytree\': [0.8, 0.9, 1.0]\n}\n\n# GridSearchCV ile modeli eğitim verilerine uyarlayın\ngrid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring=\'accuracy\', cv=3)\ngrid_search.fit(X_train, y_train)\n\n# En iyi modeli bulun\nbest_model = grid_search.best_estimator_\n\n# Test verilerini kullanarak modelin performansını değerlendirin\ny_pred = best_model.predict(X_test)\naccu

In [26]:
X = df_train.drop(['id',"target",'encoded_target'], axis=1)#,'month'
y = df_train["encoded_target"] #df_train["target"]

In [27]:
from sklearn.preprocessing import LabelEncoder

# Hedef değişkeni etiket kodlama ile dönüştürün
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

In [28]:
"""
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
"""

'\nfrom sklearn.model_selection import train_test_split\nX_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)\n'

In [35]:
import catboost as cb
from sklearn.metrics import mean_squared_error
import optuna

# K-Fold çapraz doğrulama ayarları
n_splits = 5
kf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

def objective(trial, X=X, y=y_encoded):
    
    params = {
    "iterations": 1000,#trial.suggest_int("iterations", 1000, 3000),
    "subsample": trial.suggest_float("subsample", 0.5, 0.99),
    'od_wait': trial.suggest_int('od_wait', 10, 50, step=1),#
    "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
    "depth": trial.suggest_int("depth", 1, 10),
    "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 50.0),#
    "random_strength": trial.suggest_int("random_strength", 1, 10, step=1),#
    #"max_depth": trial.suggest_int("max_depth", 4, 10, step=1),
    "max_bin": trial.suggest_categorical("max_bin", [32, 64, 255]), #
    "bootstrap_type": 'Bernoulli'#trial.suggest_categorical("bootstrap_type", ['Bernoulli', 'Poisson'])
}

    
    model = cb.CatBoostClassifier(**params, silent=True,task_type="GPU", devices="0:1", loss_function="MultiClass", rsm=1)
    jaccard_scores = []
    
    for train_index, test_index in kf.split(X, y):
        #print(train_index,test_index)
        X_train, X_test =X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]

        model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True, verbose=100, early_stopping_rounds=50)
        y_pred = model.predict(X_test, prediction_type='Class')
        
        jaccard = jaccard_score(y_test, y_pred, average='micro')
        jaccard_scores.append(jaccard)

    return np.mean(jaccard_scores)


In [48]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

[I 2023-10-13 01:07:39,682] A new study created in memory with name: no-name-e2862099-e6cf-4634-b1fc-7c972a8214bf


0:	learn: 4.7121607	test: 4.7123028	best: 4.7123028 (0)	total: 597ms	remaining: 9m 56s
100:	learn: 4.2535882	test: 4.2658204	best: 4.2658204 (100)	total: 58.2s	remaining: 8m 37s


[W 2023-10-13 01:09:33,517] Trial 0 failed with parameters: {'subsample': 0.5572495076247349, 'od_wait': 43, 'learning_rate': 0.004055388842936129, 'depth': 8, 'min_data_in_leaf': 52, 'l2_leaf_reg': 22.602623060867913, 'random_strength': 10, 'max_bin': 64} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "c:\Users\necme\AppData\Local\Programs\Python\Python39\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\necme\AppData\Local\Temp\ipykernel_5696\2367257373.py", line 34, in objective
    model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True, verbose=100, early_stopping_rounds=50)
  File "c:\Users\necme\AppData\Local\Programs\Python\Python39\lib\site-packages\catboost\core.py", line 5100, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
  File "c:\Users

KeyboardInterrupt: 

In [None]:
print('Best hyperparameters:', study.best_params)
print('Best RMSE:', study.best_value)

In [29]:
import catboost as cb

kf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
best_params = {
    "iterations": 1000,
    'subsample': 0.9389133536514731,
    'od_wait': 29,
    'learning_rate': 0.0066266955993544933,
    'depth': 5,
    'min_data_in_leaf': 70,
    'l2_leaf_reg': 22.95368957722943,
    'random_strength': 9,
    'max_bin': 255,
    'bootstrap_type': 'Bernoulli'  # "Bernoulli" olarak ayarlayın
}

jaccard_scores_micro=[]
jaccard_scores_weighted=[]
jaccard_scores_macro=[]

for train_index, test_index in kf.split(X, y_encoded):
    #print(train_index,test_index)
    X_train, X_test =X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = [y_encoded[i] for i in train_index], [y_encoded[i] for i in test_index]
    
    model = cb.CatBoostClassifier(**best_params, silent=True, task_type="GPU", devices="0:1", loss_function="MultiClass", rsm=1)
    model.fit(X_train, y_train, use_best_model=True, verbose=100, early_stopping_rounds=50)
    
    #model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True, verbose=100, early_stopping_rounds=50)
    y_pred = model.predict(X_test, prediction_type='Class')
    print(y_pred)
    print('---------------')
    print(y_test)
    #y_pred = encoder.inverse_transform(y_pred)
    for param,jlist in zip(['weighted', 'micro', 'macro'],[jaccard_scores_micro,jaccard_scores_weighted,jaccard_scores_macro]):
        score = jaccard_score(y_test, y_pred,  average=param)  
        print(param, " jaccard_score=",score)
        jlist.append(score)   

print(f"micro = {np.mean(jaccard_scores_micro)}, weighted = {np.mean(jaccard_scores_weighted)}, macro = {np.mean(jaccard_scores_macro)}")

#model = cb.CatBoostClassifier(**best_params, silent=True, task_type="GPU", devices="0:1", loss_function="MultiClass", rsm=1)
#model.fit(X, y_encoded, use_best_model=True, verbose=100, early_stopping_rounds=50)


You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 4.6913323	total: 742ms	remaining: 12m 20s
100:	learn: 3.7475270	total: 29.3s	remaining: 4m 20s
200:	learn: 3.5040742	total: 57.7s	remaining: 3m 49s
300:	learn: 3.3927702	total: 1m 21s	remaining: 3m 9s
400:	learn: 3.3390288	total: 1m 39s	remaining: 2m 28s
500:	learn: 3.3144602	total: 1m 57s	remaining: 1m 56s
600:	learn: 3.3026907	total: 2m 19s	remaining: 1m 32s
700:	learn: 3.2969882	total: 2m 40s	remaining: 1m 8s
800:	learn: 3.2932087	total: 2m 58s	remaining: 44.4s
900:	learn: 3.2906153	total: 3m 17s	remaining: 21.7s
999:	learn: 3.2887477	total: 3m 35s	remaining: 0us
[[37]
 [37]
 [37]
 ...
 [38]
 [39]
 [40]]
---------------
[40, 41, 41, 40, 93, 41, 39, 38, 100, 57, 39, 38, 36, 85, 38, 84, 5, 87, 16, 4, 38, 37, 1, 5, 37, 8, 97, 83, 41, 86, 38, 37, 38, 38, 37, 53, 37, 87, 38, 16, 5, 59, 5, 85, 42, 52, 6, 88, 2, 37, 85, 87, 37, 15, 38, 38, 5, 38, 58, 38, 38, 86, 38, 5, 39, 38, 38, 85, 68, 5, 5, 37, 84, 37, 4, 38, 7, 41, 41, 9, 41, 39, 86, 34, 39, 85, 38, 40, 38, 40, 39, 37, 38, 8

You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 4.6908108	total: 324ms	remaining: 5m 24s
100:	learn: 3.7477410	total: 32.2s	remaining: 4m 46s
200:	learn: 3.5064521	total: 1m 4s	remaining: 4m 16s
300:	learn: 3.3936170	total: 1m 30s	remaining: 3m 29s
400:	learn: 3.3398912	total: 1m 50s	remaining: 2m 44s
500:	learn: 3.3164314	total: 2m 9s	remaining: 2m 8s
600:	learn: 3.3056793	total: 2m 27s	remaining: 1m 37s
700:	learn: 3.3000575	total: 2m 46s	remaining: 1m 10s
800:	learn: 3.2965040	total: 3m 4s	remaining: 45.9s
900:	learn: 3.2941213	total: 3m 23s	remaining: 22.3s
999:	learn: 3.2922301	total: 3m 41s	remaining: 0us
[[38]
 [37]
 [39]
 ...
 [38]
 [40]
 [38]]
---------------
[3, 37, 37, 1, 1, 39, 9, 37, 39, 49, 43, 103, 5, 38, 34, 37, 53, 5, 6, 37, 50, 55, 37, 38, 37, 10, 5, 88, 5, 86, 41, 40, 43, 50, 2, 39, 6, 55, 85, 5, 39, 39, 53, 39, 82, 5, 1, 38, 37, 40, 1, 35, 90, 35, 85, 38, 84, 58, 85, 39, 85, 5, 84, 5, 38, 15, 37, 2, 87, 17, 50, 88, 41, 50, 38, 38, 37, 39, 38, 7, 38, 37, 4, 40, 37, 38, 37, 37, 96, 38, 38, 37, 38, 50, 5, 

You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 4.6904515	total: 331ms	remaining: 5m 31s
100:	learn: 3.7460592	total: 32.8s	remaining: 4m 51s
200:	learn: 3.5048022	total: 1m 4s	remaining: 4m 16s
300:	learn: 3.3925112	total: 1m 31s	remaining: 3m 33s
400:	learn: 3.3381661	total: 1m 52s	remaining: 2m 47s
500:	learn: 3.3140353	total: 2m 11s	remaining: 2m 10s
600:	learn: 3.3027793	total: 2m 29s	remaining: 1m 39s
700:	learn: 3.2970921	total: 2m 48s	remaining: 1m 11s
800:	learn: 3.2934442	total: 3m 6s	remaining: 46.5s
900:	learn: 3.2909431	total: 3m 25s	remaining: 22.6s
999:	learn: 3.2889138	total: 3m 43s	remaining: 0us
[[37]
 [37]
 [37]
 ...
 [38]
 [39]
 [40]]
---------------
[5, 8, 37, 41, 38, 67, 40, 5, 5, 37, 85, 37, 40, 37, 2, 5, 15, 84, 101, 38, 8, 37, 39, 37, 50, 34, 53, 43, 38, 61, 88, 8, 50, 40, 37, 85, 40, 37, 7, 68, 87, 64, 39, 51, 4, 38, 37, 84, 37, 40, 5, 39, 20, 38, 38, 40, 5, 53, 85, 87, 41, 5, 5, 87, 37, 39, 101, 37, 51, 5, 58, 38, 68, 15, 37, 4, 39, 85, 37, 40, 37, 39, 1, 85, 41, 38, 41, 38, 111, 9, 50, 43, 38, 1

You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 4.6906207	total: 329ms	remaining: 5m 29s
100:	learn: 3.7523640	total: 32s	remaining: 4m 44s
200:	learn: 3.5057973	total: 1m 3s	remaining: 4m 12s
300:	learn: 3.3935827	total: 1m 31s	remaining: 3m 32s
400:	learn: 3.3394985	total: 1m 51s	remaining: 2m 46s
500:	learn: 3.3158821	total: 2m 10s	remaining: 2m 9s
600:	learn: 3.3040293	total: 2m 29s	remaining: 1m 39s
700:	learn: 3.2978018	total: 2m 47s	remaining: 1m 11s
800:	learn: 3.2940806	total: 3m 6s	remaining: 46.3s
900:	learn: 3.2913086	total: 3m 24s	remaining: 22.5s
999:	learn: 3.2893003	total: 3m 42s	remaining: 0us
[[38]
 [37]
 [39]
 ...
 [40]
 [38]
 [38]]
---------------
[75, 67, 5, 85, 41, 37, 37, 37, 41, 38, 84, 2, 5, 50, 84, 97, 85, 40, 87, 40, 5, 38, 34, 37, 38, 39, 41, 4, 43, 52, 87, 38, 15, 34, 37, 37, 39, 43, 38, 59, 38, 15, 15, 64, 99, 34, 84, 90, 97, 5, 40, 38, 82, 39, 4, 38, 39, 41, 103, 37, 38, 64, 37, 38, 6, 37, 37, 38, 1, 37, 5, 15, 41, 3, 41, 38, 84, 90, 84, 39, 4, 87, 50, 98, 87, 37, 18, 41, 8, 37, 91, 68, 40, 5

In [None]:
#nonefe  micro = 0.06721261703756036, weighted = 0.11644868018491901, macro = 0.005086400259969646
#48+49   micro = 0.0672992256168072, weighted = 0.11652800948640081, macro = 0.005094674203180238
#48+49+2000 iteration micro = 0.0707430630834861, weighted = 0.1193591927002253, macro = 0.0054186431040227955


In [30]:
y_encoded

array([ 5, 75,  8, ..., 38, 59, 40])

In [37]:
df_test = df_test.drop(['id'],axis=1)
y_pred = model.predict(df_test)
y_pred = encoder.inverse_transform(y_pred)

In [38]:
lists = []
for pred in y_pred:
    temp_list=[]
    for i in range(0,len(pred),4):
        decimal_num = int(pred[i:i+4], 2)
        input = 'menu' + str(decimal_num)
        temp_list.append(input)
    lists.append(temp_list)

In [39]:
ikili_veriler = []
for satir in lists:
    ikili_dize = ""
    for menu in range(1, 10):  # Menu numaralarını 1'den 8'e kadar alın
        if f'menu{menu}' in satir:
            ikili_dize += "1"
        else:
            ikili_dize += "0"
    ikili_veriler.append(ikili_dize)

# İkili verileri görüntüleyin
#for ikili_dize in ikili_veriler:
#    print(ikili_dize)

In [41]:
pf_sub = ParquetFile("submission_sample_final.parquet")
df_sub = pf_sub.to_pandas()
df_sub.target = ikili_veriler
df_sub.to_csv('df.csv',index=False)