In [5]:
import pandas as pd
import numpy as np
from category_encoders.one_hot import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from category_encoders.leave_one_out import LeaveOneOutEncoder
import re
import warnings
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.utils.class_weight import compute_class_weight
from sklearn.decomposition import PCA

warnings.simplefilter("ignore")
%matplotlib inline

In [6]:
df_train = pd.read_csv('df_train_copy.csv')

In [3]:
df_train['class'].value_counts()

class
p    1705396
e    1411549
Name: count, dtype: int64

In [7]:
y_train = df_train["class"]                  # Таргет тренировочного датасета
y_train = y_train.map({"p":0, "e":1})

In [8]:
X_train = df_train.drop(["class"], axis=1)   # Тренировочный датасет

In [5]:
encoder = OneHotEncoder(return_df=True)
X_train_2 = encoder.fit_transform(X_train,y_train)

In [8]:
X_train_2.to_csv('X_train_2.csv',index=False)

In [6]:
# Получение весов классов
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

In [7]:
dt =  DecisionTreeClassifier(criterion='entropy', max_depth=6, min_samples_split=3, random_state=18, class_weight=dict(enumerate(class_weights)))

In [6]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X_train_2, y_train, test_size=0.3, random_state=42)

In [55]:
dt.fit(Xtrain, ytrain)

In [12]:
y_train_dt = dt.predict(Xtrain)

In [13]:
y_test_dt = dt.predict(Xtest)

In [14]:
print(f"Decision tree accuracy train: {accuracy_score(y_train_dt, ytrain):.8f}")
print(f"Decision tree accuracy test: {accuracy_score(y_test_dt, ytest):.8f}")

Decision tree accuracy train: 0.72584871
Decision tree accuracy test: 0.72599574


In [15]:
print(round(matthews_corrcoef( ytrain,y_train_dt),5))

0.4682


In [16]:
print(round(matthews_corrcoef( ytest,y_test_dt),5))

0.46847


In [71]:
df_test = pd.read_csv('df_copy_test_.csv')

In [72]:
df_test

Unnamed: 0.1,Unnamed: 0,cap-diameter,cap-shape,cap-color,does-bruise-or-bleed,gill-color,stem-height,stem-width,stem-color,has-ring,habitat,season
0,0,8.64,x,n,t,w,11.13,17.12,w,t,d,a
1,1,6.90,o,o,f,y,1.27,10.75,n,f,d,a
2,2,2.00,b,n,f,n,6.18,3.14,n,f,d,s
3,3,3.47,x,n,f,n,4.98,8.51,w,t,d,u
4,4,6.17,x,y,f,y,6.73,13.70,y,t,d,u
...,...,...,...,...,...,...,...,...,...,...,...,...
2077959,2077959,0.88,x,w,f,w,2.67,1.35,e,f,d,u
2077960,2077960,3.12,x,w,f,w,2.69,7.38,w,f,g,a
2077961,2077961,5.73,x,e,f,w,6.16,9.74,y,t,d,a
2077962,2077962,5.03,b,n,f,g,6.00,3.46,g,f,d,a


In [25]:
y_test_kaggle_dt = dt.predict(X_test_2)

In [30]:
print(f"Decision tree accuracy train: {accuracy_score(y_test_kaggle_dt, predict_np):.8f}")

ValueError: Classification metrics can't handle a mix of binary and multiclass-multioutput targets

In [104]:
print(round(matthews_corrcoef( predict_np,y_test_kaggle_dt),5))

0.3099


In [74]:
X_test_2 = encoder.fit_transform(df_test)

In [21]:
predict = pd.read_csv('predict_kaggle.csv')

In [65]:
predict = predict.replace('e',1)

In [66]:
predict = predict.replace('p',0)

In [67]:
predict

Unnamed: 0,id,0
0,3116945,1
1,3116946,0
2,3116947,0
3,3116948,0
4,3116949,1
...,...,...
2077959,5194904,0
2077960,5194905,0
2077961,5194906,0
2077962,5194907,1


In [63]:
predict_np = np.array(predict)

In [28]:
df_train_kaggle = pd.read_csv('train.csv')

In [29]:
df_train_kaggle['class'].value_counts()

class
p    1705396
e    1411549
Name: count, dtype: int64

In [7]:
df_train_kaggle

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.80,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,3116940,e,9.29,f,,n,t,,,w,...,b,,w,u,w,t,g,,d,u
3116941,3116941,e,10.88,s,,w,t,d,c,p,...,,,w,,,f,f,,d,u
3116942,3116942,p,7.82,x,e,e,f,a,,w,...,,,y,,w,t,z,,d,a
3116943,3116943,e,9.45,p,i,n,t,e,,p,...,,y,w,,,t,p,,d,u


In [31]:
predict['0'].value_counts()

0
0    1133269
1     944695
Name: count, dtype: int64

In [32]:
predict

Unnamed: 0,id,0
0,3116945,1
1,3116946,0
2,3116947,0
3,3116948,0
4,3116949,1
...,...,...
2077959,5194904,0
2077960,5194905,0
2077961,5194906,0
2077962,5194907,1


In [33]:
df_train_kaggle = pd.read_csv('train.csv')

In [34]:
df_train_kaggle['class'].value_counts()

class
p    1705396
e    1411549
Name: count, dtype: int64

## Метод главных компонент

In [36]:
dec = PCA()
# dec.fit(X_train_2) # обучаем PCA

In [15]:
faces_images = mean_face
red.fit(faces_images)

array([[-1.00000000e+00, -1.50581450e-09, -3.22055627e-11, ...,
         2.88106907e-11, -6.58908605e-10, -1.12364781e-10],
       [-3.88054989e-09,  4.36595343e-01,  5.35802376e-04, ...,
         1.58587753e-03, -2.51193919e-03,  3.06978216e-04],
       [ 2.60963897e-10,  8.27376628e-01,  4.96766311e-03, ...,
         1.98228718e-03,  2.22359163e-03, -1.28777320e-03],
       ...,
       [ 0.00000000e+00, -8.24008904e-17, -2.58721724e-02, ...,
         5.54835407e-02,  5.54835407e-02,  5.54835407e-02],
       [ 0.00000000e+00,  5.63340437e-18, -3.57286967e-02, ...,
        -1.12124842e-02, -1.12124842e-02, -1.12124842e-02],
       [ 0.00000000e+00,  5.56767462e-17,  3.40470181e-02, ...,
         4.28972557e-01,  4.28972557e-01,  4.28972557e-01]])

In [40]:
# X_train_pca = dec.fit(Xtrain)    

In [52]:
X_train_pca = dec.fit_transform(Xtrain)   # обучаем PCA

In [56]:
y_train_predict_pca = dt.predict(X_train_pca)

In [57]:
X_test_pca = dec.transform(Xtest)

In [58]:
y_test_predict_pca = dt.predict(X_test_pca)

In [59]:
print(f"Decision tree accuracy train: {accuracy_score(ytrain,y_train_predict_pca):.8f}")

Decision tree accuracy train: 0.54617182


In [60]:
print(f"Decision tree accuracy train: {accuracy_score(y_test_predict_pca, ytest):.8f}")

Decision tree accuracy train: 0.54615414


In [61]:
print(round(matthews_corrcoef(ytest, y_test_predict_pca),5))

-0.00618


In [62]:
print(round(matthews_corrcoef(y_test_predict_pca,ytest ),5))

-0.00618


In [68]:
predict

Unnamed: 0,id,0
0,3116945,1
1,3116946,0
2,3116947,0
3,3116948,0
4,3116949,1
...,...,...
2077959,5194904,0
2077960,5194905,0
2077961,5194906,0
2077962,5194907,1


In [75]:
X_test_2

Unnamed: 0.1,Unnamed: 0,cap-diameter,cap-shape_1,cap-shape_2,cap-shape_3,cap-shape_4,cap-shape_5,cap-shape_6,cap-shape_7,cap-shape_8,...,habitat_18,habitat_19,habitat_20,habitat_21,habitat_22,habitat_23,season_1,season_2,season_3,season_4
0,0,8.64,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1,6.90,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,2,2.00,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,3,3.47,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,4,6.17,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2077959,2077959,0.88,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2077960,2077960,3.12,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2077961,2077961,5.73,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2077962,2077962,5.03,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [76]:
df_test_pca = dec.transform(X_test_2)

In [77]:
y_test_kaggle_dt = dt.predict(df_test_pca)

In [82]:
print(round(matthews_corrcoef( predict_np,y_test_kaggle_dt),5))

0.01521


In [81]:
predict_np = np.array(predict['0'])

## Бустинг

In [9]:
from sklearn.ensemble import GradientBoostingClassifier

In [10]:
gb = GradientBoostingClassifier(n_estimators = 200, learning_rate = 0.1, max_depth = 10, loss = 'exponential', verbose = 1)

In [11]:
gb.fit(Xtrain, ytrain)

      Iter       Train Loss   Remaining Time 
         1           0.9404          122.60m
         2           0.8926          118.63m
         3           0.8411          117.80m
         4           0.7957          116.84m
         5           0.7587          116.05m
         6           0.7246          115.43m
         7           0.6899          114.70m
         8           0.6603          113.99m
         9           0.6239          113.27m
        10           0.5914          112.53m
        20           0.4079          106.33m
        30           0.3232          100.31m
        40           0.2760           94.97m
        50           0.2522           89.46m
        60           0.2317           83.76m
        70           0.2187           78.05m
        80           0.2070           72.20m
        90           0.1975           66.23m
       100           0.1923           60.32m
       200           0.1652            0.00s


In [12]:
import pickle

In [9]:
from joblib import dump, load

In [19]:
dump(gb, 'gb_model.joblib')

['gb_model.joblib']

In [11]:
gb = load('gb_model.joblib')

In [13]:
gb_model = pickle.dumps(gb)

In [12]:
pred_gb = gb.predict(Xtest)

In [13]:
print(f"Decision tree accuracy train: {accuracy_score(ytest,pred_gb):.8f}")

Decision tree accuracy train: 0.97942217


In [14]:
print(f"Decision tree matthews corrcoef: {matthews_corrcoef(pred_gb,ytest):.8f}")

Decision tree matthews corrcoef: 0.95848870


In [15]:
pred_train_gb = gb.predict(Xtrain)

In [16]:
print(f"Decision tree accuracy train: {accuracy_score(ytrain,pred_train_gb):.8f}")

Decision tree accuracy train: 0.98076688


In [17]:
print(f"Decision tree matthews corrcoef: {matthews_corrcoef(ytrain,pred_train_gb):.8f}")

Decision tree matthews corrcoef: 0.96120493


In [28]:
df_test = pd.read_csv('df_copy_test_.csv')

In [19]:
X_test_2 = encoder.fit_transform(df_test)

In [20]:
pred_test_kaggle_gb = gb.predict(X_test_2)

In [30]:
print(f"Decision tree accuracy train: {accuracy_score(pred_test_kaggle_gb,df_predict_kaggle_np):.8f}")

Decision tree accuracy train: 0.11051058


In [32]:
print(f"Decision tree matthews corrcoef: {matthews_corrcoef(df_predict_kaggle_np,pred_test_kaggle_gb):.8f}")

Decision tree matthews corrcoef: -0.05378312


In [None]:
X_train_2 = pd.read_csv('X_train_2.csv')

In [40]:
df_predict_kaggle = pd.read_csv('predict_kaggle.csv')

In [41]:
df_predict_kaggle_0 = df_predict_kaggle['0']

In [34]:
df_predict_kaggle_0 = df_predict_kaggle_0.replace('p',1)

In [35]:
df_predict_kaggle_0 = df_predict_kaggle_0.replace('e',-1)

In [42]:
df_predict_kaggle_np = np.array(df_predict_kaggle_0)

In [29]:
df_predict_kaggle_np

array([-1,  1,  1, ...,  1, -1, -1])

In [33]:
def cleaning(df):

    threshold = 100

    cat_feats = ["cap-shape","cap-surface","cap-color","does-bruise-or-bleed","gill-attachment",
                 "gill-spacing","gill-color","stem-root","stem-surface","stem-color","veil-type",
                 "veil-color","has-ring","ring-type","spore-print-color","spore-print-color",
                 "habitat","season"]

    for feat in cat_feats:
        df[feat] = df[feat].fillna('missing')
        #df.loc[df[feat].value_counts(dropna=False)[df[feat]].values < threshold, feat] = "noise"
        df[feat] = df[feat].astype('category')

    return df

In [38]:
dataset = cleaning(df_train)

In [10]:
df_train = pd.read_csv('train.csv')

In [11]:
X = dataset
X = X.drop(["class"], axis="columns")
y = dataset["class"]

NameError: name 'dataset' is not defined

In [16]:
gb_new = GradientBoostingClassifier(n_estimators = 1000, learning_rate = 0.1, max_depth = 10, loss = 'exponential', verbose = 1)

In [42]:
df_test

Unnamed: 0.1,Unnamed: 0,cap-diameter,cap-shape,cap-color,does-bruise-or-bleed,gill-color,stem-height,stem-width,stem-color,has-ring,habitat,season
0,0,8.64,x,n,t,w,11.13,17.12,w,t,d,a
1,1,6.90,o,o,f,y,1.27,10.75,n,f,d,a
2,2,2.00,b,n,f,n,6.18,3.14,n,f,d,s
3,3,3.47,x,n,f,n,4.98,8.51,w,t,d,u
4,4,6.17,x,y,f,y,6.73,13.70,y,t,d,u
...,...,...,...,...,...,...,...,...,...,...,...,...
2077959,2077959,0.88,x,w,f,w,2.67,1.35,e,f,d,u
2077960,2077960,3.12,x,w,f,w,2.69,7.38,w,f,g,a
2077961,2077961,5.73,x,e,f,w,6.16,9.74,y,t,d,a
2077962,2077962,5.03,b,n,f,g,6.00,3.46,g,f,d,a


In [13]:
X_train = df_train[['cap-diameter', 'stem-height', 'stem-width']]

In [20]:
X_train['cap-diameter']=X_train['cap-diameter'].fillna(X_train['cap-diameter'].mean())


In [14]:
y_train = df_train['class']

In [21]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

In [22]:
gb_new.fit(Xtrain, ytrain)

      Iter       Train Loss   Remaining Time 
         1           0.9629          113.40m
         2           0.9337          111.76m
         3           0.9076          110.62m
         4           0.8846          109.68m
         5           0.8642          109.79m
         6           0.8468          110.16m
         7           0.8304          109.80m
         8           0.8158          109.47m
         9           0.8036          109.47m
        10           0.7901          109.20m
        20           0.7073          108.17m
        30           0.6701          106.94m
        40           0.6547          105.92m
        50           0.6473          105.41m
        60           0.6433          104.42m
        70           0.6396          103.78m
        80           0.6371          102.65m
        90           0.6351          101.65m
       100           0.6333          100.72m
       200           0.6173           90.14m
       300           0.6062           78.78m
       40

In [25]:
pred_gb = gb_new.predict(Xtest)

In [26]:
print(f"Decision tree accuracy train: {accuracy_score(ytest,pred_gb):.8f}")

Decision tree accuracy train: 0.82212935


In [19]:
df_copy_test_[i]=df_test[i].fillna(df_test[i].mean())


cap-diameter    4
stem-height     0
stem-width      0
dtype: int64

In [27]:
print(f"Decision tree matthews corrcoef: {matthews_corrcoef(ytest,pred_gb):.8f}")

Decision tree matthews corrcoef: 0.64181242


In [30]:
df_test_3 = df_test[['cap-diameter', 'stem-height', 'stem-width']]

In [31]:
pred_test_gb = gb_new.predict(df_test_3)

In [43]:
print(f"Decision tree accuracy train: {accuracy_score(df_predict_kaggle_np,pred_test_gb):.8f}")

Decision tree accuracy train: 0.82379339


In [44]:

print(f"Decision tree matthews corrcoef: {matthews_corrcoef(df_predict_kaggle_np,pred_test_gb):.8f}")

Decision tree matthews corrcoef: 0.64518921
