In [38]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.one_hot import OneHotEncoder
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
import sklearn


In [21]:
df = pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a


In [22]:
y = df["class"]
y = y.map({"p":-1, "e":1})

In [23]:
X = df.drop(["class"], axis=1)

In [24]:
X.isnull().sum()

id                            0
cap-diameter                  4
cap-shape                    40
cap-surface              671023
cap-color                    12
does-bruise-or-bleed          8
gill-attachment          523936
gill-spacing            1258435
gill-color                   57
stem-height                   0
stem-width                    0
stem-root               2757023
stem-surface            1980861
stem-color                   38
veil-type               2957493
veil-color              2740947
has-ring                     24
ring-type                128880
spore-print-color       2849682
habitat                      45
season                        0
dtype: int64

In [8]:
X.shape

(3116945, 21)

In [9]:
y.value_counts()

class
-1    1705396
 1    1411549
Name: count, dtype: int64

In [25]:
X = X.drop(["stem-root", "veil-type", "veil-color", "spore-print-color"], axis=1)

In [11]:
X.shape

(3116945, 17)

In [12]:
X.head()

Unnamed: 0,id,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-surface,stem-color,has-ring,ring-type,habitat,season
0,0,8.8,f,s,u,f,a,c,w,4.51,15.39,,w,f,f,d,a
1,1,4.51,x,h,o,f,a,c,n,4.79,6.48,y,o,t,z,d,w
2,2,6.94,f,s,b,f,x,c,w,6.85,9.93,s,n,f,f,l,w
3,3,3.88,f,y,g,f,s,,g,4.16,6.53,,w,f,f,d,u
4,4,5.85,x,l,w,f,d,,w,3.37,8.36,,w,f,f,g,a


In [26]:
X = X.drop(["gill-spacing", "stem-surface"], axis=1)


In [14]:
X.head()

Unnamed: 0,id,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,0,8.8,f,s,u,f,a,w,4.51,15.39,w,f,f,d,a
1,1,4.51,x,h,o,f,a,n,4.79,6.48,o,t,z,d,w
2,2,6.94,f,s,b,f,x,w,6.85,9.93,n,f,f,l,w
3,3,3.88,f,y,g,f,s,g,4.16,6.53,w,f,f,d,u
4,4,5.85,x,l,w,f,d,w,3.37,8.36,w,f,f,g,a


### Выделение и преобразование категориальных признаков

In [27]:
cat_cols = ["cap-shape","cap-surface",  "cap-color", "does-bruise-or-bleed", "gill-attachment","gill-color", "stem-color", "has-ring", "ring-type", "habitat", "season"]

In [28]:
encoder = LeaveOneOutEncoder(return_df=True)
X1 = encoder.fit_transform(X,y)
X1.dtypes


id                        int64
cap-diameter            float64
cap-shape               float64
cap-surface             float64
cap-color               float64
does-bruise-or-bleed    float64
gill-attachment         float64
gill-color              float64
stem-height             float64
stem-width              float64
stem-color              float64
has-ring                float64
ring-type               float64
habitat                 float64
season                  float64
dtype: object

In [29]:
X1.head()

Unnamed: 0,id,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,0,8.8,-0.02389,0.261312,-0.078651,-0.111527,-0.292689,0.143119,4.51,15.39,0.164571,-0.066509,-0.093477,-0.057678,-0.143173
1,1,4.51,-0.020332,0.059745,-0.434519,-0.111526,-0.292686,-0.429756,4.79,6.48,-0.193312,-0.182206,-0.992318,-0.057677,0.307986
2,2,6.94,-0.02389,0.261312,0.747912,-0.111527,-0.076904,0.143119,6.85,9.93,-0.221857,-0.066509,-0.093477,0.210151,0.307979
3,3,3.88,-0.02389,0.076031,0.176156,-0.111527,-0.178318,0.038004,4.16,6.53,0.164571,-0.066509,-0.093477,-0.057678,-0.164923
4,4,5.85,-0.020333,0.14236,0.00588,-0.111527,-0.142756,0.143119,3.37,8.36,0.164571,-0.066509,-0.093477,-0.349208,-0.143173


In [30]:
X1.isnull().sum()

id                      0
cap-diameter            4
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-color              0
stem-height             0
stem-width              0
stem-color              0
has-ring                0
ring-type               0
habitat                 0
season                  0
dtype: int64

In [31]:
X1.fillna(X1.mean(), inplace=True) # избавимся от пропусков в cap-diameter

In [32]:
X1.isnull().sum()

id                      0
cap-diameter            0
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-color              0
stem-height             0
stem-width              0
stem-color              0
has-ring                0
ring-type               0
habitat                 0
season                  0
dtype: int64

**Разделим выборку**

In [33]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X1, y, test_size=0.3, random_state=42)

**Обучим линейную модель**

In [39]:
model = LogisticRegression()
model.fit(Xtrain, ytrain)

#y_train = model.predict(Xtrain)
y_pred = model.predict(Xtest)


print(accuracy_score(ytest, y_pred))

0.6228360232877475


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
