In [2]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [3]:
# Load the training table; the first row of the CSV provides the column names.
df = pd.read_csv(filepath_or_buffer='data/train/train.csv', header=0)

**Предобработка**

In [4]:
# Preview the first five rows of the training table.
df.head(n=5)

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06,C_ID_92a2005557,5,2,1,-0.820283
1,2017-01,C_ID_3d0044924f,4,1,0,0.392913
2,2016-08,C_ID_d639edf6cd,2,2,0,0.688056
3,2017-09,C_ID_186d6a6901,4,3,0,0.142495
4,2017-11,C_ID_cdbd2c0db2,1,3,0,-0.159749


In [11]:
# NOTE(review): the 'class' column is only created further down (cells In [8]
# and In [10]); this cell carries execution count In [11] and was run after
# them. On a fresh top-to-bottom run it fails because the column does not exist
# yet - move this cell below the labelling cell.
# Count how many rows fall into each class label.
df['class'].value_counts()

1    199710
2      2207
Name: class, dtype: int64

In [6]:
# Summary statistics of the regression target (count, mean, spread, quartiles).
df['target'].describe()

count    201917.000000
mean         -0.393636
std           3.850500
min         -33.219281
25%          -0.883110
50%          -0.023437
75%           0.765453
max          17.965068
Name: target, dtype: float64

In [7]:
# Parse the activation month into timestamps, then expose its year and month
# number as separate columns. Only 'mnth' is used as a model feature below.
df['first_active_month'] = pd.to_datetime(df['first_active_month'])
df['yr'] = df['first_active_month'].dt.year
df['mnth'] = df['first_active_month'].dt.month

In [8]:
# Start with every row labelled as class 1. Assigning a scalar broadcasts it to
# all rows, so no per-row Python list has to be built first.
df['class'] = 1

In [9]:
def get_class(x, threshold=-30):
    """Map a target value to a class label.

    Parameters
    ----------
    x : number
        Target value to label.
    threshold : number, default -30
        Cutoff; values strictly below it are labelled 2.

    Returns
    -------
    int
        2 when ``x < threshold``, otherwise 1. A NaN input yields 1 because
        the comparison evaluates to False.
    """
    return 2 if x < threshold else 1

In [10]:
# Label each row from its target value using get_class.
df.loc[:, 'class'] = df['target'].apply(get_class)

**Обучение и оценка**

In [12]:
from sklearn import tree
import numpy as np
from sklearn.model_selection import train_test_split

In [13]:
# Decision tree with default hyper-parameters. The seed is fixed (same value as
# the train/test split) so that repeated runs build the same tree.
model = tree.DecisionTreeClassifier(random_state=23)

In [14]:
# Feature matrix: the three feature_* columns plus the activation month number.
X = np.array(
    df[['feature_1', 'feature_2', 'feature_3', 'mnth']]
)

In [15]:
# Label vector taken from the 'class' column.
y = np.array(df.loc[:, 'class'])

In [16]:
# Hold out 30% of the rows for evaluation; the seed is fixed.
# NOTE(review): the split is not stratified although class 2 is only about 1%
# of the rows - consider whether stratifying on y is wanted here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=23,
)

In [17]:
# Train the classifier on the training split.
model = model.fit(X=X_train, y=y_train)

In [18]:
# Predicted class labels for the held-out rows.
y_pred = model.predict(X=X_test)

In [19]:
# Accuracy on the held-out split. The value looks high, but in the recorded run
# it equals the share of class-1 rows in the hold-out set (59897 of 60576), so
# it is no better than always predicting the majority class.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9887909403063919

In [20]:
# Accuracy on the training split, for comparison with the held-out score.
accuracy_score(
    y_true=y_train,
    y_pred=model.predict(X_train),
)

0.989189265676626

In [21]:
from sklearn.model_selection import cross_val_score

In [22]:
# 10-fold cross-validation over the full data set; one score per fold.
cross_val_score(estimator=model, X=X, y=y, cv=10)

array([0.98905507, 0.98905507, 0.98905507, 0.98905507, 0.98905507,
       0.98905507, 0.98905507, 0.98910406, 0.98910406, 0.98910406])

**Отчет**

In [23]:
# Per-class precision / recall / F1 on the held-out split. Label 1 is reported
# as 'normal' and label 2 as 'odd'.
target_names = ['normal', 'odd']
print(
    classification_report(y_test, y_pred, target_names=target_names)
)

              precision    recall  f1-score   support

      normal       0.99      1.00      0.99     59897
         odd       0.00      0.00      0.00       679

   micro avg       0.99      0.99      0.99     60576
   macro avg       0.49      0.50      0.50     60576
weighted avg       0.98      0.99      0.98     60576



  'precision', 'predicted', average, warn_for)


**Классификация**

In [24]:
# Load the test table and keep only rows that have an activation month.
# NOTE(review): rows without first_active_month are dropped and therefore get
# no prediction - confirm this is acceptable for the intended use.
df_test = pd.read_csv(filepath_or_buffer='data/test/test.csv', header=0)
df_test = df_test[df_test['first_active_month'].notnull()]

In [25]:
# Same date handling as for the training table: parse the activation month and
# expose its year and month number as separate columns.
df_test['first_active_month'] = pd.to_datetime(df_test['first_active_month'])
df_test['yr'] = df_test['first_active_month'].dt.year
df_test['mnth'] = df_test['first_active_month'].dt.month

In [26]:
# Preview the first five rows of the prepared test table.
df_test.head(n=5)

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,yr,mnth
0,2017-04-01,C_ID_0ab67a22ab,3,3,1,2017,4
1,2017-01-01,C_ID_130fd0cbdd,2,3,0,2017,1
2,2017-08-01,C_ID_b709037bc5,5,1,1,2017,8
3,2017-12-01,C_ID_d27d835a9f,2,1,0,2017,12
4,2015-12-01,C_ID_2b5e3df5c2,5,1,1,2015,12


In [27]:
# Print column dtypes and non-null counts to confirm the filtered test table
# has no missing values left.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 123622 entries, 0 to 123622
Data columns (total 7 columns):
first_active_month    123622 non-null datetime64[ns]
card_id               123622 non-null object
feature_1             123622 non-null int64
feature_2             123622 non-null int64
feature_3             123622 non-null int64
yr                    123622 non-null int64
mnth                  123622 non-null int64
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 7.5+ MB


In [28]:
# Feature matrix for the test table, using the same four columns (in the same
# order) that the model was trained on.
X_new = np.array(
    df_test[['feature_1', 'feature_2', 'feature_3', 'mnth']]
)

In [29]:
# Predicted class labels for the test rows.
y_new_pred = model.predict(X=X_new)

In [30]:
# Attach the predictions to the test table, row for row.
df_test['class'] = pd.Series(y_new_pred, index=df_test.index)

In [31]:
# Preview the test table with the predicted class attached.
df_test.head(n=5)

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,yr,mnth,class
0,2017-04-01,C_ID_0ab67a22ab,3,3,1,2017,4,1
1,2017-01-01,C_ID_130fd0cbdd,2,3,0,2017,1,1
2,2017-08-01,C_ID_b709037bc5,5,1,1,2017,8,1
3,2017-12-01,C_ID_d27d835a9f,2,1,0,2017,12,1
4,2015-12-01,C_ID_2b5e3df5c2,5,1,1,2015,12,1


In [32]:
# Distribution of predicted labels; in the recorded run every test row is
# predicted as class 1.
df_test.loc[:, 'class'].value_counts()

1    123622
Name: class, dtype: int64

Примерно так будет выглядеть классификатор, но на расширенном пространстве признаков. В таком варианте он, очевидно, плох: полнота и точность для класса «odd» равны нулю, несмотря на высокий accuracy_score.

In [33]:
# Same hold-out report as in cell In [23], repeated next to the conclusion:
# label 1 is reported as 'normal' and label 2 as 'odd'.
target_names = ['normal', 'odd']
print(
    classification_report(y_test, y_pred, target_names=target_names)
)

              precision    recall  f1-score   support

      normal       0.99      1.00      0.99     59897
         odd       0.00      0.00      0.00       679

   micro avg       0.99      0.99      0.99     60576
   macro avg       0.49      0.50      0.50     60576
weighted avg       0.98      0.99      0.98     60576



  'precision', 'predicted', average, warn_for)
