In [97]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [98]:
df = pd.read_csv("temp.csv", sep=";").drop(["Unnamed: 0"], axis=1)

In [99]:
df

Unnamed: 0,date,year,mounth,day,weekday,temp,pres,prec,wind,absenteeism,...,1,2,3,4,5,6,7,8,9,10
0,20120114,2012,1,14,6,-21,756,0,1.0,4,...,0,0,0,1,0,0,0,0,0,0
1,20120116,2012,1,16,1,-19,758,0,2.0,3,...,0,0,1,0,0,0,0,0,0,0
2,20120117,2012,1,17,2,-27,770,0,1.0,3,...,0,0,1,0,0,0,0,0,0,0
3,20120118,2012,1,18,3,-28,776,0,2.0,6,...,0,0,0,0,0,1,0,0,0,0
4,20120119,2012,1,19,4,-27,778,0,2.0,4,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1689,20181225,2018,12,25,2,-23,770,0,1.0,2,...,0,1,0,0,0,0,0,0,0,0
1690,20181226,2018,12,26,3,-22,770,0,,1,...,1,0,0,0,0,0,0,0,0,0
1691,20181227,2018,12,27,4,-23,770,0,,1,...,1,0,0,0,0,0,0,0,0,0
1692,20181228,2018,12,28,5,-25,766,0,,0,...,0,0,0,0,0,0,0,0,0,0


In [100]:
X = df[["year", "mounth", "day", "weekday", "temp", "pres", "prec", "wind"]]
X["prec"].value_counts()

0    1457
2     174
1      57
3       6
Name: prec, dtype: int64

In [101]:
sorted(df.absenteeism.unique())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [102]:
X = X.apply(pd.to_numeric).fillna(0).astype("int")

In [103]:
y = df["absenteeism"].astype("int")

In [104]:
y

0       4
1       3
2       3
3       6
4       4
       ..
1689    2
1690    1
1691    1
1692    0
1693    0
Name: absenteeism, Length: 1694, dtype: int32

In [105]:
y.value_counts()

3     479
2     354
4     276
1     206
0     158
5     114
6      58
7      24
8      12
9       9
10      4
Name: absenteeism, dtype: int64

In [106]:
X

Unnamed: 0,year,mounth,day,weekday,temp,pres,prec,wind
0,2012,1,14,6,-21,756,0,1
1,2012,1,16,1,-19,758,0,2
2,2012,1,17,2,-27,770,0,1
3,2012,1,18,3,-28,776,0,2
4,2012,1,19,4,-27,778,0,2
...,...,...,...,...,...,...,...,...
1689,2018,12,25,2,-23,770,0,1
1690,2018,12,26,3,-22,770,0,0
1691,2018,12,27,4,-23,770,0,0
1692,2018,12,28,5,-25,766,0,0


In [107]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

# Дерево

In [108]:
kfold = StratifiedKFold(n_splits=3) # задаем объект для разделения данных на фолды при кросс-валидации

model = DecisionTreeClassifier()

params = {
          'max_depth': range(1,50,2)
         }  # перебираемые параметры модели

grid = GridSearchCV(model, params, cv=kfold, scoring='accuracy', n_jobs=-1)

In [109]:
grid.fit(X_train, y_train)



GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
             error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=-1, param_grid={'max_depth': range(1, 50, 2)},
             pre_dispa

In [110]:
mod = grid.best_estimator_
mod

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [111]:
mod = grid.best_estimator_
y_pred = mod.predict(X_test)
accuracy_score(y_test, y_pred)

0.3850687622789784

In [112]:
y_test.value_counts()

3     144
2     106
4      83
1      62
0      48
5      34
6      17
7       7
8       4
9       3
10      1
Name: absenteeism, dtype: int64

In [113]:
delta = [[i[0], int(i[1]), int(i[1]) - i[0]] for i in list(zip(y_test, y_pred))]
delta

[[3, 3, 0],
 [4, 4, 0],
 [0, 0, 0],
 [4, 3, -1],
 [3, 2, -1],
 [2, 3, 1],
 [4, 3, -1],
 [3, 4, 1],
 [3, 3, 0],
 [1, 1, 0],
 [0, 0, 0],
 [5, 4, -1],
 [4, 3, -1],
 [3, 3, 0],
 [3, 2, -1],
 [4, 2, -2],
 [3, 3, 0],
 [4, 3, -1],
 [2, 3, 1],
 [5, 3, -2],
 [2, 4, 2],
 [2, 3, 1],
 [4, 3, -1],
 [3, 2, -1],
 [1, 4, 3],
 [6, 3, -3],
 [1, 1, 0],
 [2, 2, 0],
 [2, 1, -1],
 [3, 3, 0],
 [0, 2, 2],
 [3, 4, 1],
 [1, 1, 0],
 [0, 0, 0],
 [3, 3, 0],
 [6, 3, -3],
 [2, 2, 0],
 [3, 3, 0],
 [3, 3, 0],
 [3, 0, -3],
 [0, 2, 2],
 [4, 3, -1],
 [1, 4, 3],
 [1, 0, -1],
 [4, 3, -1],
 [3, 3, 0],
 [3, 2, -1],
 [1, 1, 0],
 [4, 3, -1],
 [0, 3, 3],
 [1, 1, 0],
 [4, 4, 0],
 [4, 3, -1],
 [1, 3, 2],
 [1, 1, 0],
 [3, 4, 1],
 [4, 3, -1],
 [2, 1, -1],
 [3, 3, 0],
 [2, 3, 1],
 [4, 4, 0],
 [0, 2, 2],
 [1, 1, 0],
 [8, 3, -5],
 [3, 2, -1],
 [0, 0, 0],
 [2, 2, 0],
 [4, 4, 0],
 [2, 3, 1],
 [5, 4, -1],
 [0, 0, 0],
 [9, 7, -2],
 [3, 1, -2],
 [3, 3, 0],
 [2, 2, 0],
 [2, 1, -1],
 [4, 3, -1],
 [1, 4, 3],
 [3, 4, 1],
 [3, 3, 0],
 [3, 4, 1]

# Линейная регрессия

In [114]:
from sklearn import preprocessing
from sklearn import metrics
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [115]:
LR = LinearRegression()
model = LR.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [116]:
linear_scoring = model_selection.cross_val_score(model, X_train, y_train, scoring="neg_mean_absolute_error", cv=10)
print(f"{linear_scoring.mean()}, {linear_scoring.std()}")

-1.048227175607445, 0.08225230425046402


In [117]:
X_train

Unnamed: 0,year,mounth,day,weekday,temp,pres,prec,wind
347,2012,9,22,6,10,751,0,1
963,2016,5,25,3,22,750,0,2
1557,2019,4,12,5,10,756,0,5
1511,2019,2,1,5,-32,764,0,2
328,2013,6,8,6,21,750,0,5
...,...,...,...,...,...,...,...,...
934,2016,4,19,2,16,750,0,4
1637,2018,10,19,5,6,746,0,1
587,2013,10,12,6,-2,766,0,1
194,2011,12,27,2,-5,751,2,9


In [118]:
print('\nIntercept: \n', model.intercept_)
print('\nScore test: \n', model.score(X_test, y_test))
print('\nScore train: \n', model.score(X_train, y_train))
print('\nMean absolute error: \n', metrics.mean_absolute_error(y_test, y_pred))
print('\nMedian absolute error: \n', metrics.median_absolute_error(y_test, y_pred), "\n")
print(*delta, sep="\n")


Intercept: 
 -557.0105773569918

Score test: 
 0.3782800863658222

Score train: 
 0.3747395225529191

Mean absolute error: 
 1.0619777966918487

Median absolute error: 
 0.880700201593072 

[3, 3, 0]
[4, 4, 0]
[0, 0, 0]
[4, 3, -1]
[3, 2, -1]
[2, 3, 1]
[4, 3, -1]
[3, 4, 1]
[3, 3, 0]
[1, 1, 0]
[0, 0, 0]
[5, 4, -1]
[4, 3, -1]
[3, 3, 0]
[3, 2, -1]
[4, 2, -2]
[3, 3, 0]
[4, 3, -1]
[2, 3, 1]
[5, 3, -2]
[2, 4, 2]
[2, 3, 1]
[4, 3, -1]
[3, 2, -1]
[1, 4, 3]
[6, 3, -3]
[1, 1, 0]
[2, 2, 0]
[2, 1, -1]
[3, 3, 0]
[0, 2, 2]
[3, 4, 1]
[1, 1, 0]
[0, 0, 0]
[3, 3, 0]
[6, 3, -3]
[2, 2, 0]
[3, 3, 0]
[3, 3, 0]
[3, 0, -3]
[0, 2, 2]
[4, 3, -1]
[1, 4, 3]
[1, 0, -1]
[4, 3, -1]
[3, 3, 0]
[3, 2, -1]
[1, 1, 0]
[4, 3, -1]
[0, 3, 3]
[1, 1, 0]
[4, 4, 0]
[4, 3, -1]
[1, 3, 2]
[1, 1, 0]
[3, 4, 1]
[4, 3, -1]
[2, 1, -1]
[3, 3, 0]
[2, 3, 1]
[4, 4, 0]
[0, 2, 2]
[1, 1, 0]
[8, 3, -5]
[3, 2, -1]
[0, 0, 0]
[2, 2, 0]
[4, 4, 0]
[2, 3, 1]
[5, 4, -1]
[0, 0, 0]
[9, 7, -2]
[3, 1, -2]
[3, 3, 0]
[2, 2, 0]
[2, 1, -1]
[4, 3, -1]
[1, 4, 3]

In [119]:
model.predict([[2018, 12, 6, 5, -21, 770, 0, 1]])

array([4.71477372])

# Нейронка

In [120]:
from keras.models import Sequential
from keras.layers import Dense

In [145]:
y = df[["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

In [122]:
model = Sequential()

In [123]:
model.add(Dense(12, input_dim=8, activation='relu'))

In [124]:
model.add(Dense(15, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(11, activation='sigmoid'))

In [127]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])

In [146]:
model.fit(X_train, y_train, epochs = 100, batch_size=20)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x27b86421348>

In [147]:
y_pred = model.predict(X_test, batch_size=20)

In [158]:
np.around(y_pred.astype("float"), decimals=3).tolist()

[[0.203, 0.25, 0.261, 0.283, 0.097, 0.023, 0.006, 0.001, 0.002, 0.0, 0.0],
 [0.059, 0.116, 0.246, 0.333, 0.197, 0.063, 0.03, 0.008, 0.008, 0.0, 0.0],
 [0.253, 0.285, 0.262, 0.265, 0.075, 0.016, 0.004, 0.0, 0.002, 0.0, 0.0],
 [0.087, 0.148, 0.252, 0.324, 0.173, 0.051, 0.022, 0.005, 0.007, 0.0, 0.0],
 [0.091, 0.154, 0.255, 0.331, 0.183, 0.055, 0.025, 0.006, 0.008, 0.0, 0.0],
 [0.119, 0.181, 0.257, 0.315, 0.151, 0.042, 0.016, 0.003, 0.005, 0.0, 0.0],
 [0.019, 0.054, 0.221, 0.328, 0.219, 0.081, 0.042, 0.014, 0.008, 0.0, 0.0],
 [0.13, 0.191, 0.259, 0.314, 0.147, 0.04, 0.015, 0.003, 0.005, 0.0, 0.0],
 [0.053, 0.109, 0.243, 0.33, 0.196, 0.063, 0.03, 0.008, 0.008, 0.0, 0.0],
 [0.236, 0.271, 0.219, 0.255, 0.091, 0.032, 0.01, 0.001, 0.004, 0.0, 0.0],
 [0.174, 0.227, 0.236, 0.29, 0.128, 0.042, 0.016, 0.002, 0.006, 0.0, 0.0],
 [0.039, 0.09, 0.239, 0.339, 0.219, 0.076, 0.04, 0.013, 0.01, 0.0, 0.0],
 [0.001, 0.006, 0.162, 0.313, 0.283, 0.154, 0.1, 0.054, 0.006, 0.008, 0.001],
 [0.064, 0.122, 0.247, 