In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [35]:
df = pd.read_csv("data.csv")

In [37]:
df

Unnamed: 0,date,year,mounth,day,weekday,temp,pres,prec,wind,absenteeism
0,20120114,2012,1,14,6,-21,756,0,1.0,4
1,20120116,2012,1,16,1,-19,758,0,2.0,3
2,20120117,2012,1,17,2,-27,770,0,1.0,3
3,20120118,2012,1,18,3,-28,776,0,2.0,6
4,20120119,2012,1,19,4,-27,778,0,2.0,4
...,...,...,...,...,...,...,...,...,...,...
1689,20181225,2018,12,25,2,-23,770,0,1.0,2
1690,20181226,2018,12,26,3,-22,770,0,,1
1691,20181227,2018,12,27,4,-23,770,0,,1
1692,20181228,2018,12,28,5,-25,766,0,,0


In [38]:
X = df.drop(["date", "absenteeism"], axis =1)
X["prec"].value_counts()

0    1457
2     174
1      57
3       6
Name: prec, dtype: int64

In [39]:
X = X.apply(pd.to_numeric).fillna(0).astype("int")

In [40]:
y = df["absenteeism"].astype("int")

In [41]:
y

0       4
1       3
2       3
3       6
4       4
       ..
1689    2
1690    1
1691    1
1692    0
1693    0
Name: absenteeism, Length: 1694, dtype: int64

In [42]:
y.value_counts()

3     479
2     354
4     276
1     206
0     158
5     114
6      58
7      24
8      12
9       9
10      4
Name: absenteeism, dtype: int64

In [43]:
X

Unnamed: 0,year,mounth,day,weekday,temp,pres,prec,wind
0,2012,1,14,6,-21,756,0,1
1,2012,1,16,1,-19,758,0,2
2,2012,1,17,2,-27,770,0,1
3,2012,1,18,3,-28,776,0,2
4,2012,1,19,4,-27,778,0,2
...,...,...,...,...,...,...,...,...
1689,2018,12,25,2,-23,770,0,1
1690,2018,12,26,3,-22,770,0,0
1691,2018,12,27,4,-23,770,0,0
1692,2018,12,28,5,-25,766,0,0


In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

# Дерево

In [45]:
kfold = StratifiedKFold(n_splits=3) # задаем объект для разделения данных на фолды при кросс-валидации

model = DecisionTreeClassifier()

params = {
          'max_depth': range(1,50,2)
         }  # перебираемые параметры модели

grid = GridSearchCV(model, params, cv=kfold, scoring='accuracy', n_jobs=-1)

In [46]:
grid.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
             error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
       

In [47]:
mod = grid.best_estimator_
mod

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [48]:
mod = grid.best_estimator_
y_pred = mod.predict(X_test)
accuracy_score(y_test, y_pred)

0.3713163064833006

In [49]:
y_test.value_counts()

3     144
2     106
4      83
1      62
0      48
5      34
6      17
7       7
8       4
9       3
10      1
Name: absenteeism, dtype: int64

In [50]:
delta = [[i[0], int(i[1]), int(i[1]) - i[0]] for i in list(zip(y_test, y_pred))]
delta

[[1, 1, 0],
 [1, 2, 1],
 [0, 2, 2],
 [1, 3, 2],
 [4, 3, -1],
 [5, 3, -2],
 [2, 3, 1],
 [2, 2, 0],
 [5, 4, -1],
 [0, 0, 0],
 [3, 3, 0],
 [4, 3, -1],
 [4, 3, -1],
 [3, 2, -1],
 [1, 0, -1],
 [3, 1, -2],
 [2, 2, 0],
 [3, 4, 1],
 [4, 4, 0],
 [5, 3, -2],
 [3, 3, 0],
 [1, 3, 2],
 [2, 2, 0],
 [4, 4, 0],
 [1, 2, 1],
 [3, 2, -1],
 [4, 3, -1],
 [1, 3, 2],
 [2, 3, 1],
 [4, 4, 0],
 [3, 2, -1],
 [0, 1, 1],
 [4, 3, -1],
 [1, 1, 0],
 [3, 3, 0],
 [0, 3, 3],
 [2, 3, 1],
 [2, 3, 1],
 [5, 3, -2],
 [8, 5, -3],
 [4, 3, -1],
 [2, 3, 1],
 [5, 3, -2],
 [2, 3, 1],
 [10, 4, -6],
 [4, 4, 0],
 [2, 1, -1],
 [1, 1, 0],
 [2, 3, 1],
 [6, 4, -2],
 [0, 2, 2],
 [4, 3, -1],
 [4, 3, -1],
 [3, 3, 0],
 [1, 2, 1],
 [1, 3, 2],
 [0, 2, 2],
 [2, 2, 0],
 [3, 3, 0],
 [5, 3, -2],
 [3, 1, -2],
 [2, 2, 0],
 [3, 3, 0],
 [2, 1, -1],
 [3, 3, 0],
 [6, 4, -2],
 [2, 2, 0],
 [4, 3, -1],
 [3, 3, 0],
 [4, 6, 2],
 [3, 3, 0],
 [2, 1, -1],
 [3, 1, -2],
 [3, 2, -1],
 [3, 3, 0],
 [1, 3, 2],
 [4, 3, -1],
 [1, 1, 0],
 [2, 2, 0],
 [0, 2, 2],
 [0, 0, 

# Линейная регрессия

In [51]:
from sklearn import preprocessing
from sklearn import metrics
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [52]:
LR = LinearRegression()
model = LR.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [53]:
linear_scoring = model_selection.cross_val_score(model, X_train, y_train, scoring="neg_mean_absolute_error", cv=10)
print(f"{linear_scoring.mean()}, {linear_scoring.std()}")

-1.0477656550682295, 0.05476527583287373


In [54]:
X_train

Unnamed: 0,year,mounth,day,weekday,temp,pres,prec,wind
1171,2017,5,23,2,25,742,0,4
1283,2016,12,24,6,-27,762,0,5
556,2013,9,6,5,16,755,0,2
1639,2018,10,22,1,2,742,0,3
1621,2018,10,1,1,20,743,0,2
...,...,...,...,...,...,...,...,...
450,2014,2,10,1,-18,767,0,1
1079,2017,1,23,1,-2,743,2,7
1375,2018,5,8,2,5,740,0,1
545,2014,6,4,3,14,750,0,1


In [55]:
print('\nIntercept: \n', model.intercept_)
print('\nScore test: \n', model.score(X_test, y_test))
print('\nScore train: \n', model.score(X_train, y_train))
print('\nMean absolute error: \n', metrics.mean_absolute_error(y_test, y_pred))
print('\nMedian absolute error: \n', metrics.median_absolute_error(y_test, y_pred), "\n")
print(*delta, sep="\n")


Intercept: 
 -546.4313632384394

Score test: 
 0.3803433380012685

Score train: 
 0.3737234838743829

Mean absolute error: 
 1.0523879211190956

Median absolute error: 
 0.8883544369018637 

[1, 1, 0]
[1, 2, 1]
[0, 2, 2]
[1, 3, 2]
[4, 3, -1]
[5, 3, -2]
[2, 3, 1]
[2, 2, 0]
[5, 4, -1]
[0, 0, 0]
[3, 3, 0]
[4, 3, -1]
[4, 3, -1]
[3, 2, -1]
[1, 0, -1]
[3, 1, -2]
[2, 2, 0]
[3, 4, 1]
[4, 4, 0]
[5, 3, -2]
[3, 3, 0]
[1, 3, 2]
[2, 2, 0]
[4, 4, 0]
[1, 2, 1]
[3, 2, -1]
[4, 3, -1]
[1, 3, 2]
[2, 3, 1]
[4, 4, 0]
[3, 2, -1]
[0, 1, 1]
[4, 3, -1]
[1, 1, 0]
[3, 3, 0]
[0, 3, 3]
[2, 3, 1]
[2, 3, 1]
[5, 3, -2]
[8, 5, -3]
[4, 3, -1]
[2, 3, 1]
[5, 3, -2]
[2, 3, 1]
[10, 4, -6]
[4, 4, 0]
[2, 1, -1]
[1, 1, 0]
[2, 3, 1]
[6, 4, -2]
[0, 2, 2]
[4, 3, -1]
[4, 3, -1]
[3, 3, 0]
[1, 2, 1]
[1, 3, 2]
[0, 2, 2]
[2, 2, 0]
[3, 3, 0]
[5, 3, -2]
[3, 1, -2]
[2, 2, 0]
[3, 3, 0]
[2, 1, -1]
[3, 3, 0]
[6, 4, -2]
[2, 2, 0]
[4, 3, -1]
[3, 3, 0]
[4, 6, 2]
[3, 3, 0]
[2, 1, -1]
[3, 1, -2]
[3, 2, -1]
[3, 3, 0]
[1, 3, 2]
[4, 3, -1]
[1, 1,

In [56]:
model.predict([[2018, 12, 6, 5, -21, 770, 0, 1]])

array([4.76232799])

In [57]:
df = df[["date", "year", "mounth", "day", "weekday", "temp", "pres", "prec", "wind", "absenteeism"]]

In [58]:
df

Unnamed: 0,date,year,mounth,day,weekday,temp,pres,prec,wind,absenteeism
0,20120114,2012,1,14,6,-21,756,0,1.0,4
1,20120116,2012,1,16,1,-19,758,0,2.0,3
2,20120117,2012,1,17,2,-27,770,0,1.0,3
3,20120118,2012,1,18,3,-28,776,0,2.0,6
4,20120119,2012,1,19,4,-27,778,0,2.0,4
...,...,...,...,...,...,...,...,...,...,...
1689,20181225,2018,12,25,2,-23,770,0,1.0,2
1690,20181226,2018,12,26,3,-22,770,0,,1
1691,20181227,2018,12,27,4,-23,770,0,,1
1692,20181228,2018,12,28,5,-25,766,0,,0


# Нейронка

In [61]:
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.


ModuleNotFoundError: No module named 'tensorflow'