In [152]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [153]:
df = pd.read_csv("data.csv", sep = ";")

In [154]:
df["absenteeism"] = df["percent.1"].apply(lambda x: round(float(x.replace(",", "."))*2/10)/2*10)

In [155]:
df['prec'] = df['prec'].apply({'snow':2, 'rain':1, "storm":3,np.nan : 0}.get) 

In [156]:
df = df.drop(["Unnamed: 0", "percent.1", "percent","percentNormalized"], axis = 1)

In [157]:
df

Unnamed: 0,date,year,mounth,day,absenteeism,weekday,temp,pres,prec,wind
0,20120114,2012,1,14,20.0,6,-21,756,0,1.0
1,20120116,2012,1,16,15.0,1,-19,758,0,2.0
2,20120117,2012,1,17,15.0,2,-27,770,0,1.0
3,20120118,2012,1,18,30.0,3,-28,776,0,2.0
4,20120119,2012,1,19,20.0,4,-27,778,0,2.0
...,...,...,...,...,...,...,...,...,...,...
1689,20181225,2018,12,25,10.0,2,-23,770,0,1.0
1690,20181226,2018,12,26,5.0,3,-22,770,0,
1691,20181227,2018,12,27,5.0,4,-23,770,0,
1692,20181228,2018,12,28,0.0,5,-25,766,0,


In [183]:
X = df.drop(["date", "absenteeism"], axis =1)
X["prec"].value_counts()

0    1457
2     174
1      57
3       6
Name: prec, dtype: int64

In [184]:
X = X.apply(pd.to_numeric).fillna(0).astype("int")

In [185]:
y = df["absenteeism"].astype("int")

In [186]:
y

0       20
1       15
2       15
3       30
4       20
        ..
1689    10
1690     5
1691     5
1692     0
1693     0
Name: absenteeism, Length: 1694, dtype: int64

In [187]:
y.value_counts()

15    479
10    354
20    276
5     206
0     158
25    114
30     58
35     24
40     12
45      9
50      4
Name: absenteeism, dtype: int64

In [188]:
X

Unnamed: 0,year,mounth,day,weekday,temp,pres,prec,wind
0,2012,1,14,6,-21,756,0,1
1,2012,1,16,1,-19,758,0,2
2,2012,1,17,2,-27,770,0,1
3,2012,1,18,3,-28,776,0,2
4,2012,1,19,4,-27,778,0,2
...,...,...,...,...,...,...,...,...
1689,2018,12,25,2,-23,770,0,1
1690,2018,12,26,3,-22,770,0,0
1691,2018,12,27,4,-23,770,0,0
1692,2018,12,28,5,-25,766,0,0


In [189]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

# Дерево

In [190]:
kfold = StratifiedKFold(n_splits=3) # задаем объект для разделения данных на фолды при кросс-валидации

model = DecisionTreeClassifier()

params = {
          'max_depth': range(1,50,2)
         }  # перебираемые параметры модели

grid = GridSearchCV(model, params, cv=kfold, scoring='accuracy', n_jobs=-1)

In [191]:
grid.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
             error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
       

In [192]:
mod = grid.best_estimator_
mod

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=7, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [193]:
mod = grid.best_estimator_
y_test_pred = mod.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.3889980353634578

In [194]:
y_test.value_counts()

15    144
10    106
20     83
5      62
0      48
25     34
30     17
35      7
40      4
45      3
50      1
Name: absenteeism, dtype: int64

In [195]:
delta = [[i[0], int(i[1]), int(i[1]) - i[0]] for i in list(zip(y_test, y_pred))]
delta

[[0, 14, 14],
 [20, 5, -15],
 [5, 14, 9],
 [5, 34, 29],
 [15, 10, -5],
 [5, 10, 5],
 [0, 15, 15],
 [30, 15, -15],
 [15, 24, 9],
 [0, 5, 5],
 [25, 24, -1],
 [0, 10, 10],
 [25, 24, -1],
 [5, 14, 9],
 [0, 0, 0],
 [25, 0, -25],
 [10, 15, 5],
 [50, 10, -40],
 [35, 15, -20],
 [10, 5, -5],
 [0, 15, 15],
 [0, 15, 15],
 [15, 0, -15],
 [10, 15, 5],
 [25, 10, -15],
 [20, 14, -6],
 [15, 14, -1],
 [20, 14, -6],
 [10, 15, 5],
 [5, 24, 19],
 [15, 19, 4],
 [5, 19, 14],
 [10, 19, 9],
 [10, 10, 0],
 [5, 14, 9],
 [20, 10, -10],
 [20, 14, -6],
 [15, 14, -1],
 [15, 10, -5],
 [20, 19, -1],
 [20, 15, -5],
 [0, 0, 0],
 [10, 0, -10],
 [5, 15, 10],
 [5, 10, 5],
 [20, 5, -15],
 [10, 19, 9],
 [25, 10, -15],
 [15, 10, -5],
 [40, 15, -25],
 [15, 10, -5],
 [10, 10, 0],
 [10, 15, 5],
 [40, 5, -35],
 [15, 19, 4],
 [0, 14, 14],
 [20, 14, -6],
 [30, 15, -15],
 [20, 19, -1],
 [10, 34, 24],
 [15, 5, -10],
 [20, 10, -10],
 [15, 29, 14],
 [10, 19, 9],
 [15, 10, -5],
 [25, 10, -15],
 [10, 15, 5],
 [10, 10, 0],
 [20, 5, -15],

# Линейная регрессия

In [196]:
from sklearn import preprocessing
from sklearn import metrics
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [197]:
LR = LinearRegression()
model = LR.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [198]:
linear_scoring = model_selection.cross_val_score(model, X_train, y_train, scoring="neg_mean_absolute_error", cv=10)
print(f"{linear_scoring.mean()}, {linear_scoring.std()}")

-5.239058018740312, 0.3518288131984265


In [199]:
X_train

Unnamed: 0,year,mounth,day,weekday,temp,pres,prec,wind
327,2013,6,7,5,20,754,0,2
493,2014,4,2,3,16,750,0,3
726,2015,4,23,4,11,743,0,1
1172,2017,5,24,3,23,744,0,5
1549,2019,3,26,2,4,762,0,3
...,...,...,...,...,...,...,...,...
1482,2017,12,22,5,-12,757,0,1
367,2012,10,15,1,7,761,0,2
147,2011,11,2,3,-4,756,0,8
1567,2019,4,24,3,12,749,0,1


In [203]:
delta = [[i[0], int(i[1]), int(i[1]) - i[0]] for i in list(zip(y_test,  round(x*2/10)/2*10))]

print('\nIntercept: \n', model.intercept_)
print('\nScore test: \n', model.score(X_test, y_test))
print('\nScore train: \n', model.score(X_train, y_train))
print('\nMean absolute error: \n', metrics.mean_absolute_error(y_test, y_pred))
print('\nMedian absolute error: \n', metrics.median_absolute_error(y_test, y_pred), "\n")
print(*delta, sep="\n")

AttributeError: 'numpy.ndarray' object has no attribute 'apply'

In [202]:
model.predict([[2018, 12, 6, 5, -21, 770, 0, 1]])

array([23.73030053])