In [152]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [153]:
df = pd.read_csv("data.csv", sep = ";")

In [154]:
df["absenteeism"] = df["percent.1"].apply(lambda x: round(float(x.replace(",", "."))*2/10)/2*10)

In [155]:
df['prec'] = df['prec'].apply({'snow':2, 'rain':1, "storm":3,np.nan : 0}.get) 

In [156]:
df = df.drop(["Unnamed: 0", "percent.1", "percent","percentNormalized"], axis = 1)

In [157]:
df

Unnamed: 0,date,year,mounth,day,absenteeism,weekday,temp,pres,prec,wind
0,20120114,2012,1,14,20.0,6,-21,756,0,1.0
1,20120116,2012,1,16,15.0,1,-19,758,0,2.0
2,20120117,2012,1,17,15.0,2,-27,770,0,1.0
3,20120118,2012,1,18,30.0,3,-28,776,0,2.0
4,20120119,2012,1,19,20.0,4,-27,778,0,2.0
...,...,...,...,...,...,...,...,...,...,...
1689,20181225,2018,12,25,10.0,2,-23,770,0,1.0
1690,20181226,2018,12,26,5.0,3,-22,770,0,
1691,20181227,2018,12,27,5.0,4,-23,770,0,
1692,20181228,2018,12,28,0.0,5,-25,766,0,


In [158]:
X = df.drop(["date"], axis =1)
X["prec"].value_counts()

0    1457
2     174
1      57
3       6
Name: prec, dtype: int64

In [159]:
X = X.apply(pd.to_numeric).fillna(0).astype("int")

In [160]:
y = df["absenteeism"].astype("int")

In [161]:
y

0       20
1       15
2       15
3       30
4       20
        ..
1689    10
1690     5
1691     5
1692     0
1693     0
Name: absenteeism, Length: 1694, dtype: int64

In [162]:
y.value_counts()

15    479
10    354
20    276
5     206
0     158
25    114
30     58
35     24
40     12
45      9
50      4
Name: absenteeism, dtype: int64

In [163]:
X

Unnamed: 0,year,mounth,day,absenteeism,weekday,temp,pres,prec,wind
0,2012,1,14,20,6,-21,756,0,1
1,2012,1,16,15,1,-19,758,0,2
2,2012,1,17,15,2,-27,770,0,1
3,2012,1,18,30,3,-28,776,0,2
4,2012,1,19,20,4,-27,778,0,2
...,...,...,...,...,...,...,...,...,...
1689,2018,12,25,10,2,-23,770,0,1
1690,2018,12,26,5,3,-22,770,0,0
1691,2018,12,27,5,4,-23,770,0,0
1692,2018,12,28,0,5,-25,766,0,0


In [164]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

# Дерево

In [165]:
kfold = StratifiedKFold(n_splits=3) # задаем объект для разделения данных на фолды при кросс-валидации

model = DecisionTreeClassifier()

params = {
          'max_depth': range(1,50,2)
         }  # перебираемые параметры модели

grid = GridSearchCV(model, params, cv=kfold, scoring='accuracy', n_jobs=-1)

In [166]:
grid.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
             error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
       

In [167]:
mod = grid.best_estimator_
mod

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=11, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [168]:
mod = grid.best_estimator_
y_test_pred = mod.predict(X_test)
accuracy_score(y_test, y_test_pred)

1.0

In [169]:
y_test.value_counts()

15    144
10    106
20     83
5      62
0      48
25     34
30     17
35      7
40      4
45      3
50      1
Name: absenteeism, dtype: int64

In [170]:
print(*list(zip(y_test, y_test_pred)), sep="\n")

(15, 15)
(5, 5)
(15, 15)
(35, 35)
(10, 10)
(10, 10)
(15, 15)
(15, 15)
(25, 25)
(5, 5)
(25, 25)
(10, 10)
(25, 25)
(15, 15)
(0, 0)
(0, 0)
(15, 15)
(10, 10)
(15, 15)
(5, 5)
(15, 15)
(15, 15)
(0, 0)
(15, 15)
(10, 10)
(15, 15)
(15, 15)
(15, 15)
(15, 15)
(25, 25)
(20, 20)
(20, 20)
(20, 20)
(10, 10)
(15, 15)
(10, 10)
(15, 15)
(15, 15)
(10, 10)
(20, 20)
(15, 15)
(0, 0)
(0, 0)
(15, 15)
(10, 10)
(5, 5)
(20, 20)
(10, 10)
(10, 10)
(15, 15)
(10, 10)
(10, 10)
(15, 15)
(5, 5)
(20, 20)
(15, 15)
(15, 15)
(15, 15)
(20, 20)
(35, 35)
(5, 5)
(10, 10)
(30, 30)
(20, 20)
(10, 10)
(10, 10)
(15, 15)
(10, 10)
(5, 5)
(15, 15)
(15, 15)
(20, 20)
(5, 5)
(10, 10)
(10, 10)
(15, 15)
(20, 20)
(20, 20)
(10, 10)
(15, 15)
(15, 15)
(10, 10)
(15, 15)
(25, 25)
(20, 20)
(20, 20)
(5, 5)
(30, 30)
(15, 15)
(25, 25)
(15, 15)
(0, 0)
(20, 20)
(0, 0)
(15, 15)
(20, 20)
(5, 5)
(25, 25)
(20, 20)
(20, 20)
(15, 15)
(15, 15)
(5, 5)
(20, 20)
(25, 25)
(0, 0)
(10, 10)
(10, 10)
(15, 15)
(15, 15)
(20, 20)
(10, 10)
(15, 15)
(15, 15)
(5, 5)
(5, 5

# Линейная регрессия

In [171]:
from sklearn import preprocessing
from sklearn import metrics
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [172]:
LR = LinearRegression()
model = LR.fit(X_train, y_train)
model.coef_

array([ 9.28018024e-16,  4.44089210e-16, -3.46944695e-18,  1.00000000e+00,
       -2.16840434e-17,  2.08166817e-17,  0.00000000e+00,  7.28583860e-17,
        1.66533454e-16])