In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [4]:
df = pd.read_csv("data.csv", sep = ";")

In [5]:
df["absenteeism"] = df["percent.1"].apply(lambda x: round(float(x.replace(",", "."))*2/10)/2*10/5)

In [6]:
df['prec'] = df['prec'].apply({'snow':2, 'rain':1, "storm":3,np.nan : 0}.get) 

In [7]:
df = df.drop(["Unnamed: 0", "percent.1", "percent","percentNormalized"], axis = 1)

In [8]:
df

Unnamed: 0,date,year,mounth,day,absenteeism,weekday,temp,pres,prec,wind
0,20120114,2012,1,14,4.0,6,-21,756,0,1.0
1,20120116,2012,1,16,3.0,1,-19,758,0,2.0
2,20120117,2012,1,17,3.0,2,-27,770,0,1.0
3,20120118,2012,1,18,6.0,3,-28,776,0,2.0
4,20120119,2012,1,19,4.0,4,-27,778,0,2.0
...,...,...,...,...,...,...,...,...,...,...
1689,20181225,2018,12,25,2.0,2,-23,770,0,1.0
1690,20181226,2018,12,26,1.0,3,-22,770,0,
1691,20181227,2018,12,27,1.0,4,-23,770,0,
1692,20181228,2018,12,28,0.0,5,-25,766,0,


In [9]:
X = df.drop(["date", "absenteeism"], axis =1)
X["prec"].value_counts()

0    1457
2     174
1      57
3       6
Name: prec, dtype: int64

In [10]:
X = X.apply(pd.to_numeric).fillna(0).astype("int")

In [11]:
y = df["absenteeism"].astype("int")

In [12]:
y

0       4
1       3
2       3
3       6
4       4
       ..
1689    2
1690    1
1691    1
1692    0
1693    0
Name: absenteeism, Length: 1694, dtype: int64

In [13]:
y.value_counts()

3     479
2     354
4     276
1     206
0     158
5     114
6      58
7      24
8      12
9       9
10      4
Name: absenteeism, dtype: int64

In [14]:
X

Unnamed: 0,year,mounth,day,weekday,temp,pres,prec,wind
0,2012,1,14,6,-21,756,0,1
1,2012,1,16,1,-19,758,0,2
2,2012,1,17,2,-27,770,0,1
3,2012,1,18,3,-28,776,0,2
4,2012,1,19,4,-27,778,0,2
...,...,...,...,...,...,...,...,...
1689,2018,12,25,2,-23,770,0,1
1690,2018,12,26,3,-22,770,0,0
1691,2018,12,27,4,-23,770,0,0
1692,2018,12,28,5,-25,766,0,0


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

# Дерево

In [16]:
kfold = StratifiedKFold(n_splits=3) # задаем объект для разделения данных на фолды при кросс-валидации

model = DecisionTreeClassifier()

params = {
          'max_depth': range(1,50,2)
         }  # перебираемые параметры модели

grid = GridSearchCV(model, params, cv=kfold, scoring='accuracy', n_jobs=-1)

In [17]:
grid.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
             error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
       

In [18]:
mod = grid.best_estimator_
mod

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=7, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [19]:
mod = grid.best_estimator_
y_test_pred = mod.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.4086444007858546

In [20]:
y_test.value_counts()

3     144
2     106
4      83
1      62
0      48
5      34
6      17
7       7
8       4
9       3
10      1
Name: absenteeism, dtype: int64

In [21]:
delta = [[i[0], int(i[1]), int(i[1]) - i[0]] for i in list(zip(y_test, y_pred))]
delta

NameError: name 'y_pred' is not defined

# Линейная регрессия

In [22]:
from sklearn import preprocessing
from sklearn import metrics
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [23]:
LR = LinearRegression()
model = LR.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [24]:
linear_scoring = model_selection.cross_val_score(model, X_train, y_train, scoring="neg_mean_absolute_error", cv=10)
print(f"{linear_scoring.mean()}, {linear_scoring.std()}")

-1.0781856907500331, 0.09176339985363141


In [25]:
X_train

Unnamed: 0,year,mounth,day,weekday,temp,pres,prec,wind
1513,2019,2,5,2,-24,761,0,1
532,2014,5,20,2,14,745,0,4
1623,2018,10,3,3,8,755,0,2
1118,2017,3,11,6,-6,765,0,1
1226,2016,10,12,3,2,752,0,1
...,...,...,...,...,...,...,...,...
1295,2018,1,20,6,-28,767,0,2
1577,2019,5,7,2,21,757,0,2
638,2013,12,18,3,-10,760,0,5
544,2014,6,3,2,12,752,0,7


In [29]:
delta = [[i[0], int(i[1]), int(i[1]) - i[0]] for i in list(zip(y_test, y_pred))]

print('\nIntercept: \n', model.intercept_)
print('\nScore test: \n', model.score(X_test, y_test))
print('\nScore train: \n', model.score(X_train, y_train))
print('\nMean absolute error: \n', metrics.mean_absolute_error(y_test, y_pred))
print('\nMedian absolute error: \n', metrics.median_absolute_error(y_test, y_pred), "\n")
print(*delta, sep="\n")


Intercept: 
 -565.5421138637852

Score test: 
 0.3880968470240362

Score train: 
 0.3697129125168067

Mean absolute error: 
 1.004525790799306

Median absolute error: 
 0.7736762336461425 

[2, 2, 0]
[0, 1, 1]
[0, 2, 2]
[3, 4, 1]
[5, 4, -1]
[3, 1, -2]
[3, 3, 0]
[4, 4, 0]
[3, 3, 0]
[2, 0, -2]
[2, 1, -1]
[5, 4, -1]
[2, 4, 2]
[4, 4, 0]
[0, 2, 2]
[2, 1, -1]
[2, 3, 1]
[3, 2, -1]
[0, 2, 2]
[3, 1, -2]
[3, 3, 0]
[2, 1, -1]
[2, 4, 2]
[7, 4, -3]
[3, 2, -1]
[4, 2, -2]
[3, 2, -1]
[7, 3, -4]
[3, 2, -1]
[1, 0, -1]
[2, 2, 0]
[2, 3, 1]
[2, 1, -1]
[4, 1, -3]
[3, 2, -1]
[2, 3, 1]
[2, 3, 1]
[1, 3, 2]
[2, 1, -1]
[4, 3, -1]
[3, 2, -1]
[8, 5, -3]
[0, 0, 0]
[3, 3, 0]
[4, 3, -1]
[4, 4, 0]
[3, 4, 1]
[2, 1, -1]
[3, 3, 0]
[3, 3, 0]
[2, 2, 0]
[2, 4, 2]
[0, 1, 1]
[4, 2, -2]
[2, 1, -1]
[1, 2, 1]
[2, 2, 0]
[1, 1, 0]
[1, 1, 0]
[3, 3, 0]
[2, 1, -1]
[0, 1, 1]
[3, 3, 0]
[4, 4, 0]
[2, 1, -1]
[6, 4, -2]
[4, 3, -1]
[2, 3, 1]
[9, 6, -3]
[1, 0, -1]
[4, 4, 0]
[0, 2, 2]
[3, 4, 1]
[2, 3, 1]
[4, 4, 0]
[4, 1, -3]
[3, 3, 0]
[3, 2

In [32]:
model.predict([[2018, 12, 6, 5, -21, 770, 0, 1]])

array([4.74838129])