In [57]:
def SAVE_TO_FILE(df_, modelName, path):
    """Write ``df_`` as a CSV submission file named after the model.

    The target is ``path`` followed directly by
    ``mmingalov_kaggle_math_exam(<modelName>).csv``; no separator is inserted,
    so ``path`` has to end with one. The index is not written.

    Args:
        df_: object exposing ``to_csv(target, index=...)``.
        modelName: string placed between the parentheses of the file name.
        path: string prefix (directory including its trailing separator).
    """
    file_name = 'mmingalov_kaggle_math_exam(' + modelName + ').csv'
    target = path + file_name
    df_.to_csv(target, index=False)

In [31]:
def calc_norm_fit(x):
    """Min-max scale ``x`` to the [0, 1] range using its own minimum and maximum.

    The statistics come from ``x.min()`` / ``x.max()``, so whatever those
    reduce over for the given type (e.g. per column for a DataFrame) is the
    unit that gets scaled.

    A zero range (constant input or constant column) would make the division
    0/0 and produce NaN; such values are mapped to 0 instead.

    Args:
        x: object supporting ``min()``, ``max()`` and element-wise arithmetic.

    Returns:
        The scaled values, of the type produced by ``(x - min) / range``.
    """
    lower = x.min()
    span = x.max() - lower
    # `span == 0` is True (== 1) exactly where the range is empty, so adding it
    # turns a zero divisor into 1 and leaves every other range untouched.
    span = span + (span == 0)
    return (x - lower) / span

In [32]:
import matplotlib.pyplot as plt
%matplotlib inline
import random

from matplotlib.colors import ListedColormap
from sklearn import datasets

import numpy as np
import pandas as pd

In [2]:
class Node:
    """Internal decision node: a threshold test on a single feature.

    Attributes are plain storage; nothing is validated.
    """

    def __init__(self, index, t, true_branch, false_branch):
        # index: feature index compared against the threshold in this node
        # t: threshold value
        self.index, self.t = index, t
        # true_branch: subtree for objects that satisfy the node condition
        # false_branch: subtree for objects that do not satisfy it
        self.true_branch, self.false_branch = true_branch, false_branch
class Leaf:
    """Terminal node that stores its samples and answers with their mean label."""

    def __init__(self, data, labels):
        # labels are the true targets (y_true) of the samples in this leaf
        self.data, self.labels = data, labels
        # the answer (y_pred) is computed once, when the leaf is created
        self.prediction = self.predict()

    def predict(self):
        """Return the mean of the stored labels."""
        return np.mean(self.labels)

    def predict_dispersion(self):
        """Return the sample variance (ddof=1) of the stored labels."""
        return np.var(self.labels, ddof=1)

    def predict_std(self):
        """Return the sample standard deviation (ddof=1) of the stored labels."""
        return np.std(self.labels, ddof=1)

In [4]:
class Tree:
    """Regression tree grown greedily by variance reduction.

    Internal nodes are ``Node`` objects and terminal nodes are ``Leaf`` objects;
    both classes are defined elsewhere in this notebook. ``data`` is indexed as
    ``data[:, j]`` and ``labels`` with index arrays, so both are expected to be
    NumPy arrays (``data`` two-dimensional).
    """

    def __init__(self, max_depth=10, max_leaf_qty=1024):
        # max_depth is forwarded to build_tree by fit().
        self.max_depth = max_depth
        # Stored only: no method of this class reads max_leaf_qty.
        self.max_leaf_qty = max_leaf_qty
        # Root of the fitted tree (Node or Leaf); None until fit() is called.
        self.tree = None

    def dispersion(self, labels):
        """Return the sample variance (ddof=1) of ``labels``."""
        return np.var(labels, ddof=1)

    def standard_deviation(self, labels):
        """Return the sample standard deviation (ddof=1) of ``labels``."""
        return np.std(labels, ddof=1)

    def quality(self, left_labels, right_labels, current_dispersion):
        """Return the variance reduction achieved by a split.

        Computed as the parent variance minus the size-weighted variances of
        the two children.
        """
        n_left = left_labels.shape[0]
        p = float(n_left) / (n_left + right_labels.shape[0])
        return (current_dispersion
                - p * self.dispersion(left_labels)
                - (1 - p) * self.dispersion(right_labels))

    def split(self, data, labels, index, t):
        """Partition samples by the test ``data[:, index] <= t``.

        Returns:
            ``(true_data, false_data, true_labels, false_labels)`` where the
            "true" part satisfies ``<= t`` and the "false" part satisfies ``> t``.
        """
        column = data[:, index]
        # Two explicit masks (not a negation) so that values comparing False
        # both ways, such as NaN, end up in neither part.
        goes_left = column <= t
        goes_right = column > t
        return data[goes_left], data[goes_right], labels[goes_left], labels[goes_right]

    def find_best_split(self, data, labels, min_leaf=5):
        """Search every feature and every observed value for the best threshold.

        Args:
            data: 2-D array of samples.
            labels: 1-D array of targets aligned with ``data``.
            min_leaf: minimum number of samples each side of a split must keep;
                candidate splits leaving fewer on either side are skipped.

        Returns:
            ``(best_quality, best_t, best_index)``. When no admissible split
            improves on zero gain this is ``(0, None, None)``.
        """
        current_dispersion = self.dispersion(labels)

        best_quality = 0
        best_t = None
        best_index = None

        for idx in range(data.shape[1]):
            # Candidate thresholds are the unique values present in the column.
            for t in np.unique(data[:, idx]):
                true_data, false_data, true_labels, false_labels = self.split(data, labels, idx, t)
                if len(true_data) < min_leaf or len(false_data) < min_leaf:
                    continue

                current_quality = self.quality(true_labels, false_labels, current_dispersion)

                # Keep the threshold giving the largest gain (strictly better only).
                if current_quality > best_quality:
                    best_quality, best_t, best_index = current_quality, t, idx

        return best_quality, best_t, best_index

    def build_tree(self, data, labels, tree_depth, max_depth=5, max_leaf_qty=32, min_per_leaf=5, classes_per_leaf=1, min_quality_gain=1e-4):
        """Recursively grow the (sub)tree for the given samples.

        A ``Leaf`` is returned when any stopping rule holds:
        ``tree_depth >= max_depth``, at most ``min_per_leaf`` samples, at most
        ``classes_per_leaf`` distinct label values, or no split with positive
        gain. Otherwise a ``Node`` with two recursively built branches is
        returned. All hyper-parameters are forwarded unchanged to the children.

        ``max_leaf_qty`` and ``min_quality_gain`` are accepted and forwarded but
        are not consulted by any stopping rule.
        """
        # Every stopping rule yields the same Leaf, so the cheap ones run first
        # and the exhaustive split search is skipped whenever they already apply.
        if (tree_depth >= max_depth
                or data.shape[0] <= min_per_leaf
                or len(set(labels)) <= classes_per_leaf):
            return Leaf(data, labels)

        quality, t, index = self.find_best_split(data, labels)

        # Base case: no split improves quality.
        if quality == 0:
            return Leaf(data, labels)

        true_data, false_data, true_labels, false_labels = self.split(data, labels, index, t)

        # Children are one level deeper and inherit every hyper-parameter.
        child_args = dict(
            max_depth=max_depth,
            max_leaf_qty=max_leaf_qty,
            min_per_leaf=min_per_leaf,
            classes_per_leaf=classes_per_leaf,
            min_quality_gain=min_quality_gain,
        )
        true_branch = self.build_tree(true_data, true_labels, tree_depth + 1, **child_args)
        false_branch = self.build_tree(false_data, false_labels, tree_depth + 1, **child_args)

        return Node(index, t, true_branch, false_branch)

    def predict_object(self, obj, node):
        """Return the prediction of the leaf that ``obj`` reaches from ``node``."""
        # Walk down until a leaf is reached: "<= threshold" follows the true branch.
        while not isinstance(node, Leaf):
            node = node.true_branch if obj[node.index] <= node.t else node.false_branch
        return node.prediction

    def predict(self, data):
        """Return a list with one prediction per row of ``data``."""
        return [self.predict_object(obj, self.tree) for obj in data]

    def fit(self, data, labels):
        """Build the tree from ``data``/``labels`` with ``self.max_depth``; return ``self``."""
        self.tree = self.build_tree(data, labels, 0, self.max_depth)
        return self

In [None]:
### --------------------------------

In [33]:
# NOTE(review): absolute, machine-specific Windows path; the notebook only runs where this folder exists.
PATH_FOLDER = 'D:\\Cloud\\Git\\geekbrains-data-analysis-alg\\tutors-expected-math-exam-results\\'
train = pd.read_csv(PATH_FOLDER + 'train.csv')
test = pd.read_csv(PATH_FOLDER + 'test.csv')
# Columns used as model inputs.
features = ['age','years_of_experience','lesson_price','qualification','physics',
            'chemistry','biology','english','geography','history']

X = np.array(calc_norm_fit(train[features]))# select the feature columns and min-max normalize them
#train_f = calc_std_fit(train[features])# alternative: select the feature columns and standardize them
y = np.array(train['mean_exam_points'])

# Aliases used by the following cells (the target is a numeric score despite the "classification" names).
classification_data = X
classification_labels = y

In [45]:
from sklearn import model_selection
# Hold out 30% of the rows for validation; random_state=1 keeps the split repeatable.
train_data, valid_data, train_labels, valid_labels = model_selection.train_test_split(classification_data, 
                                                                                     classification_labels, 
                                                                                     test_size = 0.3,
                                                                                     random_state = 1)

In [46]:
# Fit the custom regression tree with its constructor defaults (max_depth=10).
tree = Tree()
tree.fit(train_data, train_labels)

<__main__.Tree at 0x25cd62ed948>

In [47]:
# Predictions for both splits, used for the R^2 scores below.
y_train_pred = tree.predict(train_data)
y_valid_pred = tree.predict(valid_data)

In [48]:
from sklearn.metrics import r2_score

# R^2 (coefficient of determination) on the training split.
coefficient_of_dermination = r2_score(train_labels, y_train_pred)
coefficient_of_dermination

0.8187033745197206

In [49]:
# R^2 on the held-out validation split (overwrites the training value stored under the same name).
coefficient_of_dermination = r2_score(valid_labels, y_valid_pred)
coefficient_of_dermination

0.7492276782067132

In [50]:
# NOTE(review): calc_norm_fit scales by the min/max of the frame it is given, so the test
# features are scaled with test statistics, not the training ones the tree was fitted on.
X_test = np.array(calc_norm_fit(test[features]))# select the feature columns and min-max normalize them

In [53]:
%%time
# Predictions for the final test set that has to be scored
y_test_pred = tree.predict(X_test)
#y_test_pred

Wall time: 202 ms


In [54]:
# Assemble the submission table: one predicted score per test row, with Ids
# numbered consecutively from 10000, and the Id column placed first.
df = pd.DataFrame(y_test_pred, columns=['mean_exam_points'])
df['Id'] = range(10000, 10000 + len(df))
df_save = df[['Id', 'mean_exam_points']]

In [55]:
# Display the submission frame (Id, mean_exam_points) for a visual check.
df_save

Unnamed: 0,Id,mean_exam_points
0,10000,55.329545
1,10001,65.894737
2,10002,51.333333
3,10003,89.333333
4,10004,89.058824
...,...,...
9995,19995,39.045455
9996,19996,79.750000
9997,19997,53.614286
9998,19998,64.586777


In [58]:
# Write the submission CSV into PATH_FOLDER; the model name becomes part of the file name.
SAVE_TO_FILE(df_save,'DecisionTreeRegr', PATH_FOLDER)

In [None]:
# Reloads the raw train/test CSVs from the same hard-coded folder as the first load cell
# and adds a constant column 'ones' to each frame.
# NOTE(review): this cell has no execution count (In [None]); when it is not run, the
# following cells keep working on the frames loaded earlier.
PATH_FOLDER = 'D:\\Cloud\\Git\\geekbrains-data-analysis-alg\\tutors-expected-math-exam-results\\'
train = pd.read_csv(PATH_FOLDER + 'train.csv')
test = pd.read_csv(PATH_FOLDER + 'test.csv')
train['ones'] = 1 #
test['ones'] = 1 #

In [59]:
# Interquartile-range (IQR) outlier handling
def IQ_processing(df_, list_):
    """Replace out-of-range values in the given columns with the column median, in place.

    For each column name in ``list_`` the bounds and the median are taken from
    ``IQ_param_get``. Values below ``low_border`` or above ``high_border`` are
    overwritten with ``median``. The parameters, and the number of values still
    outside the bounds after replacement, are printed.

    Args:
        df_: DataFrame to clean; it is modified in place.
        list_: iterable of column names to process.

    Returns:
        None.
    """
    for column in list_:
        params = IQ_param_get(df_, column)
        print(column, params)

        low, high = params['low_border'], params['high_border']
        # One mask covers both tails; the replacement value is the same for each.
        out_of_range = (df_[column] < low) | (df_[column] > high)
        df_.loc[out_of_range, column] = params['median']

        # Re-evaluate against the same bounds to report what is left outside them.
        still_outside = (df_[column] < low) | (df_[column] > high)
        print('count after procesing:', df_.loc[still_outside, column].count())

def IQ_param_get(df_, column_):
    """Compute interquartile-range outlier parameters for one column.

    Args:
        df_: DataFrame holding the column.
        column_: name of the column to analyse.

    Returns:
        dict with keys
        ``'IQ'`` (75th minus 25th percentile),
        ``'low_border'`` (25th percentile - 1.5 * IQ),
        ``'high_border'`` (75th percentile + 1.5 * IQ),
        ``'count'`` (number of non-null values outside the two borders) and
        ``'median'`` (column median).
    """
    values = df_[column_]

    # describe() is evaluated once and reused for both quartiles.
    summary = values.describe()
    q1, q3 = summary['25%'], summary['75%']

    iq = q3 - q1
    low_border = q1 - iq * 1.5
    high_border = q3 + iq * 1.5

    outside = (values < low_border) | (values > high_border)

    return {
        'IQ': iq,
        'low_border': low_border,
        'high_border': high_border,
        'count': values[outside].count(),
        'median': values.median(),
    }

In [60]:
# Interquartile-range outlier replacement on the training frame (modifies `train` in place).
# NOTE(review): the variable name `list` shadows the Python builtin from this cell onwards.
list = ['lesson_price','age']
print('processing dataframe train')
IQ_processing(train, list)

processing dataframe train
lesson_price {'IQ': 850.0, 'low_border': 25.0, 'high_border': 3425.0, 'count': 25, 'median': 1500.0}
count after procesing: 0
age {'IQ': 11.0, 'low_border': 23.5, 'high_border': 67.5, 'count': 77, 'median': 46.0}
count after procesing: 0


In [61]:
# Rebuild X and y from `train`, which IQ_processing has modified in place
# (out-of-range lesson_price and age values replaced by the median).
X = np.array(calc_norm_fit(train[features]))# select the feature columns and min-max normalize them
#train_f = calc_std_fit(train[features])# alternative: select the feature columns and standardize them
y = np.array(train['mean_exam_points'])

# Aliases used by the following cells.
classification_data = X
classification_labels = y

In [62]:
from sklearn import model_selection
# Same 70/30 split and random_state as in the first run, now on the outlier-processed data.
train_data, valid_data, train_labels, valid_labels = model_selection.train_test_split(classification_data, 
                                                                                     classification_labels, 
                                                                                     test_size = 0.3,
                                                                                     random_state = 1)

In [63]:
# Refit a fresh default tree; this rebinds `tree`, replacing the model from the first run.
tree = Tree()
tree.fit(train_data, train_labels)

<__main__.Tree at 0x25cd732e788>

In [64]:
# Predictions of the refitted tree for both splits.
y_train_pred = tree.predict(train_data)
y_valid_pred = tree.predict(valid_data)

In [65]:
from sklearn.metrics import r2_score

# R^2 on the training split after outlier processing.
coefficient_of_dermination = r2_score(train_labels, y_train_pred)
coefficient_of_dermination

0.8158121115627444

In [66]:
# R^2 on the validation split after outlier processing.
coefficient_of_dermination = r2_score(valid_labels, y_valid_pred)
coefficient_of_dermination

0.7446713164004017