In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score 
from sklearn.model_selection import cross_val_score, cross_validate, KFold, LeaveOneOut, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import matplotlib.pyplot as plt

In [2]:
# Load the pulsar candidates table (comma-separated, pandas' default delimiter).
data = pd.read_csv('pulsar_stars.csv')

# Feature matrix: the eight descriptor columns. Their names start with a space,
# exactly as they are spelled in the loaded frame.
DataParam = data[[
    ' Mean of the integrated profile',
    ' Standard deviation of the integrated profile',
    ' Excess kurtosis of the integrated profile',
    ' Skewness of the integrated profile',
    ' Mean of the DM-SNR curve',
    ' Standard deviation of the DM-SNR curve',
    ' Excess kurtosis of the DM-SNR curve',
    ' Skewness of the DM-SNR curve',
]]

# Target vector: the class label column.
DataAnsw = data['target_class']

# Display the dtype of every column as the cell output.
data.dtypes

 Mean of the integrated profile                  float64
 Standard deviation of the integrated profile    float64
 Excess kurtosis of the integrated profile       float64
 Skewness of the integrated profile              float64
 Mean of the DM-SNR curve                        float64
 Standard deviation of the DM-SNR curve          float64
 Excess kurtosis of the DM-SNR curve             float64
 Skewness of the DM-SNR curve                    float64
target_class                                       int64
dtype: object

In [3]:
# Preview the first five rows of the loaded frame.
data.head(5)

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [4]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

 Mean of the integrated profile                  0
 Standard deviation of the integrated profile    0
 Excess kurtosis of the integrated profile       0
 Skewness of the integrated profile              0
 Mean of the DM-SNR curve                        0
 Standard deviation of the DM-SNR curve          0
 Excess kurtosis of the DM-SNR curve             0
 Skewness of the DM-SNR curve                    0
target_class                                     0
dtype: int64

In [5]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
TrainX, TestX, TrainY, TestY = train_test_split(
    DataParam,
    DataAnsw,
    test_size=0.3,
    random_state=1,
)

In [6]:
def GraphBuilder(ArrX,ArrY,Title,LabelX,LabelY):
    """Draw a single line plot of ``ArrY`` against ``ArrX`` and show it.

    Args:
        ArrX: x coordinates of the curve.
        ArrY: y coordinates of the curve, one per element of ``ArrX``.
        Title: text placed above the plot.
        LabelX: x-axis label.
        LabelY: y-axis label.

    Returns:
        None. The figure is rendered through ``plt.show()``.
    """
    # A regular subplot keeps margins around the axes. The previous
    # add_axes([0, 0, 1, 1]) call stretched the axes over the whole figure,
    # leaving no room inside the canvas for the title, ticks and axis labels.
    fig, ax = plt.subplots()
    ax.plot(ArrX, ArrY)
    ax.grid(True)
    ax.set_title(Title, fontsize=16)
    ax.set_ylabel(LabelY, fontsize=12)
    ax.set_xlabel(LabelX, fontsize=12)
    plt.show()

In [7]:
def RegGridSearchCV(model, NEstimatorsTP, dataParam, dataAnsw, cv=10):
    """Cross-validate ``model`` for every candidate number of estimators.

    Args:
        model: estimator class (not an instance); it is called as
            ``model(n_estimators=..., random_state=0)``.
        NEstimatorsTP: iterable of ``n_estimators`` values to evaluate.
        dataParam: feature matrix passed to the cross-validation.
        dataAnsw: target values passed to the cross-validation.
        cv: cross-validation setting forwarded to ``cross_validate``;
            defaults to 10, the value that was previously hard-coded.

    Returns:
        Three lists aligned with ``NEstimatorsTP``: the mean absolute error,
        the mean squared error and the mean R^2 over the folds.
    """
    Scorers = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']
    MAE_List = []
    MSE_List = []
    R2_List = []
    for param in NEstimatorsTP:
        ModelGS = model(n_estimators=param, random_state = 0)
        # Score all three metrics in a single cross-validation run instead of
        # refitting every fold once per metric.
        Scores = cross_validate(ModelGS, dataParam, dataAnsw, cv=cv, scoring=Scorers)
        # The "neg_*" scorers return negated errors; flip the sign back.
        MAE_List.append(float(-Scores['test_neg_mean_absolute_error'].mean()))
        MSE_List.append(float(-Scores['test_neg_mean_squared_error'].mean()))
        # R^2 can legitimately be negative (worse than predicting the mean),
        # so it is reported as-is rather than through abs().
        R2_List.append(float(Scores['test_r2'].mean()))
    return MAE_List, MSE_List, R2_List

<a href='https://ru.wikipedia.org/wiki/Random_forest'><h1>Случайный лес</h1></a>

In [8]:
# Baseline forest with 5 trees. The fixed seed makes the metrics printed below
# repeatable across kernel restarts.
RandForest5 = RandomForestRegressor(n_estimators=5, random_state=0)
RandForest5.fit(TrainX,TrainY)
RFPredict5 = RandForest5.predict(TestX)
# Report the errors on the held-out split, rounded to 6 decimals.
print("Средняя абсолютная ошибка:\t",round(mean_absolute_error(TestY, RFPredict5),6))
print("Средняя квадратичная ошибка:\t",round(mean_squared_error(TestY, RFPredict5),6))
print("Коэффициент детерминации:\t",round(r2_score(TestY, RFPredict5),6))

Средняя абсолютная ошибка:	 0.031061
Средняя квадратичная ошибка:	 0.019471
Коэффициент детерминации:	 0.765186


In [10]:
# Candidate forest sizes: 1..49 trees.
RFNEstimatorsTP = range(1,50)
# Tune on the training split only, so the held-out test rows used for the final
# metrics do not influence the choice of n_estimators.
# The three lists hold the cross-validated MAE, MSE and R^2 per candidate.
RFDeviationList,RFKurtosisList,RFSkewnessList = RegGridSearchCV(RandomForestRegressor,RFNEstimatorsTP,TrainX,TrainY)

KeyboardInterrupt: 

In [None]:
# Plot each cross-validated metric against the number of trees. Titles and
# y-labels name the metric that is actually plotted (MAE, MSE, R^2) rather than
# dataset feature names.
GraphBuilder(RFNEstimatorsTP, RFDeviationList,'Зависимость средней абсолютной ошибки от числа деревьев','Число деревьев','Средняя абсолютная ошибка')
GraphBuilder(RFNEstimatorsTP, RFKurtosisList,'Зависимость средней квадратичной ошибки от числа деревьев','Число деревьев','Средняя квадратичная ошибка')
GraphBuilder(RFNEstimatorsTP, RFSkewnessList,'Зависимость коэффициента детерминации от числа деревьев','Число деревьев','Коэффициент детерминации')

In [None]:
# Zoomed view of the same curves. The slice starts at index 29 and runs to the
# end; with RFNEstimatorsTP = range(1, 50) that is 30..49 trees. The earlier
# [79:100] slice was empty for 49 candidates and used names that are not
# defined in this notebook.
# NOTE(review): the zoom window is a guess at the intended region - adjust as needed.
GraphBuilder(RFNEstimatorsTP[29:], RFDeviationList[29:],'Зависимость средней абсолютной ошибки от числа деревьев','Число деревьев','Средняя абсолютная ошибка')
GraphBuilder(RFNEstimatorsTP[29:], RFKurtosisList[29:],'Зависимость средней квадратичной ошибки от числа деревьев','Число деревьев','Средняя квадратичная ошибка')
GraphBuilder(RFNEstimatorsTP[29:], RFSkewnessList[29:],'Зависимость коэффициента детерминации от числа деревьев','Число деревьев','Коэффициент детерминации')

In [None]:
# Forest with the chosen number of trees; the fixed seed makes the metrics
# printed below repeatable.
# NOTE(review): 88 lies outside the searched range(1, 50) - confirm the intended value.
RandForestBP = RandomForestRegressor(n_estimators=88, random_state=0)
RandForestBP.fit(TrainX,TrainY)
RFPredictBP = RandForestBP.predict(TestX)
# Report the errors on the held-out split, rounded to 6 decimals.
print("Средняя абсолютная ошибка:\t",round(mean_absolute_error(TestY, RFPredictBP),6))
print("Средняя квадратичная ошибка:\t",round(mean_squared_error(TestY, RFPredictBP),6))
print("Коэффициент детерминации:\t",round(r2_score(TestY, RFPredictBP),6))

<a href='https://tech.yandex.ru/catboost/'><h1>Градиентный бустинг</h1></a>

In [None]:
pip install catboost

In [None]:
import numpy as np
from catboost import Pool, CatBoostRegressor

In [None]:
# CatBoost regressor with 20 estimators, fitted on the training split and
# scored on the held-out split.
model = CatBoostRegressor(n_estimators=20)
model.fit(TrainX, TrainY)
modelPredict = model.predict(TestX)

# Print each metric rounded to 6 decimals, in the same order as before.
for _caption, _metric in (
    ("Средняя абсолютная ошибка:\t", mean_absolute_error),
    ("Средняя квадратичная ошибка:\t", mean_squared_error),
    ("Коэффициент детерминации:\t", r2_score),
):
    print(_caption, round(_metric(TestY, modelPredict), 6))

In [None]:
# Candidate ensemble sizes for gradient boosting: 1..49 estimators.
GBNEstimatorsTP = range(1,50)
# Pass the boosting candidates defined above (the call previously reused the
# random-forest range). Tune on the training split only, so the held-out test
# rows do not influence the choice of n_estimators.
# The three lists hold the cross-validated MAE, MSE and R^2 per candidate.
GBDeviationList,GBKurtosisList,GBSkewnessList = RegGridSearchCV(CatBoostRegressor,GBNEstimatorsTP,TrainX,TrainY)

In [None]:
# Plot each cross-validated metric against the number of estimators. Titles and
# y-labels name the metric that is actually plotted (MAE, MSE, R^2) rather than
# dataset feature names.
GraphBuilder(GBNEstimatorsTP, GBDeviationList,'Зависимость средней абсолютной ошибки от числа деревьев','Число деревьев','Средняя абсолютная ошибка')
GraphBuilder(GBNEstimatorsTP, GBKurtosisList,'Зависимость средней квадратичной ошибки от числа деревьев','Число деревьев','Средняя квадратичная ошибка')
GraphBuilder(GBNEstimatorsTP, GBSkewnessList,'Зависимость коэффициента детерминации от числа деревьев','Число деревьев','Коэффициент детерминации')

In [None]:
# Zoomed view of the same curves from index 29 to the end; with
# GBNEstimatorsTP = range(1, 50) that is 30..49 estimators. The previous upper
# bound of 60 was past the end of the sequences and was silently truncated.
GraphBuilder(GBNEstimatorsTP[29:], GBDeviationList[29:],'Зависимость средней абсолютной ошибки от числа деревьев','Число деревьев','Средняя абсолютная ошибка')
GraphBuilder(GBNEstimatorsTP[29:], GBKurtosisList[29:],'Зависимость средней квадратичной ошибки от числа деревьев','Число деревьев','Средняя квадратичная ошибка')
GraphBuilder(GBNEstimatorsTP[29:], GBSkewnessList[29:],'Зависимость коэффициента детерминации от числа деревьев','Число деревьев','Коэффициент детерминации')

In [None]:
# CatBoost regressor with 41 estimators, fitted on the training split and
# scored on the held-out split.
model = CatBoostRegressor(n_estimators=41)
model.fit(TrainX, TrainY)
modelPredict = model.predict(TestX)

# Print each metric rounded to 6 decimals, in the same order as before.
for _caption, _metric in (
    ("Средняя абсолютная ошибка:\t", mean_absolute_error),
    ("Средняя квадратичная ошибка:\t", mean_squared_error),
    ("Коэффициент детерминации:\t", r2_score),
):
    print(_caption, round(_metric(TestY, modelPredict), 6))