In [2]:
import pickle
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [3]:
def sum_off_quarters(arr):
    """Return three partial sums of ``arr``: first quarter, middle, last quarter.

    The quarter length is ``len(arr) // 4``. The result is a list
    ``[first, middle, last]`` where ``first`` sums the leading quarter,
    ``last`` sums the trailing quarter and ``middle`` is the total minus both
    (i.e. the two middle quarters plus any remainder items).

    Sequences with fewer than four items have no outer quarters, so the whole
    sum is reported as the middle element and the outer sums are 0.
    """
    n_items = len(arr)
    if n_items >= 4:
        edge = n_items // 4
        head_sum = sum(arr[:edge])
        tail_sum = sum(arr[-edge:])
        # The middle part is derived from the total rather than sliced.
        return [head_sum, sum(arr) - head_sum - tail_sum, tail_sum]

    return [0, sum(arr), 0]


def _feature_names(quantile_levels, n_coeffs):
    """List the feature column names (without ``id``) in output order."""
    names = ['mean', 'variance']
    for j in range(n_coeffs):
        names += [f'angle_coeff_{j}', f'abs_coeff_{j}']
        if j != 0:
            names.append(f'real_coeff_{j}')
    names += [f'quantile_{q}' for q in quantile_levels]
    names += ['first_quarter', 'two_mid_quarter', 'last_quarter']
    return names


def _series_feature_row(values, quantile_levels, n_coeffs):
    """Compute all features of one NaN-free series as a ``{column: value}`` dict."""
    values = np.asarray(values)
    row = {'mean': values.mean(), 'variance': values.var()}

    # Discrete Fourier transform of the series; only the first `n_coeffs`
    # coefficients are used. Indexing raises IndexError for shorter series.
    fft_coefficients = np.fft.fft(values)
    for j in range(n_coeffs):
        coeff = fft_coefficients[j]
        row[f'angle_coeff_{j}'] = np.angle(coeff)
        row[f'abs_coeff_{j}'] = np.abs(coeff)
        if j != 0:
            # No real-part feature is emitted for coefficient 0.
            row[f'real_coeff_{j}'] = np.real(coeff)

    for q in quantile_levels:
        row[f'quantile_{q}'] = np.quantile(values, q)

    first, middle, last = sum_off_quarters(values)
    row['first_quarter'] = first
    row['two_mid_quarter'] = middle
    row['last_quarter'] = last
    return row


def generated_features(data):
    """Build the feature table for every series in ``data`` that has no NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide an ``id`` column and a ``values`` column whose entries are
        array-likes of numbers (one series per row).

    Returns
    -------
    data_processed : pandas.DataFrame
        One row per NaN-free series, index ``0..n-1``. Columns, in this order:
        ``id``, ``mean``, ``variance``, the angle / absolute value (and, for
        coefficients 1 and 2, real part) of the first three FFT coefficients,
        ``quantile_0.1`` ... ``quantile_0.9``, ``first_quarter``,
        ``two_mid_quarter``, ``last_quarter``. All feature columns are float64.
    data_nan : pandas.DataFrame
        The rows of ``data`` (original index kept) whose series contain at
        least one NaN; a missing (``None``) series is also routed here.

    Raises
    ------
    IndexError
        If a NaN-free series has fewer than three values (the FFT then yields
        fewer than three coefficients).
    """
    quantile_levels = [0.1, 0.3, 0.5, 0.7, 0.9]
    n_coeffs = 3  # number of leading FFT coefficients turned into features

    # Vectorised NaN check per series; astype(bool) keeps the mask usable for
    # boolean indexing even when `data` has no rows.
    has_no_nan = data['values'].apply(
        lambda series: not np.isnan(np.asarray(series, dtype=float)).any()
    ).astype(bool)

    data_filtred = data[has_no_nan].reset_index(drop=True)
    data_nan = data[~has_no_nan]

    # Build one record per series and create the frame once, instead of
    # writing individual cells through .loc.
    rows = [
        _series_feature_row(series, quantile_levels, n_coeffs)
        for series in data_filtred['values']
    ]
    # Explicit `columns` fixes the column order (and keeps the columns present
    # when there are no rows); the order must stay stable for the model.
    data_processed = pd.DataFrame(
        rows, columns=_feature_names(quantile_levels, n_coeffs), dtype=float
    )
    # Insert ids afterwards so their original dtype is preserved.
    data_processed.insert(0, 'id', data_filtred['id'].values)

    return data_processed, data_nan

In [4]:
def create_submission(data, model, nan_score=0.6):
    """Score every row of ``data`` and return an ``id`` / ``score`` table.

    Rows whose series are NaN-free are scored with ``model``; rows that
    contain NaN cannot be turned into features and receive ``nan_score``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input accepted by ``generated_features`` (needs ``id`` and ``values``).
    model : object
        Fitted classifier exposing ``predict_proba``. Column 1 of its output is
        used as the score -- presumably the probability of class 1; confirm
        against ``model.classes_``.
    nan_score : float, default 0.6
        Constant score assigned to rows that contain NaN.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``score``. Model-scored rows come first, followed by
        the NaN rows, so the order differs from ``data``.
    """
    data_feature, data_nan = generated_features(data)
    feature_frame = data_feature.drop(columns='id')

    if len(data_feature) > 0:
        scores = model.predict_proba(feature_frame)[:, 1]
    else:
        # Every series contained NaN: scikit-learn estimators reject a
        # zero-row input, so skip the model call entirely.
        scores = np.empty(0, dtype=float)

    submission_predict = pd.DataFrame({'id': data_feature['id'].values, 'score': scores})
    submission_nan = pd.DataFrame({'id': data_nan['id'].values, 'score': nan_score})

    return pd.concat([submission_predict, submission_nan], ignore_index=True)

In [5]:
# Load the pickled model used for scoring.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# model_RF.pickle must come from a trusted source.
with open("model_RF.pickle", "rb") as file:
    model = pickle.load(file)
# The input is assumed to be in parquet format; for CSV input, uncomment the
# read_csv line below and remove the read_parquet line.
# NOTE(review): a CSV would presumably store `values` as text -- confirm the
# feature code can handle it before switching.
data = pd.read_parquet('test.parquet')
#data = pd.read_csv('test.csv')

In [6]:
# Score every row of `data` with the loaded model; the result has `id` and
# `score` columns, with NaN-containing rows placed after the scored rows.
submission = create_submission(data, model)

In [11]:
# Restore the order in which the rows arrived in `data`: before this step all
# rows that contained NaN sit at the end of `submission`.
# NOTE(review): reindex requires the ids to be unique -- confirm for the input.
submission_sorted = submission.set_index('id').reindex(data['id']).reset_index()


In [12]:
# Write the reordered scores to submission.csv, omitting the DataFrame index.
submission_sorted.to_csv(r'submission.csv', index=False)