In [66]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklift.metrics import uplift_at_k
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier

In [None]:
def process_parser():
    """Build the city x tariff price table from the raw tariff listing.

    Reads ``./data/additional_data_all_region.csv``, keeps only the rows whose
    ``tariff_name`` is one of the five tariffs listed below and whose
    ``is_correct_info`` flag equals True, pivots the remaining rows into one
    row per ``city`` and one column per ``tariff_name`` (values come from
    ``tariff_price``; ``pivot_table`` aggregates repeated city/tariff pairs
    with its default aggregation), marks missing combinations with -1 and
    writes the table to ``./data/additional_data_all_cities.csv``.
    """
    wanted_tariffs = [
        "Технологии доступа",
        "Игровой",
        "Технологии доступа PRO",
        "Технологии контроля",
        "Апгрейд",
    ]

    raw = pd.read_csv("./data/additional_data_all_region.csv", sep=",")

    # A row survives only if it passes both filters; .eq(True) treats a
    # missing flag as "not correct", exactly like `== True` does.
    keep = raw["tariff_name"].isin(wanted_tariffs) & raw["is_correct_info"].eq(True)

    price_by_city = (
        raw.loc[keep]
        .pivot_table(index='city', columns='tariff_name', values='tariff_price')
        .fillna(-1)  # -1 marks a tariff with no price for that city
    )
    price_by_city.to_csv("./data/additional_data_all_cities.csv", sep=",")

In [None]:
# Raw train/test tables; the first CSV column becomes the DataFrame index.
train_init = pd.read_csv("./data/train_catugra.csv", index_col=0, sep=",")
test_init = pd.read_csv("./data/test_catugra.csv", index_col=0, sep=",")
def prepare_dataset(test_init, paths):
    """Attach per-city tariff prices to a dataset and save the result as CSV.

    The tariff table is read from ``./data/additional_data_all_cities.csv``
    (indexed by its first column, which must be named ``city``) and
    right-joined onto the dataset using the dataset's ``city_name`` as the
    key, so every dataset row is kept. Tariff prices that are still missing
    after the join are filled with the minimum price of the same tariff
    within the row's ``subject_name`` group.

    Parameters
    ----------
    test_init : pandas.DataFrame
        Dataset to enrich; must contain ``city_name`` and ``subject_name``.
        Despite the name it is used for both the train and the test table.
        The frame is not modified.
    paths : str or path-like
        Destination of the ';'-separated output CSV (index included).

    Returns
    -------
    None
    """
    tariff_columns = [
        'Апгрейд',
        'Игровой',
        'Технологии доступа',
        'Технологии доступа PRO',
        'Технологии контроля',
    ]

    tariffs = pd.read_csv("./data/additional_data_all_cities.csv", sep=",", index_col=0)

    # assign() returns a new frame, so the caller's DataFrame does not gain a
    # stray 'city' column as a side effect of calling this function.
    keyed = test_init.assign(city=test_init['city_name'])
    merged = tariffs.merge(keyed, on='city', how='right')

    # NOTE(review): the tariff table written by process_parser uses -1 for
    # "no price", so a regional minimum can itself be -1 and would then be
    # used as the fill value - confirm this is intended.
    for column in tariff_columns:
        merged[column] = merged.groupby('subject_name')[column].transform(
            lambda prices: prices.fillna(prices.min())
        )

    # 'city' was only a temporary join key; 'city_name' stays in the output.
    merged.drop(columns=['city']).to_csv(paths, sep=";")

In [None]:
# Write the tariff-enriched table for each split (train first, then test).
for source_frame, merged_path in (
    (train_init, "./data/train_catugra_merged_table.csv"),
    (test_init, "./data/test_catugra_merged_table.csv"),
):
    prepare_dataset(source_frame, merged_path)

In [70]:

# Reload the merged tables from disk (';'-separated, first column is the index).
df_test = pd.read_csv("./data/test_catugra_merged_table.csv", sep=";", index_col=0)
df_train = pd.read_csv("./data/train_catugra_merged_table.csv", sep=";", index_col=0)

# Last expression of the cell: rich display of the training table.
df_train

Unnamed: 0,Апгрейд,Игровой,Технологии доступа,Технологии доступа PRO,Технологии контроля,label,period,subject_type,subject_name,city_name,...,children_subject,rural_subject,services_subject,communication_subject,district_population,mean_income_district,children_district,rural_district,services_district,communication_district
0,500.0,890.0,500.0,890.0,740.0,1,2020-05-01,Город,Москва,Москва,...,15.6,1.6,34.1,3.3,39251.0,46880.0,16.6,17.6,28.7,3.3
1,500.0,890.0,500.0,890.0,740.0,1,2020-05-01,Город,Москва,Москва,...,15.6,1.6,34.1,3.3,39251.0,46880.0,16.6,17.6,28.7,3.3
2,500.0,890.0,500.0,890.0,740.0,1,2020-05-01,Город,Москва,Москва,...,15.6,1.6,34.1,3.3,39251.0,46880.0,16.6,17.6,28.7,3.3
3,500.0,890.0,500.0,890.0,740.0,1,2020-05-01,Город,Москва,Москва,...,15.6,1.6,34.1,3.3,39251.0,46880.0,16.6,17.6,28.7,3.3
4,500.0,890.0,500.0,890.0,740.0,1,2020-05-01,Город,Москва,Москва,...,15.6,1.6,34.1,3.3,39251.0,46880.0,16.6,17.6,28.7,3.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294253,-1.0,2500.0,1800.0,2490.0,2465.0,0,2020-12-01,Область,Магаданская,Ола,...,18.6,3.9,30.6,4.1,8124.0,39822.4,20.8,27.0,25.5,3.7
294254,-1.0,2500.0,1800.0,2490.0,2465.0,0,2020-12-01,Область,Магаданская,Ола,...,18.6,3.9,30.6,4.1,8124.0,39822.4,20.8,27.0,25.5,3.7
294255,-1.0,2500.0,1800.0,2490.0,2465.0,0,2020-12-01,Область,Магаданская,Ола,...,18.6,3.9,30.6,4.1,8124.0,39822.4,20.8,27.0,25.5,3.7
294256,500.0,1890.0,1500.0,1890.0,1740.0,0,2020-12-01,Автономный Округ,Чукотский,Певек,...,22.2,28.8,26.8,4.6,8124.0,39822.4,20.8,27.0,25.5,3.7


In [71]:
# Columns of the COVID CSV that are discarded before joining. Every name must
# exist in the file: DataFrame.drop raises KeyError for a missing label.
COVID_UNUSED_COLUMNS = [
    'continent', 'location', 'new_cases_smoothed', 'total_deaths', 'new_deaths',
    'new_deaths_smoothed', 'total_cases_per_million',
    'new_cases_per_million', 'new_cases_smoothed_per_million',
    'total_deaths_per_million', 'new_deaths_per_million',
    'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
    'icu_patients_per_million', 'hosp_patients',
    'hosp_patients_per_million', 'weekly_icu_admissions',
    'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
    'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests',
    'total_tests_per_thousand', 'new_tests_per_thousand',
    'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
    'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
    'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
    'new_vaccinations', 'new_vaccinations_smoothed',
    'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred',
    'people_fully_vaccinated_per_hundred', 'total_boosters_per_hundred',
    'new_vaccinations_smoothed_per_million',
    'new_people_vaccinated_smoothed',
    'new_people_vaccinated_smoothed_per_hundred', 'stringency_index',
    'population', 'population_density', 'median_age', 'aged_65_older',
    'aged_70_older', 'gdp_per_capita', 'extreme_poverty',
    'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
    'male_smokers', 'handwashing_facilities', 'hospital_beds_per_thousand',
    'life_expectancy', 'human_development_index',
    'excess_mortality_cumulative_absolute', 'excess_mortality_cumulative',
    'excess_mortality', 'excess_mortality_cumulative_per_million',
]


def combine_covid(df_test, paths, iso_code=None):
    """Join COVID-19 columns onto a dataset by ``period`` and save it as CSV.

    Reads ``./data/owid-covid-data.csv``, removes ``iso_code`` and every
    column in ``COVID_UNUSED_COLUMNS``, exposes the ``date`` column under the
    name ``period`` and right-joins the remaining columns onto ``df_test`` on
    ``period`` (exact value match), so dataset rows without a matching date
    are kept with missing COVID values.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Dataset to enrich; must contain a ``period`` column. Despite the name
        it is used for both the train and the test table.
    paths : str or path-like
        Destination of the ';'-separated output CSV (written without index).
    iso_code : str, optional
        When given, only COVID rows whose ``iso_code`` equals this value are
        used. The default ``None`` uses every row of the file, which is the
        original behaviour.

    Raises
    ------
    KeyError
        If the COVID CSV lacks ``iso_code``, ``date`` or any column listed in
        ``COVID_UNUSED_COLUMNS``.

    Warns
    -----
    RuntimeWarning
        If the COVID table has more than one row for some ``period``; each
        matching dataset row is then repeated once per COVID row.
    """
    import warnings  # local import: only needed for the fan-out warning

    covid = pd.read_csv('./data/owid-covid-data.csv')

    # NOTE(review): the file presumably holds one row per country and date;
    # without a filter the join key is then not unique - confirm whether the
    # local copy is already restricted to a single country.
    if iso_code is not None:
        covid = covid[covid['iso_code'] == iso_code]

    covid = covid.drop(columns=COVID_UNUSED_COLUMNS + ['iso_code'])
    # assign() appends 'period' as the last column before 'date' is removed,
    # which keeps the column order of the written CSV unchanged.
    covid = covid.assign(period=covid['date']).drop(columns=['date'])

    if covid['period'].duplicated().any():
        warnings.warn(
            "COVID table has several rows per 'period'; dataset rows will be "
            "duplicated by the merge. Pass iso_code to select one country.",
            RuntimeWarning,
            stacklevel=2,
        )

    merged = covid.merge(df_test, on='period', how='right')
    merged.to_csv(paths, sep=";", index=False)

In [72]:
# Write the COVID-enriched table for each split (train first, then test).
for merged_frame, covid_path in (
    (df_train, './data/train_catugra_merged_table_with_covid.csv'),
    (df_test, './data/test_catugra_merged_table_with_covid.csv'),
):
    combine_covid(merged_frame, covid_path)