In [1]:
import pandas as pd
import numpy as np
import datetime

In [2]:
# from random import shuffle
from random import seed
# Seed Python's built-in `random` generator so anything drawing from it is repeatable.
# NOTE(review): this call does not seed NumPy's generator; confirm whether later
# cells (splits, model training) need their own seed / random_state.
seed(20)

In [3]:
from catboost import CatBoostRegressor, Pool, cv
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import seaborn as sns

In [4]:
class read_data():
    """Namespace of CSV loaders for the revenue-estimation input and output files.

    Every loader is a static method, so it can be called on the class
    (``read_data.read_model_data("DE")``) or on an instance. Files are looked
    up under ``DATA_DIR`` unless an explicit ``data_dir`` is passed, which lets
    the notebook run on a machine other than the original author's.
    """

    # Default data folder. Reassign `read_data.DATA_DIR` once, or pass
    # `data_dir=...` per call, instead of editing every path.
    DATA_DIR = r"C:\Users\payoj.jain/Documents/projects/Europe Revenue estimation/data/"

    @staticmethod
    def _path(data_dir, *parts):
        """Join ``parts`` with "/" under ``data_dir`` (or ``DATA_DIR`` when it is None)."""
        base = str(read_data.DATA_DIR if data_dir is None else data_dir)
        # Add a separator only when the base does not already end with one, so
        # the default base yields exactly the historical path strings.
        if base and not base.endswith(("/", "\\")):
            base += "/"
        return base + "/".join(parts)

    @staticmethod
    def read_data_files(data_dir=None):
        """Load the four raw input tables.

        Parameters
        ----------
        data_dir : str or path-like, optional
            Folder containing the CSV files; defaults to ``read_data.DATA_DIR``.

        Returns
        -------
        tuple of pandas.DataFrame
            ``(bookings, date_to_week, accommodations, weekly_availability)``
            read from ``Booking_level_data.csv``, ``weeks_start_date.csv``,
            ``X_data.csv`` and ``week_wise_availability.csv`` respectively.
        """
        bookings = pd.read_csv(read_data._path(data_dir, "Booking_level_data.csv"))
        date_to_week = pd.read_csv(read_data._path(data_dir, "weeks_start_date.csv"))
        accommodations = pd.read_csv(read_data._path(data_dir, "X_data.csv"))
        weekly_availability = pd.read_csv(read_data._path(data_dir, "week_wise_availability.csv"))
        return bookings, date_to_week, accommodations, weekly_availability

    @staticmethod
    def read_model_data(country, data_dir=None):
        """Return ``<data_dir>/<country>/model_data.csv`` as a DataFrame."""
        return pd.read_csv(read_data._path(data_dir, country, "model_data.csv"))

    @staticmethod
    def read_corrupt_data(country, data_dir=None):
        """Return ``<data_dir>/<country>/corrupt_data.csv`` as a DataFrame."""
        return pd.read_csv(read_data._path(data_dir, country, "corrupt_data.csv"))

    @staticmethod
    def read_cleaned_data(country, data_dir=None):
        """Return ``<data_dir>/<country>/cleaned_data.csv`` as a DataFrame."""
        return pd.read_csv(read_data._path(data_dir, country, "cleaned_data.csv"))

In [None]:
class write_data():
    """Namespace of CSV writers for the per-country intermediate tables.

    Every writer is a static method, so it can be called on the class or on an
    instance. Files are written under ``DATA_DIR`` unless an explicit
    ``data_dir`` is passed. The ``<data_dir>/<country>/`` folder is not created
    here and must already exist.
    """

    # Default data folder. Reassign `write_data.DATA_DIR` once, or pass
    # `data_dir=...` per call, instead of editing every path.
    DATA_DIR = r"C:\Users\payoj.jain/Documents/projects/Europe Revenue estimation/data/"

    @staticmethod
    def _path(data_dir, country, filename):
        """Build ``<data_dir>/<country>/<filename>`` (``DATA_DIR`` when ``data_dir`` is None)."""
        base = str(write_data.DATA_DIR if data_dir is None else data_dir)
        # Add a separator only when the base does not already end with one, so
        # the default base yields exactly the historical path strings.
        if base and not base.endswith(("/", "\\")):
            base += "/"
        return base + country + "/" + filename

    # NOTE: all writers keep pandas' default `index=True`, so the files gain an
    # "Unnamed: 0" column when read back with `pd.read_csv`.

    @staticmethod
    def write_model_data(model_data, country, data_dir=None):
        """Write ``model_data`` to ``<data_dir>/<country>/model_data.csv``."""
        model_data.to_csv(write_data._path(data_dir, country, "model_data.csv"))

    @staticmethod
    def write_corrupt_data(corrupted_data, country, data_dir=None):
        """Write ``corrupted_data`` to ``<data_dir>/<country>/corrupt_data.csv``."""
        corrupted_data.to_csv(write_data._path(data_dir, country, "corrupt_data.csv"))

    @staticmethod
    def write_cleaned_data(cleaned_data, country, data_dir=None):
        """Write ``cleaned_data`` to ``<data_dir>/<country>/cleaned_data.csv``."""
        cleaned_data.to_csv(write_data._path(data_dir, country, "cleaned_data.csv"))

In [8]:
class prepare_data():
    """Turn booking-level rows into a per-country, per-week modelling table.

    The instance methods share state through attributes, so they are meant to
    be called in dependency order:

    1. ``convert_to_datetime`` and ``add_week_end`` (date arithmetic below needs
       real datetimes and a ``WK_END`` column),
    2. ``country_bookings``,
    3. ``prepare_country_daywise_bookings`` and ``prepare_country_daywise_week``,
    4. ``merge_country_daywise_boookings_with_week``,
    5. ``get_country_weekly_revenue`` / ``get_country_weekly_occupancy``.

    The remaining helpers (merges, corruption detection, cleaning) are static:
    they only use their arguments and can be called on the class or an instance.
    """

    _DAYWISE_BOOKING_COLUMNS = ["ACCOMMODATION_CODE", "BOOKING_ID", "arrivaldate", "departuredate", "bookingdate", "date", "revenue"]
    _DAYWISE_WEEK_COLUMNS = ["year", "week", "week_start", "week_end", "date"]

    def __init__(self, country, bookings, date_to_week, weekly_availability, day_wise_bookings, accommodations):
        """Store the input tables and create empty day-level result frames.

        Parameters
        ----------
        country : str
            Value matched against ``ACCOMMODATION_CODE`` (substring match) and
            ``COUNTRY`` (equality).
        bookings, date_to_week, weekly_availability, accommodations : pandas.DataFrame
            Raw input tables. ``convert_to_datetime`` and ``add_week_end``
            modify ``bookings`` and ``date_to_week`` in place.
        day_wise_bookings
            Accepted for backward compatibility; not used by this class.
        """
        self.country = country
        self.bookings = bookings
        self.date_to_week = date_to_week
        self.weekly_availability = weekly_availability
        self.accommodations = accommodations

        self.country_daywise_bookings = pd.DataFrame(columns=self._DAYWISE_BOOKING_COLUMNS)
        self.country_daywise_week = pd.DataFrame(columns=self._DAYWISE_WEEK_COLUMNS)

    def convert_to_datetime(self):
        """Parse the booking date columns and ``WK_START`` to datetimes (in place)."""
        for column in ("arrivaldate", "departuredate", "bookingdate"):
            self.bookings[column] = pd.to_datetime(self.bookings[column])
        self.date_to_week["WK_START"] = pd.to_datetime(self.date_to_week["WK_START"])

    def add_week_end(self):
        """Add ``WK_END`` = ``WK_START`` + 7 days, i.e. an exclusive end of the week."""
        self.date_to_week["WK_END"] = self.date_to_week["WK_START"] + datetime.timedelta(7)

    def country_bookings(self):
        """Keep bookings of this country whose ``BOOKING_STATUS`` is ``"BOOKING"``.

        The result is stored in ``self.country_bookings``. This replaces the
        method with the filtered DataFrame on the instance (kept for backward
        compatibility), so the method can only be called once per instance.
        """
        # na=False: rows with a missing accommodation code are treated as
        # non-matching instead of making the boolean mask invalid.
        # NOTE(review): this is a substring (regex) match anywhere in the code;
        # confirm that is intended rather than a prefix match.
        in_country = self.bookings["ACCOMMODATION_CODE"].str.contains(self.country, na=False)
        is_booking = self.bookings["BOOKING_STATUS"] == "BOOKING"
        self.country_bookings = self.bookings[in_country & is_booking]

    def prepare_country_daywise_bookings(self):
        """Expand every country booking into one row per booked night.

        Revenue is spread evenly over the nights. Bookings whose stay is not at
        least one full day produce no rows (and no division by zero). The
        result replaces ``self.country_daywise_bookings``.

        Columns are read by position -- assumes 0=ACCOMMODATION_CODE,
        1=BOOKING_ID, 4=arrivaldate, 5=departuredate, 6=bookingdate, 7=revenue
        in ``bookings``; TODO confirm against the CSV schema.

        Raises
        ------
        RuntimeError
            If ``country_bookings()`` has not been called yet.
        """
        if callable(self.country_bookings):
            raise RuntimeError("call country_bookings() before prepare_country_daywise_bookings()")

        records = []
        # Use the filtered frame held on the instance, not a notebook global.
        for booking in self.country_bookings.values:
            arrival, departure = booking[4], booking[5]
            days_of_bookings = int((departure - arrival) / np.timedelta64(1, 'D'))
            if days_of_bookings <= 0:
                continue
            rev_per_day = booking[7] / days_of_bookings
            for offset in range(days_of_bookings):
                date = arrival + datetime.timedelta(offset)
                records.append((booking[0], booking[1], arrival, departure, booking[6], date, rev_per_day))
        # Build the frame once; appending row by row is quadratic and
        # DataFrame.append no longer exists in pandas >= 2.0.
        self.country_daywise_bookings = pd.DataFrame(records, columns=self._DAYWISE_BOOKING_COLUMNS)

    def prepare_country_daywise_week(self):
        """Expand every calendar week into one row per day of that week.

        The result replaces ``self.country_daywise_week``.

        Columns are read by position -- assumes 0=year, 1=week, 2=WK_START,
        3=WK_END in ``date_to_week`` (i.e. ``add_week_end`` added the fourth
        column); TODO confirm against the CSV schema.
        """
        records = []
        for week in self.date_to_week.values:
            week_start, week_end = week[2], week[3]
            days_in_week = int((week_end - week_start) / np.timedelta64(1, 'D'))
            for offset in range(days_in_week):
                date = week_start + datetime.timedelta(offset)
                records.append((week[0], week[1], week_start, week_end, date))
        self.country_daywise_week = pd.DataFrame(records, columns=self._DAYWISE_WEEK_COLUMNS)

    def merge_country_daywise_boookings_with_week(self):
        """Attach year/week information to each booked night (left join on ``date``)."""
        self.country_daywise_bookings = pd.merge(self.country_daywise_bookings, self.country_daywise_week, on="date", how="left")

    def get_country_weekly_revenue(self):
        """Return numeric columns summed per (year, week, ACCOMMODATION_CODE)."""
        grouped = self.country_daywise_bookings.groupby(["year", "week", "ACCOMMODATION_CODE"])
        # numeric_only=True: the date columns cannot be summed and make a plain
        # .sum() raise on recent pandas versions.
        country_weekly_bookings = grouped.sum(numeric_only=True).reset_index()
        return country_weekly_bookings

    def get_country_weekly_occupancy(self):
        """Return the number of booked nights per (year, week, ACCOMMODATION_CODE).

        The count is returned in a column named ``#_days_booked``.
        """
        grouped = self.country_daywise_bookings.groupby(["year", "week", "ACCOMMODATION_CODE"])
        country_weekly_occupancy = grouped.count()["date"].reset_index()
        return country_weekly_occupancy.rename(columns={'date': '#_days_booked'})

    @staticmethod
    def merge_country_weekly_revenue_and_occupancy(country_weekly_bookings, country_weekly_occupancy):
        """Left-join weekly occupancy onto weekly revenue by year, week and accommodation."""
        return pd.merge(country_weekly_bookings, country_weekly_occupancy, on=["year", "week", "ACCOMMODATION_CODE"], how="left")

    def get_country_availability(self):
        """Return the weekly-availability rows whose accommodation code contains the country."""
        # na=False: a missing accommodation code is treated as non-matching.
        in_country = self.weekly_availability["ACCOMMODATION_CODE"].str.contains(self.country, na=False)
        return self.weekly_availability[in_country]

    @staticmethod
    def merge_country_bookings_with_weekly_availability(country_weekly_revenue_occupancy, country_weekly_availability):
        """Outer-join availability with revenue/occupancy and fill every gap with 0."""
        country_bookings_with_availability = pd.merge(country_weekly_availability, country_weekly_revenue_occupancy, on=["year", "week", "ACCOMMODATION_CODE"], how="outer")
        return country_bookings_with_availability.fillna(0)

    def get_country_accommodations(self):
        """Return the accommodations whose ``COUNTRY`` equals this country."""
        return self.accommodations[self.accommodations["COUNTRY"] == self.country]

    @staticmethod
    def merge_accommodations_with_their_weekly_revenue(country_accommodations, country_bookings_with_availability):
        """Left-join accommodation attributes onto the weekly table by ``ACCOMMODATION_CODE``."""
        return pd.merge(country_bookings_with_availability, country_accommodations, on="ACCOMMODATION_CODE", how="left")

    @staticmethod
    def find_corrupt_data(data):
        """Collect rows that fail any of four sanity checks.

        Checks, in order: no availability but occupancy or revenue present;
        pre-2020 revenue above ``total2019 + total2018``; missing
        ``availablity_fin``; missing ``ACCOMMODATION_TYPE``. A row failing
        several checks appears once per failed check. The original row labels
        are kept in an ``index`` column of the returned frame.

        NOTE(review): the ``occupancy`` column is not produced by the methods of
        this class (the weekly count is named ``#_days_booked``); confirm it
        comes from the input tables.
        """
        no_availability_but_used = (data["availablity_fin"] == 0) & ((data["occupancy"] > 0) | (data["revenue"] > 0))
        revenue_too_high = (data["revenue"] > data["total2019"] + data["total2018"]) & (data["year"] < 2020)
        corrupted_data = pd.concat(
            [
                data[no_availability_but_used],
                data[revenue_too_high],
                data[data["availablity_fin"].isnull()],
                data[data["ACCOMMODATION_TYPE"].isnull()],
            ],
            axis=0,
        )
        return corrupted_data.reset_index()

    @staticmethod
    def get_cleaned_data(model_data, corrupted_data):
        """Return ``model_data`` without the rows listed in ``corrupted_data``.

        Row positions are taken from the ``index`` column written by
        ``find_corrupt_data`` (that frame's own index is just 0..n-1 after the
        reset and does not identify the corrupt rows). If the column is absent,
        the frame's index is used instead. Assumes ``model_data`` has the same
        row order as the frame that was checked -- TODO confirm for data
        reloaded from CSV.

        The helper column ``Unnamed: 0`` (created by a CSV round trip) is
        removed when present, the index is renumbered, and missing
        ``distance_from_coast`` values are replaced with -999.
        """
        if "index" in corrupted_data.columns:
            positions = corrupted_data["index"]
        else:
            positions = corrupted_data.index
        # A row can be listed several times (one per failed check); de-duplicate.
        positions = np.unique(np.asarray(positions, dtype=int))

        cleaned_data = model_data.drop(model_data.index[positions])
        cleaned_data = cleaned_data.reset_index(drop=True).drop(columns=["Unnamed: 0"], errors="ignore")
        cleaned_data["distance_from_coast"] = cleaned_data["distance_from_coast"].fillna(-999)
        return cleaned_data

In [None]:
class Model():
    def __init__(self, country, iteration, train, test, features, output, ):
        