In [1]:
import pandas as pd
df = pd.read_parquet('../data/train.parquet')

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31634 entries, 0 to 31633
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   reservation_id      31634 non-null  int64         
 1   night_number        31634 non-null  float64       
 2   stay_date           31634 non-null  datetime64[ns]
 3   guest_id            31634 non-null  int64         
 4   guest_country_id    31634 non-null  object        
 5   reservation_status  31634 non-null  object        
 6   reservation_date    31634 non-null  object        
 7   date_from           31634 non-null  object        
 8   date_to             31634 non-null  object        
 9   resort_id           31634 non-null  int64         
 10  cancel_date         6151 non-null   object        
 11  room_cnt            31634 non-null  int64         
 12  adult_cnt           31634 non-null  int64         
 13  children_cnt        31634 non-null  int64     

In [64]:
print(df['guest_country_id'])


0         HR
1         HR
2         HR
3         GB
4         HR
        ... 
31629     HR
31630      I
31631     HR
31632    SLO
31633     HR
Name: guest_country_id, Length: 31634, dtype: object


In [65]:
df['reservation_status'].value_counts()

reservation_status
Checked-out    25483
Cancelled       6038
No-show          113
Name: count, dtype: int64

In [66]:
status_mapping = {'Check-Out': 0, 'Canceled': 1, 'No-Show': 2}

df['reservation_status'] = df['reservation_status'].map(status_mapping)

Ne mozemo bas da predvidjamo da li ce nam neko odustati, to nam nije zadatak, tako da predvidjamo samo za one koji su check inovali i checkoutovali, tj. zapravo okupirali hotele, pa zato filtriram sledece:

In [67]:
df = df[df['reservation_status']==0]

Sad otklanjam nepotrebne kolone

In [68]:
df = df.drop(columns=['cancel_date','reservation_status','reservation_date'], axis = 1)

Kako ne bismo povecavali dimenzije dataseta drasticno (putem dummy variables), mapiramo numericki sve drzave u jednoj koloni, mada je moguce da ovo nije bas najbolja praksa, zbog kategorijske prirode drzava, mapirati ih direktno u skalare mozda nije u potpunosti mudro

In [69]:
df['zemlja_gosta_mapped'] = pd.factorize(df['guest_country_id'])[0]
print(df[['guest_country_id', 'zemlja_gosta_mapped']])

Empty DataFrame
Columns: [guest_country_id, zemlja_gosta_mapped]
Index: []


In [70]:
df = df.drop(columns='guest_country_id', axis = 1)                             

In [71]:
df['room_cnt'] = df['children_cnt'] + df['adult_cnt']

In [72]:
df = df.drop(columns = ['children_cnt', 'adult_cnt'], axis = 1)

In [73]:
df = df.reset_index()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   index                0 non-null      int64         
 1   reservation_id       0 non-null      int64         
 2   night_number         0 non-null      float64       
 3   stay_date            0 non-null      datetime64[ns]
 4   guest_id             0 non-null      int64         
 5   date_from            0 non-null      object        
 6   date_to              0 non-null      object        
 7   resort_id            0 non-null      int64         
 8   room_cnt             0 non-null      int64         
 9   price                0 non-null      float64       
 10  price_tax            0 non-null      float64       
 11  total_price_tax      0 non-null      float64       
 12  total_price          0 non-null      float64       
 13  food_price           0 non-null      float64   

Razdvajamo u dva hotela jer cenim da ce oni imati razlicite sobe barem

In [74]:
df1 = df[df['resort_id']==1]
df2 = df[df['resort_id']!=1]

Ovo sam eksportovao kasnije mozda za loadovanje, mada nije neophodno

In [75]:
df1.to_csv('../prep_data/prepped1_new.csv')
df2.to_csv('../prep_data/prepped0_new.csv')

In [76]:
# Ensure the 'date_from' column is in datetime format
#df['date_from'] = pd.to_datetime(df['date_from'])
#df['date_to'] = pd.to_datetime(df['date_to'])
# Set 'date_from' as the index


Ovde sam relativno uspesno pokusao da napravim funkciju koja sracunava broj gostiju za svaki datum.. mada mi se cini da je moglo jednostavnije. Al u sustini ovo uzima intervale ako su izmedju datuma dolaska i rezervacije i dodaje na njih broj_gostiju te konkretne rezervacije.

Pritom je wildly neefikasan algoritam moglo bi zasigurno bolje al ovo je za sad

In [77]:
def calculate_total_occupancy(input_df, frequency='D'):
    df = input_df.copy()

    date_range = pd.date_range(df['date_from'].min(), df['date_to'].max(), freq=frequency)

    occupancy_df = pd.DataFrame(index=date_range, columns=['total_occupancy'])

    for _, reservation in df.iterrows():
        occupied_days = pd.date_range(reservation['date_from'], reservation['date_to'], freq=frequency)
        occupancy_df.loc[occupied_days, 'total_occupancy'] = occupancy_df.loc[occupied_days, 'total_occupancy'].add(reservation['room_cnt'], fill_value=0)

    # merdzujemo datume dolaska kako bi se mapirale vrednosti korektno
    df = pd.merge(df, occupancy_df, left_on='date_from', right_index=True, how='left')

    df.drop_duplicates(subset='date_from', keep='first', inplace=True)
    df = df[df['total_occupancy'].notnull()]
    return df[['date_from','total_occupancy']]

## Tu baci gresku indicates that either the minimum value (df['date_from'].min()) or the maximum value (df['date_to'].max()) in your DataFrame is a NaT (Not a Time) value, which is a special value denoting a missing or null date/time.###

#probao prebaciti u time series date from i to i opet neka druga gresk#

In [78]:
result_df1 = calculate_total_occupancy(df1)
print(result_df1)

ValueError: Neither `start` nor `end` can be NaT

In [None]:
result_df2 = calculate_total_occupancy(df2)
print(result_df2)

      datum_dolaska total_occupancy
0        2015-07-01            72.0
28       2015-07-02           140.0
54       2015-07-03           190.0
77       2015-07-04           248.0
100      2015-07-05           310.0
...             ...             ...
27580    2017-07-27           495.0
27679    2017-07-30           501.0
27737    2017-07-31           497.0
27902    2017-08-05           530.0
28717    2017-08-30           460.0

[793 rows x 2 columns]


Postavljam indekse radi eksportovanja

In [None]:
result_df1 = result_df1.set_index('datum_dolaska')
result_df2 = result_df2.set_index('datum_dolaska')

Napravite folder prep_data pre narednog cell-a

In [None]:
result_df2.to_csv('../prep_data/hotelid0_daily_occupancy.csv')
result_df1.to_csv('../prep_data/hotelid1_daily_occupancy.csv')

Sad treba isto uraditi za nedeljno i mesecno, naredni cell ce se dugo izvrsavati jer python a i kod mi nije najefikasniji i dalje, ali samo strpljivo

In [None]:
result_weekly_df1 = calculate_total_occupancy(df1, frequency='W-Mon') # W-Mon je za nedelje i da pocnu u ponedeljak
result_monthly_df1 = calculate_total_occupancy(df1, frequency='M')

result_weekly_df2 = calculate_total_occupancy(df2, frequency='W-Mon')
result_monthly_df2 = calculate_total_occupancy(df2, frequency='M')

In [None]:
result_weekly_df1  = result_weekly_df1.set_index('datum_dolaska')
result_monthly_df1 = result_monthly_df1.set_index('datum_dolaska')

result_weekly_df2  = result_weekly_df2.set_index('datum_dolaska')
result_monthly_df2 = result_monthly_df2.set_index('datum_dolaska')

In [None]:
result_weekly_df1.head()

Unnamed: 0_level_0,total_occupancy
datum_dolaska,Unnamed: 1_level_1
2015-05-04,82.0
2015-05-11,165.0
2015-05-18,167.0
2015-07-20,56.0
2015-11-23,236.0


In [None]:
result_monthly_df1.head()

Unnamed: 0_level_0,total_occupancy
datum_dolaska,Unnamed: 1_level_1
2015-11-30,241.0
2015-07-31,34.0
2015-05-31,95.0
2015-08-31,116.0
2015-01-31,9.0


In [None]:
df0_occupancy = pd.read_csv("../prep_data/hotelid0_daily_occupancy.csv")
df0_dataset = pd.read_csv("../prep_data/prepped0.csv")

df0_occupancy['datum_dolaska'] = pd.to_datetime(df0_occupancy['datum_dolaska'])
df0_dataset['datum_dolaska'] = pd.to_datetime(df0_dataset['datum_dolaska'])

df0 = pd.merge(df0_dataset, df0_occupancy, on='datum_dolaska', how='left')
df0 = df0.drop(columns = ["Unnamed: 0", "broj_gostiju","index","hotel_id"])
df0.to_csv('../prep_data/hotelid1_daily_merged.csv')

In [None]:
df1_occupancy = pd.read_csv("../prep_data/hotelid1_daily_occupancy.csv")
df1_dataset = pd.read_csv("../prep_data/prepped1.csv")

df1_occupancy['datum_dolaska'] = pd.to_datetime(df1_occupancy['datum_dolaska'])
df1_dataset['datum_dolaska'] = pd.to_datetime(df1_dataset['datum_dolaska'])

df1 = pd.merge(df1_dataset, df1_occupancy, on='datum_dolaska', how='left')
df1 = df1.drop(columns = ["Unnamed: 0", "broj_gostiju","index","hotel_id"])
df1.to_csv('../prep_data/hotelid1_daily_merged.csv')