# Importing libraries and data

In [1]:
pip install plotly



In [2]:
import pandas as pd
import numpy as np
from datetime import date

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
import plotly.express as px
from plotly import graph_objects as go

In [5]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Loading users indexes:
perks = pd.read_csv('/content/drive/MyDrive/all_perks.csv')
print(perks.shape)

# Loading information about users:
users_data = pd.read_csv('/content/drive/MyDrive/user_data.csv')
print(users_data.shape)

(5998, 7)
(5998, 10)


In [9]:
# Filling NaN values with zero:
perks.fillna(0, inplace=True)
perks.head()

Unnamed: 0,user_id,hotel_hunter_index,average_bags_scaled,cancellation_rate_scaled,bargain_hunter_index,combined_booking_scaled,session_intencity_index
0,101961.0,0.0,0.08,0.0,5.8e-05,1.0,0.0
1,106907.0,0.0,1.0,0.5,0.0,1.0,0.0
2,181157.0,0.000495,0.2,0.0,0.004943,1.0,0.0
3,190866.0,0.0,0.08,0.0,0.0,1.0,0.0
4,204997.0,0.000898,0.066667,0.4,0.011774,0.8,0.0


In [10]:
users_data.head()

Unnamed: 0,user_id,birthdate,gender,married,has_children,home_country,home_city,home_airport,home_airport_lat,home_airport_lon
0,23557,1958-12-08,F,True,False,usa,new york,LGA,40.777,-73.872
1,94883,1972-03-16,F,True,False,usa,kansas city,MCI,39.297,-94.714
2,101486,1972-12-07,F,True,True,usa,tacoma,TCM,47.138,-122.476
3,101961,1980-09-14,F,True,False,usa,boston,BOS,42.364,-71.005
4,106907,1978-11-17,F,True,True,usa,miami,TNT,25.862,-80.897


In [11]:
# Checking format data:
users_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5998 entries, 0 to 5997
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   user_id           5998 non-null   int64  
 1   birthdate         5998 non-null   object 
 2   gender            5998 non-null   object 
 3   married           5998 non-null   bool   
 4   has_children      5998 non-null   bool   
 5   home_country      5998 non-null   object 
 6   home_city         5998 non-null   object 
 7   home_airport      5998 non-null   object 
 8   home_airport_lat  5998 non-null   float64
 9   home_airport_lon  5998 non-null   float64
dtypes: bool(2), float64(2), int64(1), object(5)
memory usage: 386.7+ KB


We have identified a few issues in our dataset that require attention:

1. Convert 'Birthdate' to date format.
2. Calculate the age based on 'Birthdate'.
3. Check the 'Gender' column for the number of unique values and consider converting this data to boolean.


In [12]:
# Convert Birthdate to date format:
users_data['birthdate'] = pd.to_datetime(users_data['birthdate'], errors='coerce')

In [13]:
# Checking unique values in "Gender" column:
users_data['gender'].value_counts()

Unnamed: 0_level_0,count
gender,Unnamed: 1_level_1
F,5292
M,695
O,11


 Since amount of O-values is insignificant (only 0.18% of total) we will replace them by most frequent
 value ("F") and convert this column to boolean

In [15]:
# Calculation users age:
users_data['age'] = (pd.Timestamp.now() - users_data['birthdate'])//np.timedelta64(1,'D') / 365.25

# Replacing O-values by "F" in "Gender":
replace_values = {'F' : True, 'O' : True, 'M' : False }
users_data['gender'] = users_data['gender'].map(replace_values)

In [16]:
users_data.head()

Unnamed: 0,user_id,birthdate,gender,married,has_children,home_country,home_city,home_airport,home_airport_lat,home_airport_lon,age
0,23557,1958-12-08,True,True,False,usa,new york,LGA,40.777,-73.872,65.735797
1,94883,1972-03-16,True,True,False,usa,kansas city,MCI,39.297,-94.714,52.465435
2,101486,1972-12-07,True,True,True,usa,tacoma,TCM,47.138,-122.476,51.737166
3,101961,1980-09-14,True,True,False,usa,boston,BOS,42.364,-71.005,43.967146
4,106907,1978-11-17,True,True,True,usa,miami,TNT,25.862,-80.897,45.793292


In [18]:
# We will drop unnecessary columns from the 'users_data' DataFrame to keep only the relevant information:
users_data.drop(['birthdate','home_country',
                 'home_city', 'home_airport',
                 'home_airport_lat', 'home_airport_lon'],
                 axis='columns', inplace=True)
#Removed 'sign_up_date' from the list of columns to drop as it does not exist

In [19]:
# We will merge two datasets in one base on the user ID number:
data = users_data.merge(perks, how='inner')
data.head()

Unnamed: 0,user_id,gender,married,has_children,age,hotel_hunter_index,average_bags_scaled,cancellation_rate_scaled,bargain_hunter_index,combined_booking_scaled,session_intencity_index
0,23557,True,True,False,65.735797,0.00268,0.0,0.0,0.0,0.0,0.0
1,94883,True,True,False,52.465435,0.0,0.1,0.0,0.0,1.0,0.0
2,101486,True,True,True,51.737166,0.0,0.0,0.0,0.0,0.5,0.0
3,101961,True,True,False,43.967146,0.0,0.08,0.0,5.8e-05,1.0,0.0
4,106907,True,True,True,45.793292,0.0,1.0,0.5,0.0,1.0,0.0


In [20]:
data.loc[data.isnull().any(axis=1)]

Unnamed: 0,user_id,gender,married,has_children,age,hotel_hunter_index,average_bags_scaled,cancellation_rate_scaled,bargain_hunter_index,combined_booking_scaled,session_intencity_index


In [21]:

def show_clusters_on_plot(df, x_name, y_name, cluster_name):
    plt.figure(figsize = (10,10))
    sns.scatterplot(df[x_name], df[y_name],
          hue = df[cluster_name], palette = 'Paired'
       )
    plt.title('{} vs {}'.format(x_name, y_name))
    plt.show()

In [22]:
# standardize the data
sc = StandardScaler()
x_sc = sc.fit_transform(data)

In [23]:
km = KMeans(n_clusters=5, random_state=0)
# predict the clusters for observations (the algorithm assigns them a number from 0 to 2)
labels = km.fit_predict(x_sc)

# store cluster labels in the field of our dataset
data['cluster'] = labels
data.sample(10)

  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,user_id,gender,married,has_children,age,hotel_hunter_index,average_bags_scaled,cancellation_rate_scaled,bargain_hunter_index,combined_booking_scaled,session_intencity_index,cluster
2373,532215,True,True,False,49.741273,0.000155,0.12,0.0,0.0,0.833333,0.0,1
1944,525102,True,False,True,37.574264,0.000171,0.1,0.0,0.002076,0.8,0.0,0
1395,517143,True,False,False,26.464066,0.0,0.0,0.0,0.0,0.0,0.0,0
745,508829,True,False,False,17.790554,0.0,0.066667,0.2,0.0,0.8,0.0,3
1540,519313,False,False,True,24.210815,0.0,0.0,0.0,0.0,1.0,0.0,2
2815,540614,True,False,False,32.161533,4.3e-05,0.1,0.2,0.005239,1.0,0.0,3
5323,657591,True,True,False,38.198494,9e-05,0.066667,0.0,0.0,0.75,0.0,1
3975,567448,True,True,True,46.028747,0.0,0.0,0.0,0.0,0.5,0.0,1
2263,530468,True,True,True,38.499658,0.0,0.05,0.0,0.0,1.0,0.0,1
4231,575711,True,True,True,49.415469,0.0,0.2,0.0,0.0,0.666667,0.0,1


In [24]:
mean_by_clusters = data.groupby(['cluster']).mean().reset_index()
mean_by_clusters[['cluster', 'age',
                       'hotel_hunter_index',
                       'average_bags_scaled',
                       'cancellation_rate_scaled',
                       'bargain_hunter_index',
                        'combined_booking_scaled','session_intencity_index']]

Unnamed: 0,cluster,age,hotel_hunter_index,average_bags_scaled,cancellation_rate_scaled,bargain_hunter_index,combined_booking_scaled,session_intencity_index
0,0,38.494014,0.000494,0.103732,0.001925,0.00173,0.724075,0.0
1,1,48.2828,0.000437,0.111686,0.003306,0.001924,0.709474,0.0
2,2,42.634808,0.000502,0.112417,0.017891,0.001936,0.741562,0.0
3,3,40.588967,0.000506,0.149606,0.320394,0.003639,0.759054,0.0
4,4,47.004228,0.06013,0.07451,0.023529,0.003628,0.905882,0.0


In [25]:
median_by_clusters = data.groupby(['cluster']).median().reset_index()
median_by_clusters[['cluster', 'age',
                       'hotel_hunter_index',
                       'average_bags_scaled',
                       'cancellation_rate_scaled',
                       'bargain_hunter_index',
                        'combined_booking_scaled','session_intencity_index']]

Unnamed: 0,cluster,age,hotel_hunter_index,average_bags_scaled,cancellation_rate_scaled,bargain_hunter_index,combined_booking_scaled,session_intencity_index
0,0,39.173169,0.0,0.1,0.0,0.0,0.8,0.0
1,1,48.0,0.0,0.1,0.0,0.0,0.75,0.0
2,2,43.348392,0.0,0.1,0.0,0.0,0.8,0.0
3,3,41.073238,0.0,0.133333,0.333333,0.0,0.75,0.0
4,4,46.932238,0.043262,0.1,0.0,0.0,1.0,0.0


In [26]:
numeric_vars = data[['cluster','age',
                       'hotel_hunter_index',
                       'average_bags_scaled',
                       'cancellation_rate_scaled',
                       'bargain_hunter_index',
                        'combined_booking_scaled','session_intencity_index']].copy()

In [27]:
categorical_vars = data[['gender', 'married', 'has_children', 'cluster']].copy()

In [28]:
for i in numeric_vars.columns:
    fig = px.box(x = numeric_vars['cluster'], y = numeric_vars[i])
    fig.update_layout(title = i, xaxis_title = 'Cluster number', yaxis_title = i)
    fig.show()

In [29]:
for i in numeric_vars.columns:
    fig = px.histogram(numeric_vars, x=i, color='cluster', title=i, barmode='overlay')
    fig.update_layout(yaxis_title='Frequency', xaxis_title=i)
    fig.show()

In [30]:
for i in categorical_vars:
    fig = px.histogram(categorical_vars, x='cluster', color=i, title=i, barmode='overlay')
    fig.update_layout(yaxis_title='Frequency', xaxis_title=i)
    fig.show()

### Demographic characterictis:
**Age:** The youngest group among our customers is in cluster 3, with a median age of 25, followed by cluster 4, where the median age is 39 years. Cluster 2 comprises the oldest customers, with a median age of 47 years

**Gender:** Cluster 0 consists exclusively of male customers, and there is an insignificant number of male users in cluster 1. All other clusters are comprised entirely of female customers

**Marriage:** Cluster 2 is predominantly composed of married customers, while cluster 4 consists entirely of unmarried customers. All other clusters exhibit a mix of marital statuses in varying proportions

**Parental status:** All clusters show a similar distribution of users with and without children, with a higher proportion of users without children across all clusters. Cluster 4 has the highest share of users without children, accounting for 76% of total users in the cluster, followed by clusters 0 and 1, both at 66%


### Indexes by Clusters:

**Cluster 0:** This cluster does not have a leading position in any of the indexes. They perform well in Cancellation Rate (2nd place) and Bargain Hunter Index (3rd place), indicating an interest in perks such as 'No Cancellation Fees' or 'Exclusive Discounts.'

**Cluster 1:** Cluster 1 holds leading positions in multiple indexes, including Average Bags, Cancellation Rate, and Bargain Hunter Index. This suggests their interest lies in both flight discounts and accompanying services.

**Cluster 2:** Cluster 2 excels in the Hotel Hunter Index and ranks 2nd in Bargain Hunter Index, indicating a strong interest in hotel and flight discounts.

**Cluster 3:** This cluster ranks last in most indexes but holds the leading position in Session Activity, showing a keen interest in hunting for discounts.

**Cluster 4:** Cluster 5 leads in the Combined Booking Index, indicating a preference for perks related to combined bookings, such as '1 Free Night Hotel with Flight.'

In summary, we can label our clusters based on their interests in the following perks:

Cluster 0: 'No Cancellation Fees'
Cluster 1: 'Free Checked Bag'
Cluster 2: 'Free Hotel Meal'
Cluster 3: 'Exclusive Discount'
Cluster 4: '1 Free Night Hotel with Fligh'