# Univariate analysis

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

# Load the dataset from the CSV file at the relative path below
df = pd.read_csv("../data/02.csv")
# Remove pandas' limit on displayed columns so wide frames are shown in full
pd.set_option("display.max_columns", None)
# Preview the first rows of the loaded data
df.head()

Before engaging in deeper analysis, let's split the variables into categorical and quantitative variables.\
The split is based on the number of unique values per variable and on the nature of the variable.\
The threshold that comes up is 5:
- variables with fewer than 5 unique values appear to be categorical
- variables with 5 or more unique values appear to be quantitative

In [None]:
# Number of distinct (non-missing) values per column; used to choose the categorical/quantitative threshold
df.nunique()

# Categories

In [None]:
# Keep only the columns with fewer than 5 distinct values (treated as categorical in this notebook)
cat = df[df.columns[df.nunique() < 5]]
# Display the categorical subset
cat

Of **119206 bookings**:
- **37% were canceled**
- **3% repeated guests**
- **86% agent** bookings, 6% company bookings
- **66% city hotel** bookings, 34% resort bookings
- meals: **77% bed & breakfast**, 12% half board, 10% no meal, 1% full board
- market segment: **47% online travel agent (TA)**, 20% offline TA, 17% groups, 11% direct, 4% corporate, 1% other segments
- distribution channel: **82% TA**, 12% direct, 5% corporate
- **72% reserved rooms were type A**, 16% D, 5% E, 7% other 6 types
- **88% booked without deposit**, 12% with a deposit
- **75% individual (transient) bookings**, 21% transient and associated to another transient booking, 4% other arrangements.

Let's visualise them with countplots to double-check the findings:

In [None]:
def proportion_tables(cat):
    """
    Calculates the share of each value for every categorical column in the DataFrame and
    returns it as a wide table of percentage strings (e.g. "62.96%").

    Parameters:
    cat (DataFrame): The DataFrame containing the categorical data.

    Returns:
    DataFrame: One row per variable (column "Variable") and one column per category.
               When the data contains exactly two distinct categories overall (the usual
               0/1 flag case) the category columns are labelled 'no' and 'yes', in sorted
               category order. With any other number of categories the original category
               labels are kept as column names. A category that never occurs for a
               variable is reported as "0.00%".
               If `cat` has no columns, an empty DataFrame with only a "Variable" column
               is returned.
    """
    # Nothing to summarise: return an empty frame instead of letting pd.concat raise
    if len(cat.columns) == 0:
        return pd.DataFrame(columns=["Variable"])

    # Build one long-format table (Category, Proportion, Variable) per column
    proportion_tables_list = [
        cat[col]
        .value_counts(normalize=True)
        .rename_axis("Category")
        .reset_index(name="Proportion")
        .assign(Variable=col)
        for col in cat.columns
    ]

    # Stack the per-column tables, then pivot to one row per variable / one column per category
    proportion_tables_df = pd.concat(proportion_tables_list, ignore_index=True)
    proportion_tables_df_pivoted = proportion_tables_df.pivot(
        index="Variable", columns="Category", values="Proportion"
    )

    # A category that is absent from a variable has a share of zero, not a missing share
    percentages = proportion_tables_df_pivoted.fillna(0) * 100

    # Format column by column with Series.map, which is available in every pandas version
    formatted_percentage = percentages.apply(lambda column: column.map("{:.2f}%".format))

    if formatted_percentage.shape[1] == 2:
        # Binary flags: keep the historical 'no' / 'yes' labels
        formatted_percentage.columns = ["no", "yes"]
    else:
        # More (or fewer) than two categories: relabelling would fail, so keep the real labels
        formatted_percentage = formatted_percentage.rename_axis(columns=None)

    return formatted_percentage.reset_index()

# Use: build the percentage table for the categorical columns and display it
proportion_tables_df_final = proportion_tables(cat)
proportion_tables_df_final

In [None]:
def plot_and_return_categorical_counts(cat, df):
    """
    Draws one annotated count plot per categorical column and returns the counts as a table.

    Parameters:
    cat (DataFrame): The DataFrame whose column names identify the categorical columns to plot.
    df (DataFrame): The main DataFrame containing all data; the plotted values are read from it.

    Returns:
    DataFrame: Long-format count data with the columns "Category", "Counts" and "Variable"
               (one row per category of each plotted column). If `cat` has no columns, an
               empty DataFrame with those three columns is returned.
    """
    # Collect one count table per column
    count_data_list = []

    for column in cat:
        # Compute the counts once; they feed both the returned table and the y-axis limit
        counts = df[column].value_counts()

        plt.figure(figsize=(3, 5))
        countplot = sns.countplot(
            x=column,
            hue=column,
            data=df,
            palette="Set1"
        )

        # Axis labels and a legend placed outside the plotting area
        countplot.set_xlabel("Category Value")
        countplot.set_ylabel("Frequency")
        plt.legend(title=column, loc="upper left", bbox_to_anchor=(1, 1))

        # Annotate each bar with its height (count value)
        for p in countplot.patches:
            height = p.get_height()
            # Patches for empty hue/x combinations can have a NaN height, which int() cannot
            # convert; skip those as well as zero-height bars
            if pd.isna(height):
                continue
            count = int(height)
            if count <= 0:
                continue
            countplot.annotate(
                str(count),
                (p.get_x() + p.get_width() / 2., height),
                ha="center", va="center",
                xytext=(0, 10),  # Offset the text by 10 points vertically to put it above the bar
                textcoords="offset points"
            )

        # Leave 10% headroom above the tallest bar for the annotations
        # (skipped when the column has no non-missing values, so there is no tallest bar)
        if not counts.empty:
            plt.ylim(0, counts.max() * 1.1)

        plt.show()

        # Prepare this column's count table for the returned DataFrame
        count_data = counts.rename_axis("Category").reset_index(name="Counts")
        count_data["Variable"] = column
        count_data_list.append(count_data)

    # pd.concat raises on an empty list, so return an empty table with the expected columns
    if not count_data_list:
        return pd.DataFrame(columns=["Category", "Counts", "Variable"])

    return pd.concat(count_data_list, ignore_index=True)

# Use: plot every categorical column and display the combined count table
all_counts = plot_and_return_categorical_counts(cat, df)
all_counts

# Numerical variables

In [None]:
# Keep only the columns with 5 or more distinct values (treated as numerical in this notebook)
num = df[df.columns[df.nunique() >= 5]]
# Display the numerical subset
num

Of **119206 bookings**:
- rooms were **booked 2 (median) - 3.5 (mean) months ahead**
  - median < mean = more people tend to book closer to the stay-date
  - but some far ahead bookings (2 years) drag the distribution tail to the right
- **peak arrivals** are **mid-June to mid-July**, as shown by:
  - arrival_date_week_number: 27 (mean) - 28 (median),
  - arrival_date_day_of_month: 15 
  - arrival_date_month: 6.6 (mean) - 7 (median)
- bookings include at least **1 weekend day and/or 2 weekdays**
- bookings were made for **2 adults**, no children or babies
- **no history of cancelations**, which aligns with 97% being first-timers
- **car parking space not required**
- **no special requests**
- **daily rate: 95 (median) - 102 (mean)**

**Outliers:**
- 2 years lead time
- 19 weekends
- 50 days
- 55 adults
- 10 children
- 10 babies
- 26 cancelations
- 8 car parking spaces
- 5 special requests
- daily rate: -6.38
- daily rate: 5400

## Normality
### Visual check for normality
The histograms of 11/14 variables show non-normal distribution, right-skewed, with most of the data concentrated on the lower end.
- **arrival_date_week_number**: looks more uniform, but still not normal
- **arrival_date_day_of_month**: fairly uniform
- **arrival_date_month**: roughly symmetrical, but neither normally nor uniformly distributed.

In [None]:
# One histogram per numerical column (60 bins each) for a visual check of the distributions
num.hist(figsize=(15, 20), bins=60, xlabelsize=10, ylabelsize=10)

# Adjust subplot spacing before rendering the figure
plt.tight_layout()
plt.show()

## Normality tests
- The numerical variables were subjected to two normality tests, starting from the null hypothesis:
  - H0: The sample is derived from a population that follows a normal distribution.
  - The significance level (alpha) is set at 0.05.

 
- All the p-values from the **Shapiro-Wilk** test are very low, indicating a rejection of the null hypothesis of normality for every variable.
- Likewise, the **Kolmogorov-Smirnov** test results also show p-values close to zero, leading to the same conclusion.\
With these consistent results from multiple normality tests, along with the high asymmetry and long tails, the evidence strongly suggests that the **data is not normally distributed**.

In [None]:
def test_normality(num, alpha=0.05, shapiro_max_n=5000, random_state=1):
    """
    Tests normality for each numerical column in the DataFrame using the Shapiro-Wilk and
    Kolmogorov-Smirnov tests, and returns the results in a DataFrame.

    Missing values are dropped per column before testing. Columns that cannot be tested
    (fewer than 3 non-missing values, or only a single distinct value) get NaN statistics
    and the diagnosis "Not testable" instead of aborting the whole run.

    Note: the Kolmogorov-Smirnov test is run on data standardized with the sample's own mean
    and standard deviation. Because those parameters are estimated from the same sample, the
    reported KS p-values are approximate (no Lilliefors-type correction is applied).

    Parameters:
    num (DataFrame): DataFrame containing numerical columns to test.
    alpha (float): Significance level used to determine normality.
    shapiro_max_n (int): Largest sample passed to the Shapiro-Wilk test; longer columns are
                         randomly down-sampled to this size.
    random_state (int): Seed for that down-sampling, so results are reproducible.

    Returns:
    DataFrame: One row per column with the columns "Column", "Shapiro Statistic",
               "Shapiro P-value", "Shapiro Diagnosis", "KS Statistic", "KS P-value" and
               "KS Diagnosis". Diagnoses are "Normal" (p > alpha), "Not normal", or
               "Not testable".
    """
    result_columns = [
        "Column",
        "Shapiro Statistic",
        "Shapiro P-value",
        "Shapiro Diagnosis",
        "KS Statistic",
        "KS P-value",
        "KS Diagnosis",
    ]
    not_testable = "Not testable"
    nan = float("nan")

    # Map a p-value to a diagnosis at the chosen significance level
    def normality_diagnosis(p_value):
        return "Normal" if p_value > alpha else "Not normal"

    rows = []
    for col in num.columns:
        data = num[col].dropna()  # Exclude missing values from the test

        # Shapiro-Wilk needs at least 3 observations, and a constant column has zero
        # standard deviation, which would turn the standardized KS input into NaN
        if len(data) < 3 or data.nunique() < 2:
            rows.append({
                "Column": col,
                "Shapiro Statistic": nan,
                "Shapiro P-value": nan,
                "Shapiro Diagnosis": not_testable,
                "KS Statistic": nan,
                "KS P-value": nan,
                "KS Diagnosis": not_testable,
            })
            continue

        # Shapiro-Wilk test on a capped sample size
        if len(data) > shapiro_max_n:
            data_shapiro = data.sample(shapiro_max_n, random_state=random_state)
        else:
            data_shapiro = data
        shapiro_stat, shapiro_p = stats.shapiro(data_shapiro)

        # Kolmogorov-Smirnov test against the standard normal, after standardization
        data_ks = (data - data.mean()) / data.std(ddof=0)
        ks_stat, ks_p = stats.kstest(data_ks, 'norm')

        rows.append({
            "Column": col,
            "Shapiro Statistic": shapiro_stat,
            "Shapiro P-value": shapiro_p,
            "Shapiro Diagnosis": normality_diagnosis(shapiro_p),
            "KS Statistic": ks_stat,
            "KS P-value": ks_p,
            "KS Diagnosis": normality_diagnosis(ks_p),
        })

    # Passing the column list keeps the layout stable even when there are no rows
    return pd.DataFrame(rows, columns=result_columns)


# The name starts with "test_": opt out of pytest collection so test runners that import this
# code do not mistake the helper for a test case.
test_normality.__test__ = False

# Use: run both normality tests on the numerical columns and display the result table
normality_test_results = test_normality(num)
display(normality_test_results)

## Shape of the distribution

- **Skewness**:
  - An absolute skewness value in [0, 2] indicates a moderately asymmetric distribution, which is the case for 6/14 columns.
  - **8/14 of the columns exhibit highly asymmetric data.**
- **Kurtosis**:
  - A kurtosis value of around 3 is expected for a normal distribution. **None of the columns have normal distribution**.
  - **9/14 columns have heavy tails or outliers** (high kurtosis >3).
  - Low kurtosis (<3) suggests light tails or lack of outliers, which is the case for 5/14 columns.

In [None]:
def analyze_skewness_kurtosis(df, num_columns):
    """
    Calculates skewness and kurtosis for each numerical column in the DataFrame and provides
    a diagnosis based on their values.

    Kurtosis is reported on the Pearson scale, where a normal distribution has a kurtosis
    of 3. pandas' `kurt()` returns *excess* kurtosis (normal distribution = 0), so 3 is added
    to it before it is stored and compared against the threshold of 3.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    num_columns (list): List of numerical column names to analyze.

    Returns:
    DataFrame: One row per column with the columns "Column", "Skewness", "Kurtosis",
               "Skew Diagnosis" ("Highly skewed" when |skewness| > 2, otherwise
               "Moderately skewed") and "Kurtosis Diagnosis" ("Heavy-tailed" above 3,
               "Light-tailed" below 3, "Mesokurtic" at exactly 3). A statistic that pandas
               cannot compute (NaN, e.g. too few observations) is diagnosed as "Undefined".
    """
    result_columns = ["Column", "Skewness", "Kurtosis", "Skew Diagnosis", "Kurtosis Diagnosis"]
    undefined = "Undefined"

    rows = []
    for col in num_columns:
        skew = df[col].skew()
        # Shift pandas' excess kurtosis onto the scale where the normal distribution equals 3
        kurt = df[col].kurt() + 3

        # Determine skew diagnosis (NaN must not silently fall into a real category)
        if pd.isna(skew):
            skew_label = undefined
        elif abs(skew) > 2:
            skew_label = "Highly skewed"
        else:
            skew_label = "Moderately skewed"

        # Determine kurtosis diagnosis relative to the normal distribution's value of 3
        if pd.isna(kurt):
            kurt_label = undefined
        elif kurt > 3:
            kurt_label = "Heavy-tailed"
        elif kurt < 3:
            kurt_label = "Light-tailed"
        else:
            kurt_label = "Mesokurtic"

        rows.append({
            "Column": col,
            "Skewness": skew,
            "Kurtosis": kurt,
            "Skew Diagnosis": skew_label,
            "Kurtosis Diagnosis": kurt_label,
        })

    # Passing the column list keeps the layout stable even when there are no rows
    return pd.DataFrame(rows, columns=result_columns)

# Use: compute skewness/kurtosis diagnoses for the numerical columns and display them
skew_kurt_results = analyze_skewness_kurtosis(df, num.columns)
skew_kurt_results

# Summary

- univariate analysis:
  - numerical variables:
    - **normality, visual check** (histogram): 11/14 variables show **non-normal distribution**, right-skewed, most data concentrated on the lower end
    - **normality test** (shapiro-wilk, kolmogorov-smirnov): **non-normal distribution** (all the p-values < 0.05)
    - **shape of distribution**:
      - skewness: 8/14 variables exhibit **highly asymmetric data**, 6/14 moderately asymmetric
      - kurtosis: 9/14 variables have **heavy tails or outliers**, 5/14 light tails or lack of outliers
---

Next: notebook_03_bivariate_analysis