In [7]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from math import ceil
from plotly.subplots import make_subplots
import statsmodels.tsa.stattools as stattools

In [8]:
# Read the processed flights table from disk and preview its first rows.
# NOTE(review): the file name suggests rows with missing values were already
# dropped upstream — confirm against the preprocessing step.
data_path = '../data/processedData/deleted_na.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,from_airport_code,dest_airport_code,aircraft_type,airline_number,airline_name,duration,stops,price
0,25,54,4228,120,3487,2960,3,7255.0
1,14,3,6662,120,3727,940,1,917.0
2,28,62,4726,94,3633,670,2,502.0
3,6,59,7836,120,2694,1998,2,3222.0
4,21,7,6175,120,1450,1685,2,1363.0


In [9]:
# Pairwise correlation matrix over the columns of df
corr = df.corr()

# Use the string form of every column label on both axes of the matrix
ax_names = [str(label) for label in corr.columns]
corr.index = ax_names
corr.columns = ax_names

# True on and above the diagonal; those cells are blanked below so that only
# the lower triangle of the (symmetric) matrix is drawn
mask = np.triu(np.ones_like(corr, dtype=bool))

# Heatmap trace: fixed [-1, 1] colour range, reversed RdBu scale,
# and no hover box over the blanked cells
heatmap = go.Heatmap(
    z=corr.mask(mask),
    x=ax_names,
    y=ax_names,
    colorscale=px.colors.diverging.RdBu,
    reversescale=True,
    zmin=-1,
    zmax=1,
    hoverongaps=False,
    hovertemplate='Correlation: %{z:.2f}<br>',
)
fig_c = go.Figure(heatmap)

# Square cells (x scale anchored to y), first row at the top, fixed 1000x1000 canvas
fig_c.update_layout(
    xaxis=dict(scaleanchor="y"),
    yaxis_autorange='reversed',
    autosize=False,
    width=1000,
    height=1000,
    font=dict(size=18),
    hoverlabel=dict(font=dict(size=18)),
)

fig_c.show()

In [10]:
# List every column pair whose absolute correlation lies strictly between 0.4 and 1.
values = corr.values
magnitude = np.abs(values)
is_strong = (magnitude > 0.4) & (magnitude < 1)

# Positions of all qualifying cells in the matrix
rows, cols = np.nonzero(is_strong)

# The matrix is symmetric: keep each pair once (row position before column position)
upper = rows < cols
rows, cols = rows[upper], cols[upper]

# Two-level index made of the row label and the column label of each kept cell
index = pd.MultiIndex.from_arrays([corr.index[rows], corr.columns[cols]])

result = pd.DataFrame(data=values[rows, cols].tolist(), index=index, columns=['Correlation'])
result

Unnamed: 0,Unnamed: 1,Correlation
airline_number,stops,0.416014
duration,stops,0.55266
duration,price,0.429415


## Cross-correlation plots

In [11]:
# Upper bound on how many leading cross-correlation values are drawn per subplot
max_lag = 800
# Boolean Series flagging the rows whose 'price' is missing
price_na = df['price'].isna()

In [12]:
# Cross-correlation of every non-'price' column against 'price', drawn as one
# subplot per column. Reads `df` and `max_lag` defined in earlier cells and
# leaves `ccf_dict` (column name -> ccf array) and the figure `fig_a` behind.

# Rows whose 'price' is missing cannot take part in any pairing
price_na = df['price'].isna()

# Calculate cross-correlation for each column with 'price'
ccf_dict = {}
for col in df.columns:
    if col == 'price':
        continue
    # Keep only the rows where both 'price' and the current column are present
    mask = ~(price_na | df[col].isna())
    ccf_dict[col] = stattools.ccf(df.loc[mask, 'price'], df.loc[mask, col])

# Grid layout: two subplots per row, as many rows as needed
n_cols = 2
n_rows = ceil(len(ccf_dict) / n_cols)

# Titles are padded with empty strings so unused grid cells stay untitled
fig_a = make_subplots(rows=n_rows, cols=n_cols,
                      subplot_titles=list(ccf_dict.keys()) + [''] * (n_rows * n_cols - len(ccf_dict)),
                      horizontal_spacing=0.04, vertical_spacing=0.03)

# One line trace per column, filled row by row across the grid
for idx, (col, ccf) in enumerate(ccf_dict.items()):
    # At most max_lag values are shown; the slice is shorter when fewer are
    # available, so x is sized from it to keep x and y the same length.
    shown = ccf[:max_lag]
    # NOTE(review): x is the lag index scaled by 5 — the unit behind the
    # factor 5 is not visible in this notebook; confirm.
    fig_a.add_trace(go.Scatter(x=5 * np.arange(len(shown)), y=shown,
                               name=col,
                               mode='lines'),
                    row=(idx // n_cols) + 1,
                    col=(idx % n_cols) + 1)

# Red vertical reference line at x=300 in every subplot that holds a trace.
# Without row/col the shape is attached to the first subplot's axes only.
fig_a.add_shape(type="line", x0=300, x1=300, y0=-1, y1=1,
                line=dict(color="red", width=2),
                row="all", col="all", exclude_empty_subplots=True)

# Update layout and show the plot
fig_a.update_layout(legend=dict(orientation="h", yanchor="bottom", xanchor="left", y=1.01, x=0),
                    font=dict(size=18),
                    height=4000)
fig_a.update_traces(opacity=0.7)
fig_a.show()