## Separate the variables relevant for clustering analysis into a new dataframe

In [None]:
from scipy import stats
# Point-biserial correlation between DepDel15 and FlightsSameTime80.
# Missing DepDel15 values are filled with 0 before the test
# (presumably 0 means "not delayed" -- TODO confirm that treating missing
# values this way is intended).
# `Series.fillna` is used instead of `replace(np.NaN, 0)` because the
# `np.NaN` alias was removed in NumPy 2.0.
correlation_coefficient, p_value = stats.pointbiserialr(
    df['DepDel15'].fillna(0),
    df['FlightsSameTime80'],
)

print(correlation_coefficient)
print(p_value)

In [None]:
# Cross-tabulate DepDel15 (rows) against FlightsSameTime80 (columns).
contingency_table = pd.crosstab(
    index=df['DepDel15'],
    columns=df['FlightsSameTime80'],
)

# Chi-squared test of independence on the table; only the statistic and
# the p-value (the first two returned values) are kept.
test_result = chi2_contingency(contingency_table)
chi2 = test_result[0]
p = test_result[1]

print(f"Chi-squared value: {chi2}")
print(f"P-value: {p}")

In [None]:
import statsmodels.api as sm
# Rows are the FlightsSameTime80 levels, columns are the DepDel15 levels.
contingency_table = pd.crosstab(df['FlightsSameTime80'], df['DepDel15'])

# Perform z-test for proportions.
#   count: the second row of the table, i.e. the number of observations at
#          the second FlightsSameTime80 level within each DepDel15 column.
#   nobs:  the column totals, i.e. the number of observations per DepDel15
#          level.
# NOTE(review): this assumes FlightsSameTime80 has exactly two levels, so
# that row 1 is the "positive" level -- TODO confirm.
z_stat, p_value = sm.stats.proportions_ztest(
    contingency_table.values[1],
    contingency_table.sum(axis=0),
)

# Report the outcome; previously the statistic was discarded and the
# p-value was never displayed.
print(f"Z-statistic: {z_stat}")
print(f"P-value: {p_value}")

In [None]:
# Width of each distance bin, in the same units as df['Distance'].
bin_width = 250

# Number of bins needed so that the largest distance falls inside the last bin.
max_distance = df['Distance'].max()
num_bins = int(np.ceil(max_distance / bin_width))

# Bin edges: 0, bin_width, 2 * bin_width, ..., num_bins * bin_width.
distance_bins = np.arange(0, (num_bins + 1) * bin_width, bin_width)

# Label every bin with its left edge. The edges must be `distance_bins`;
# the previous code referenced an undefined name `bins`.
# NOTE(review): with pd.cut's default right-closed intervals, a Distance of
# exactly 0 falls outside the first bin and becomes NaN -- confirm that is
# acceptable for this data.
df['distanceBins'] = pd.cut(
    df['Distance'],
    bins=distance_bins,
    labels=distance_bins[:-1],
)

In [None]:
# Columns carried over into the clustering dataframe.
# `.copy()` makes df_cluster an independent dataframe, so assigning to its
# columns later neither raises SettingWithCopyWarning nor touches `df`.
df_cluster = df[['DayOfMonth',
'DayOfWeek',
'Reporting_Airline',
'Origin',
'Dest',
'distanceBins',
'DepHour',
'FlightsSameTime80',
'DepDel15']].copy()

In [None]:
from prince import MCA
# Make sure 'DepDel15' is treated as a categorical variable
# NOTE(review): if df_cluster was created as a column subset of another
# dataframe, this in-place column assignment can emit
# SettingWithCopyWarning on older pandas -- consider taking an explicit
# copy where df_cluster is built.
df_cluster['DepDel15'] = df_cluster['DepDel15'].astype('category')
# Only columns whose dtype is object or category are selected here.
# NOTE(review): any df_cluster column stored with a numeric dtype is left
# out of the MCA input -- confirm that this is intended.
categorical_columns = df_cluster.select_dtypes(include=['object', 'category'])

# Create an MCA object (library default settings) and fit it to the
# selected categorical columns
mca = MCA()
mca.fit(categorical_columns)

# Transform the selected categorical columns into MCA coordinates
mca_coordinates = mca.transform(categorical_columns)

# Access the results
# NOTE(review): the attribute names exposed by a fitted MCA object depend on
# the installed `prince` version -- verify that `explained_inertia_` exists
# in the version in use.
eigenvalues = mca.eigenvalues_
explained_var_ratio = mca.explained_inertia_

In [None]:
# Scree plot: one marker per entry of `eigenvalues`, with dimensions
# numbered from 1 on the x-axis.
fig, ax = plt.subplots(figsize=(8, 5))
dimensions = range(1, len(eigenvalues) + 1)
ax.plot(dimensions, eigenvalues, marker='o')
ax.set_xlabel('Dimension')
ax.set_ylabel('Eigenvalue')
ax.set_title('Scree Plot')
ax.grid(True)