In [1]:
#import bibliotek
pip install mysql-connector-python

import mysql.connector as sql
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import numpy as np

# Database connection settings.
# NOTE(review): these credentials are hard-coded placeholders; load them from
# the environment or a secrets store before sharing or committing this file.
username = 'nick'
password = 'haslo'
host = 'localhost'
database = 'airlines'
# The previous `port = port` referenced an undefined name and raised NameError
# on a fresh kernel. 5432 is PostgreSQL's default port - adjust it if the
# server listens elsewhere.
port = 5432

# Build the connection URL and the SQLAlchemy engine used for all queries.
url = f"postgresql://{username}:{password}@{host}:{port}/{database}"
engine = sqlalchemy.create_engine(url)



# Helper that loads a complete table into a DataFrame
def read_sql_table(table):
    """Load every row of a database table into a pandas DataFrame.

    Args:
        table: Name of the table to read. The name is interpolated directly
            into the SQL text, so it must come from trusted code and never
            from user input.

    Returns:
        A DataFrame holding the result of ``SELECT * FROM <table>``.
    """
    # Use the module-level engine instead of the raw URL string, so the
    # engine created above is actually used rather than having pandas build
    # a fresh connectable on every call.
    return pd.read_sql(f"SELECT * FROM {table}", engine)



# Load the flights table
flight_df_raw = read_sql_table('flight')

# Clean the frame: drop flights from 2020 and cancelled flights.
# The explicit .copy() makes the result an independent frame, so the column
# assignments performed later do not trigger SettingWithCopyWarning.
flight_df_raw = flight_df_raw[
    (flight_df_raw['year'] != 2020) & (flight_df_raw['cancelled'] == 0)
].copy()

# Rename the delay column (non-inplace rename avoids mutating a filtered view)
flight_df_raw = flight_df_raw.rename(columns={'dep_delay_new': 'dep_delay'})
flight_df_raw

# Check that no flights from 2020 remain
flight_df_year_test = flight_df_raw.loc[flight_df_raw['year'] == 2020].shape[0]
assert flight_df_year_test == 0, 'The `flight_df` frame still contains flights from 2020'

# Check that no cancelled flights remain
flight_df_cancelled_test = flight_df_raw.loc[flight_df_raw['cancelled'] != 0].shape[0]
assert flight_df_cancelled_test == 0, 'There are still canceled flights in the `flight_df` frame'



# Descriptive statistics for the `dep_delay` column

# Quantile levels reported alongside the basic statistics
percentiles = [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]

# Work on a single reference to the column for all computations below
_dep_delay = flight_df_raw['dep_delay']

# Number of non-null values
count_dep_delay = round(_dep_delay.count(), 2)

# Mean
mean_dep_delay = round(_dep_delay.mean(), 2)
mean_dep_delay

# Median
median_dep_delay = round(_dep_delay.median(), 2)
median_dep_delay

# Standard deviation
std_dep_delay = round(_dep_delay.std(), 2)
std_dep_delay

# Minimum
min_dep_delay = round(_dep_delay.min(), 2)
min_dep_delay

# Maximum
max_dep_delay = round(_dep_delay.max(), 2)
max_dep_delay

# Percentiles, rounded to two decimals
perc_dep_delay = round(_dep_delay.quantile(percentiles), 2)
perc_dep_delay

# Collect the results in a two-column frame: statistic name and its value
_stat_names = ['count', 'mean', 'median', 'std', 'min', 'max']
_stat_names = _stat_names + [f'percentile {p*100}%' for p in percentiles]
_stat_values = [
    count_dep_delay,
    mean_dep_delay,
    median_dep_delay,
    std_dep_delay,
    min_dep_delay,
    max_dep_delay,
] + list(perc_dep_delay)
dep_delay_statistics_df = pd.DataFrame({
    'Statistics': _stat_names,
    'Value': _stat_values,
})

def _plot_delay_histogram(delays):
    """Draw and show the departure-delay histogram for the given values.

    Bin edges run from 0 to 190 in steps of 10, matching the x-axis ticks.

    Args:
        delays: Departure-delay values to plot.

    Returns:
        The value returned by ``plt.hist``.
    """
    plt.figure(figsize=(15, 10))
    hist_result = plt.hist(delays, bins=range(0, 200, 10), edgecolor='black')
    plt.ylabel('Number of flights')
    plt.xlabel('Departure delay [min]')
    plt.title('Histogram of flight delays', fontweight='bold')
    plt.xticks(range(0, 200, 10))
    plt.xlim(left=0)
    plt.show()
    return hist_result


# Histogram of the whole dep_delay column
x = _plot_delay_histogram(flight_df_raw['dep_delay'])

# Histogram restricted to dep_delay > 0.
# The previous `!= 0` test also kept NaN (and any negative) values, which
# contradicts the stated condition; `> 0` keeps strictly positive delays only.
filtered_flight_df = flight_df_raw[flight_df_raw['dep_delay'] > 0]
x = _plot_delay_histogram(filtered_flight_df['dep_delay'])

# Histogram restricted to positive delays that do not exceed the 95th
# percentile of those positive delays
filtered_flight_df = filtered_flight_df[
    filtered_flight_df['dep_delay'] <= filtered_flight_df['dep_delay'].quantile(0.95)
]
x = _plot_delay_histogram(filtered_flight_df['dep_delay'])



# Delay analysis

# Flag flights whose departure delay exceeds 15 minutes
flight_df_raw['is_delayed'] = flight_df_raw['dep_delay'] > 15

# Share of delayed flights, kept as a fraction in [0, 1] rounded to 2 decimals
total_flights = len(flight_df_raw)
delayed_flights = flight_df_raw['is_delayed'].sum()
delayed_ratio = round((delayed_flights / total_flights), 2)
# `delayed_ratio` is a fraction, so appending a bare "%" (as before) printed
# e.g. "0.19%" for 19 %. The percent format spec scales it by 100 first.
print(f"{delayed_ratio:.0%}")



# Delays by calendar month

# Mean of the boolean `is_delayed` flag per month, i.e. the share of
# delayed flights in each month
_by_month = flight_df_raw.groupby('month')
flight_delays_by_month_df = _by_month['is_delayed'].mean()

# Bar chart built from flight_delays_by_month_df
_month_names = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
plt.figure(figsize=(15, 10))
plt.bar(
    flight_delays_by_month_df.index,
    flight_delays_by_month_df.values,
    edgecolor='black',
)
plt.title('Percentage of delays depending on the month', fontweight='bold')
plt.xlabel('Month')
plt.ylabel('Percentage of delays')
plt.xticks(range(1, 13), _month_names, rotation=45)
plt.ylim(0, 1)  # Y axis spans 0 to 1 because the values are fractions
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.show()



# Delays by day of the week

# Mean of the boolean `is_delayed` flag per `day_of_week` value
_by_weekday = flight_df_raw.groupby('day_of_week')
flight_delays_by_weekday_df = _by_weekday['is_delayed'].mean()

# Bar chart built from flight_delays_by_weekday_df
_weekday_names = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
]
plt.figure(figsize=(15, 10))
plt.bar(
    flight_delays_by_weekday_df.index,
    flight_delays_by_weekday_df.values,
    edgecolor='black',
)
plt.title('Percentage of delays depending on the day of the week', fontweight='bold')
plt.xlabel('Day of the week')
plt.ylabel('Percentage of delays')
plt.xticks(range(1, 8), _weekday_names, rotation=45)
plt.ylim(0, 1)  # Y axis spans 0 to 1 because the values are fractions
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.show()



# Add a flag column: `day_of_week` values 6 and 7 are treated as the weekend
_weekend_days = [6, 7]
flight_df_raw['is_weekend'] = flight_df_raw['day_of_week'].isin(_weekend_days)

# Share of delayed flights for weekdays vs. weekend, rounded to 2 decimals
_weekend_delay_share = flight_df_raw.groupby('is_weekend')['is_delayed'].mean()
flight_delays_by_weekend_df = _weekend_delay_share.round(2)

# Bar chart built from flight_delays_by_weekend_df
plt.figure(figsize=(12, 8))
plt.bar(
    flight_delays_by_weekend_df.index,
    flight_delays_by_weekend_df.values,
    edgecolor='black',
)
plt.title('Percentage of delays depending on the day type', fontweight='bold')
plt.xlabel('Day type')
plt.ylabel('Percentage of delays')
plt.xticks([0, 1], ['Weekdays', 'Weekend'], rotation=45)
plt.ylim(0, 1)  # Y axis spans 0 to 1 because the values are fractions
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.show()

# Conclusion drawn from the chart above
print("The percentage of delays depends on whether the flight occurred on a weekend.")



# Delay vs. flight distance

# Descriptive statistics for the `distance` column

# Quantile levels reported alongside the basic statistics
percentiles = [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]

# Work on a single reference to the column for all computations below
_distance = flight_df_raw['distance']

# Number of non-null values
count_distance = round(_distance.count(), 2)

# Mean
mean_distance = round(_distance.mean(), 2)

# Median
median_distance = round(_distance.median(), 2)

# Standard deviation
std_distance = round(_distance.std(), 2)

# Minimum
min_distance = round(_distance.min(), 2)

# Maximum
max_distance = round(_distance.max(), 2)

# Percentiles, rounded to two decimals
perc_distance = round(_distance.quantile(percentiles), 2)

# Collect the results in a two-column frame: statistic name and its value
_distance_stat_names = ['count', 'mean', 'median', 'std', 'min', 'max']
_distance_stat_names = _distance_stat_names + [f'percentile {p*100}%' for p in percentiles]
_distance_stat_values = [
    count_distance,
    mean_distance,
    median_distance,
    std_distance,
    min_distance,
    max_distance,
] + list(perc_distance)
flight_distance_analysis_df = pd.DataFrame({
    'Statistics': _distance_stat_names,
    'Value': _distance_stat_values,
})

# Random sample of up to 10,000 rows (fixed seed for reproducibility).
# DataFrame.sample raises ValueError when n exceeds the number of rows, so
# the sample size is capped at the frame length for small data sets.
sample_size = min(10000, len(flight_df_raw))
random_sample = flight_df_raw.sample(n=sample_size, random_state=1)

# Scatter plot of departure delay against flight distance
plt.figure(figsize=(15, 10))
plt.scatter(random_sample['distance'], random_sample['dep_delay'], alpha=0.5)
plt.xlabel('Flight distance [mile]')
plt.ylabel('Departure delay [min]')
plt.title('Dependency between Departure Delay and Flight Distance', fontweight='bold')
plt.grid(alpha = 0.5)
plt.show()

# Remove rows whose distance is above the 95th percentile.
# The .copy() yields an independent frame, so the columns added to it later
# do not trigger SettingWithCopyWarning.
flight_df_raw = flight_df_raw[
    flight_df_raw['distance'] <= flight_df_raw['distance'].quantile(0.95)
].copy()



# Aggregation of `distance` into bins and the average delay per bin

# Bin edges every 100 miles
bin_width = 100

max_distance = 2500
bins = list(range(0, max_distance + 1, bin_width))

# Labels describe left-closed bins: '0-99', '100-199', ...
distance_labels = [f'{x}-{x+bin_width-1}' for x in bins[:-1]]

# Assign each flight to a distance bin.
# right=False makes the intervals [0, 100), [100, 200), ... so they agree with
# the labels; the default right-closed intervals (0, 100], (100, 200], ...
# put a distance of exactly 100 into the bin labelled '0-99'.
flight_df_raw['distance_agg'] = pd.cut(
    flight_df_raw['distance'],
    bins=bins,
    labels=distance_labels,
    right=False,
    ordered=False,
)
# observed=False keeps every bin in the result (empty bins yield NaN), which
# pins the behaviour regardless of the pandas default for categorical groupers.
flight_delays_by_distance_agg_df = (
    flight_df_raw.groupby('distance_agg', observed=False)['dep_delay']
    .mean()
    .reset_index()
)

flight_df_raw['distance_agg'].unique().sort_values()

# Bar chart built from flight_delays_by_distance_agg_df
plt.figure(figsize=(15, 10))
plt.bar(flight_delays_by_distance_agg_df['distance_agg'], flight_delays_by_distance_agg_df['dep_delay'])
plt.xlabel('Flight distance [Mile]')
plt.ylabel('Average departure delay')
plt.title('Delays depending on flight distance', fontweight='bold')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.show()

assert 'distance_agg' in flight_df_raw.columns, 'Nie odnaleziono kolumny distance_agg w ramce flight_df'



# Delay vs. distance group

# Smallest and largest distance observed within each distance group
_by_distance_group = flight_df_raw.groupby('distance_group')
flight_distance_by_distance_group = (
    _by_distance_group['distance'].agg(['min', 'max']).reset_index()
)

flight_distance_by_distance_group

# Average departure delay within each distance group
flight_delays_by_distance_group_df = (
    flight_df_raw.groupby('distance_group')['dep_delay'].mean().reset_index()
)

# Bar chart built from flight_delays_by_distance_group_df
plt.figure(figsize=(15, 10))
plt.bar(
    flight_delays_by_distance_group_df['distance_group'],
    flight_delays_by_distance_group_df['dep_delay'],
)
plt.title('Delays depending on distance group', fontweight='bold')
plt.xlabel('Distance group')
plt.ylabel('Average departure delay')
plt.grid(axis='y',linestyle='--', alpha = 0.5 )
plt.xticks(range(0, 11, 1))
plt.show()

# Persist the processed frame to CSV, without the index column
flight_df_raw.to_csv(r"../data/raw/flight_df_01.csv", index=False)