<a href="https://www.kaggle.com/code/mohamedalaa40123/automated-eda-final?scriptVersionId=143918517" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# **Import Libraries**

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import plotly.io as pio
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import sqlalchemy as sa
import os
import scipy

# **Load Data**

In [None]:
def load_data(file_path):
    """Load a tabular data file into a DataFrame, dispatching on its extension.

    The extension is compared case-insensitively, so ``data.CSV`` and
    ``data.csv`` are handled the same way.

    Args:
        file_path: The path to the file to load. Recognised extensions are
            ``.csv``, ``.xlsx``/``.xls`` and ``.sql``.

    Returns:
        A Pandas DataFrame containing the data from the file.

    Raises:
        ValueError: If the file extension is not one of the recognised types.
    """
    raw_extension = os.path.splitext(file_path)[1]
    # Normalise the case so upper-case extensions are not rejected.
    file_extension = raw_extension.lower()

    if file_extension == ".csv":
        return pd.read_csv(file_path)

    if file_extension in (".xlsx", ".xls"):
        return pd.read_excel(file_path)

    if file_extension == ".sql":
        # NOTE(review): this branch is a template, not a working loader. The
        # connection URL and the table name below are literal placeholders and
        # `file_path` itself is never read. Replace both (and load credentials
        # from the environment rather than hard-coding them) before relying on it.
        engine = sa.create_engine("database://user:password@host:port/database")
        return pd.read_sql_query("SELECT * FROM your_table_name", engine)

    # Report the extension exactly as the caller supplied it.
    raise ValueError(f"Unsupported file type: {raw_extension}")


# **Explore The Data**

In [None]:
# Location of the departures CSV inside the Kaggle input directory.
file_path = "/kaggle/input/us-international-air-traffic-data/International_Report_Departures.csv"
# Load the raw file (no preprocessing happens here) and preview the first rows.
df = load_data(file_path)
df.head()


In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

> **The data has 930,808 records and 16 features.**

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

In [None]:
# Summary statistics (pandas describes the numeric columns by default).
df.describe()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove exact duplicate rows. Rebinding (instead of inplace=True) makes the
# state change explicit and keeps the call chainable.
df = df.drop_duplicates()

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Drop every row that contains at least one missing value. Rebinding (instead
# of inplace=True) makes the state change explicit and keeps the call chainable.
df = df.dropna()

In [None]:
# Re-check missing values per column after the rows with NaNs were dropped.
df.isna().sum()

# **Let's Find the Top 10**

In [None]:
# Sum "Total" for each `usg_apt` value, rank the sums from largest to
# smallest, and keep the first ten rows of that ranking.
airliness_by_usairport = (
    df.groupby("usg_apt")["Total"]
    .sum()
    .reset_index()
    .sort_values(by="Total", ascending=False)
    .reset_index(drop=True)
)
top10_airports = airliness_by_usairport.head(10)
top10_airports

In [None]:
# Bar chart of the ten `usg_apt` values with the largest summed "Total".
# The rows are re-sorted ascending by "Total" before being handed to Plotly.
top10_ascending = top10_airports.sort_values(by='Total')

fig = px.bar(
    top10_ascending,
    x="Total",
    y="usg_apt",
    title="Top 10 Busiest US Airports",
    hover_name="usg_apt",
    color="Total",
)
fig.show()

According to this dataset, Miami and JFK are the busiest airports in the US.

# **Continue Processing the Data and Selecting Features**

In [None]:
def preprocess_data(data, num_imputer_strategy='mean', num_scaler='standard', cat_encoder='one-hot'):
    """Impute the numeric columns and one-hot encode the object columns.

    The caller's DataFrame is not modified; all work is done on a copy.

    Args:
        data: DataFrame to preprocess.
        num_imputer_strategy: 'mean' or 'median'. Strategy handed to
            ``SimpleImputer`` for the int64/float64 columns.
        num_scaler: Accepted for interface compatibility but currently
            unused -- no scaling is applied to the data.
        cat_encoder: Only 'one-hot' is supported; object-dtype columns are
            expanded with ``pd.get_dummies``.

    Returns:
        A new DataFrame in which the int64/float64 columns have been imputed
        and every object-dtype column is replaced by its dummy columns.

    Raises:
        ValueError: If ``num_imputer_strategy`` or ``cat_encoder`` is not one
            of the supported values.
    """
    # Validate both options up front so a bad argument fails before any
    # (potentially expensive) work is done on the data.
    if num_imputer_strategy not in ('mean', 'median'):
        raise ValueError("Invalid numerical imputer strategy")
    if cat_encoder != 'one-hot':
        raise ValueError("Invalid categorical encoder")

    # Work on a copy: the column assignment below would otherwise overwrite
    # the numeric columns of the frame the caller passed in.
    data = data.copy()

    numerical_features = data.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = data.select_dtypes(include=['object']).columns

    # Only impute when there is something to impute; fitting the imputer on a
    # frame with zero columns fails.
    if len(numerical_features) > 0:
        num_imputer = SimpleImputer(strategy=num_imputer_strategy)
        data[numerical_features] = num_imputer.fit_transform(data[numerical_features])

    return pd.get_dummies(data, columns=list(categorical_features))


# **Functions For Visualization**

In [None]:
def create_scatter_plot(df, feature):
    """Build and return a Plotly Express scatter figure of `feature` (x) against 'Total' (y)."""
    plot_title = f'Scatter Plot of {feature}'
    return px.scatter(df, x=feature, y='Total', title=plot_title)

def create_pie_plot(df, feature):
    """Show a pie chart of the value counts of `feature` and return its figure.

    Args:
        df: DataFrame holding the column to plot.
        feature: Name of the column whose value counts become the slices.

    Returns:
        The matplotlib Figure the pie was drawn on, so callers that collect
        figures receive an object instead of None.
    """
    feature_counts = df[feature].value_counts()

    # Explicit figure/axes pair so the figure can be handed back to the caller.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.pie(feature_counts, labels=feature_counts.index, autopct='%1.1f%%', startangle=140)
    ax.axis('equal')  # equal aspect ratio keeps the pie circular
    ax.set_title(f'Pie Plot of {feature}')
    plt.show()
    return fig

def create_bar_plot(df, feature):
    """Show a count plot of `feature` and return its figure.

    Args:
        df: DataFrame holding the column to plot.
        feature: Name of the column whose values are counted on the x axis.

    Returns:
        The matplotlib Figure the bars were drawn on, so callers that collect
        figures receive an object instead of None.
    """
    # Keep a handle on the figure so it can be returned after showing it.
    fig = plt.figure(figsize=(8, 6))
    sns.countplot(data=df, x=feature)
    plt.title(f'Bar Plot of {feature}')
    plt.xticks(rotation=45)
    plt.show()
    return fig

def create_histogram(ax, df, feature):
    """Draw a 20-bin histogram with a KDE curve for `feature` on `ax`, then show it."""
    column_values = df[feature]
    chart_title = f'Histogram of {feature}'

    sns.histplot(column_values, bins=20, kde=True, ax=ax)
    ax.set_title(chart_title)

    # Lay out and display via the pyplot state machine.
    plt.tight_layout()
    plt.show()

def create_box_plot(ax, df, feature):
    """Draw a box plot of `feature` on `ax`, then show it."""
    column_values = df[feature]
    chart_title = f'Box Plot of {feature}'

    sns.boxplot(x=column_values, ax=ax)
    ax.set_title(chart_title)

    # Lay out and display via the pyplot state machine.
    plt.tight_layout()
    plt.show()

def create_visualizations_for_column(df, column_name):
    """Render histogram, box, scatter, pie and bar plots for one column.

    Args:
        df: DataFrame holding the column (the scatter helper also reads its
            'Total' column).
        column_name: Name of the column to visualize.

    Returns:
        A tuple ``(figures, scatter)``. ``figures`` is the list
        ``[histogram figure, box-plot figure, pie result, bar result]``, where
        the last two entries are whatever ``create_pie_plot`` and
        ``create_bar_plot`` return; ``scatter`` is the object returned by
        ``create_scatter_plot``.
    """
    # Histogram and box plot each get their own freshly created 8x6 figure.
    hist_fig, hist_ax = plt.subplots(figsize=(8, 6))
    create_histogram(hist_ax, df, column_name)

    box_fig, box_ax = plt.subplots(figsize=(8, 6))
    create_box_plot(box_ax, df, column_name)

    # The remaining helpers create their own figures internally.
    scatter_fig = create_scatter_plot(df, column_name)
    pie_result = create_pie_plot(df, column_name)
    bar_result = create_bar_plot(df, column_name)

    return [hist_fig, box_fig, pie_result, bar_result], scatter_fig

In [None]:
# Same CSV path as the exploration cell earlier in the notebook.
file_path = "/kaggle/input/us-international-air-traffic-data/International_Report_Departures.csv"
# Load and preprocess data
# NOTE(review): this rebinds `df` to a fresh read of the raw file, so the
# duplicate removal and NaN-row dropping applied to the earlier `df` are not
# carried into `preprocessed_df` -- confirm that is intended.
df = load_data(file_path)
preprocessed_df = preprocess_data(df)

# Print all features
# (column names after preprocessing, i.e. including the one-hot dummy columns)
print("All Features:")
print(preprocessed_df.columns)

# **Choose a Column**

In [None]:
# Choose a specific column to visualize
column_to_visualize = "Year"

# Generate all types of plots for the chosen column
# (histogram, box, scatter vs. 'Total', pie and bar). The plots are displayed
# as a side effect; the returned scatter figure is discarded via `_`.
figures, _ = create_visualizations_for_column(preprocessed_df, column_to_visualize)

In [None]:
# Choose a specific column to visualize
column_to_visualize = "Total"

# Generate all types of plots for the chosen column
# NOTE(review): the pie and bar helpers draw one slice/bar per distinct value;
# "Total" presumably has many distinct values, which would make those two
# plots slow and unreadable -- confirm, and consider skipping them here.
figures, _ = create_visualizations_for_column(preprocessed_df, column_to_visualize)

# **Try ydata_profiling**

In [None]:
# from ydata_profiling import ProfileReport 
# profile = ProfileReport(preprocessed_df)


In [None]:
# profile

**Note:** There are other libraries for faster EDA, such as **DataPrep** or **AutoViz**.