# In [None]:
# EDA and Preprocessing for 5G QoS Dataset

import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset; 'Timestamp' is handed to parse_dates so pandas attempts to
# convert it to datetimes while reading.
df = pd.read_csv('replicated_5g_qos_events_with_device.csv', parse_dates=['Timestamp'])

# Display basic info.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly: wrapping it in print() would emit a stray "None" line.
print("Dataset Info:")
df.info()

# Check for missing values: one count of null entries per column.
print("\nMissing Values:")
print(df.isnull().sum())

# Show the number of distinct values in each column, plus a short sample.
print("\nUnique values per column:")
for column in df.columns:
    unique_values = df[column].nunique()  # nunique() skips missing values by default
    sample_values = df[column].unique()[:10]  # at most the first 10 distinct values
    print(f"{column}: {unique_values} unique values, Sample: {sample_values}")

# Bar chart of value frequencies for each categorical column of interest.
categorical_columns = ['Application_Type', 'device_type']

for col in categorical_columns:
    # One fresh figure per column; draw on its Axes explicitly.
    fig, ax = plt.subplots(figsize=(10, 6))
    category_counts = df[col].value_counts()
    category_counts.plot.bar(ax=ax)

    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    plt.xticks(rotation=45)  # tilt the category labels
    ax.grid(axis='y', linestyle='--')  # horizontal guide lines only

    plt.tight_layout()
    plt.show()

# Histogram (30 bins) for every int64 / float64 column in the frame.
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns

for col in numerical_columns:
    # One fresh figure per column; the histogram is drawn on its Axes.
    fig, ax = plt.subplots(figsize=(10, 6))
    df[col].hist(bins=30, ax=ax)

    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.grid(linestyle='--')  # dashed grid on both axes

    plt.tight_layout()
    plt.show()

# Line plot of how many rows fall into each calendar-day bin of 'Timestamp'.
fig, ax = plt.subplots(figsize=(12, 6))

# Index by timestamp, bucket into daily ('D') bins, and count rows per bin.
events_per_day = df.set_index('Timestamp').resample('D').size()
events_per_day.plot(ax=ax)

ax.set_title('Number of Events per Day')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Events')
ax.grid(linestyle='--')

plt.tight_layout()
plt.show()