In [17]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [18]:
# Read the accidents CSV into the DataFrame used by every later cell.
accidents_csv = "data/US_Accidents_March23.csv"
df = pd.read_csv(accidents_csv)

In [19]:
# Write a plain-text description of `df` to 'data_description_output.txt':
# its shape, describe() summaries, and a short per-column overview of the
# object/category columns.
#
# The file is opened with an explicit UTF-8 encoding. Without it, open() falls
# back to the platform's locale encoding, and writing any cell value that the
# locale encoding cannot represent raises UnicodeEncodeError part-way through.
with open('data_description_output.txt', 'w', encoding='utf-8') as file:

    # 1. Number of Rows and Columns
    file.write("Number of Rows and Columns:\n")
    file.write(f"Rows: {df.shape[0]}\n")
    file.write(f"Columns: {df.shape[1]}\n\n")

    # 2. Statistically Important Data
    # describe() with no arguments summarises the numeric columns only.
    file.write("Statistical Summary (Numeric Columns):\n")
    file.write(df.describe().to_string() + "\n\n")

    # include='all' adds the non-numeric columns to the summary.
    file.write("Statistical Summary (All Columns):\n")
    file.write(df.describe(include='all').to_string() + "\n\n")

    # 3. Unique Values of Categorical Variables
    # `categorical_columns` is kept at module level: the plotting code further
    # down in this cell iterates over it.
    file.write("Unique Values of Categorical Variables:\n")
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    for column in categorical_columns:
        file.write(f"\nColumn: {column}\n")
        # nunique() is the count of distinct values.
        file.write(f"Unique Values: {df[column].nunique()}\n")
        # Only the first five distinct values (in order of appearance) are listed.
        file.write("Sample Unique Values: " + ', '.join(map(str, df[column].unique()[:5])) + "\n")

# 4. Distribution of Each Column (Saving plots as images)
print("\nSaving Distribution Plots...")

# One 30-bin histogram per int64/float64 column, written to
# '<column>_distribution.png' in the current working directory.
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
for column in numerical_columns:
    fig, ax = plt.subplots(figsize=(8, 4))
    # Missing values are dropped before binning.
    ax.hist(df[column].dropna(), bins=30, edgecolor='black')
    ax.set_title(f"Distribution of {column}")
    ax.set_xlabel(column)
    ax.set_ylabel("Frequency")
    fig.savefig(f'{column}_distribution.png')
    # Release the figure so figures do not accumulate across iterations.
    plt.close(fig)

# Plotting and saving distributions for categorical columns.
#
# A bar chart draws one bar per distinct value, so a column with very many
# distinct values (identifiers, timestamps, free text) would produce an
# unreadable figure and can take an impractically long time and amount of
# memory to render. Each chart is therefore limited to the most frequent
# values, and the title says so whenever values were left out.
MAX_CATEGORIES = 30  # upper bound on bars drawn per column

for column in categorical_columns:
    # value_counts() excludes missing values and sorts by descending frequency,
    # so head() below keeps the most common values.
    counts = df[column].value_counts()
    if counts.empty:
        # No non-null values: there is nothing to draw for this column.
        print(f"Skipping {column}: no non-null values to plot.")
        continue

    total_categories = len(counts)
    title = f"Distribution of {column}"
    if total_categories > MAX_CATEGORIES:
        counts = counts.head(MAX_CATEGORIES)
        title += f" (top {MAX_CATEGORIES} of {total_categories} values)"

    fig, ax = plt.subplots(figsize=(8, 4))
    counts.plot(kind='bar', edgecolor='black', ax=ax)
    ax.set_title(title)
    ax.set_xlabel(column)
    ax.set_ylabel("Count")
    fig.savefig(f'{column}_distribution.png')  # Save the plot as an image
    # Release the figure so figures do not accumulate across iterations.
    plt.close(fig)

# Tell the reader where the text report and the plot images were written.
for message in (
    "Textual output has been saved to 'data_description_output.txt'",
    "Distribution plots have been saved as image files in the current directory.",
):
    print(message)




Saving Distribution Plots...


: 