In [1]:
import pandas as pd

# Location of the source CSV. The raw-string prefix (r'...') keeps the Windows
# backslashes literal instead of letting Python read them as escape sequences.
file_path = r'D:\Data\23-24\Financial\WS_DEBT_SEC2_PUB_csv_col.csv'  # Replace with your file path

# Load the dataset; every check below reads from `data`.
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())

# Display general information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nGeneral Information:")
data.info()

# Display the number of missing values in each column
print("\nMissing Values in Each Column:")
print(data.isnull().sum())

# Display the number of fully duplicated rows
print("\nNumber of Duplicated Rows:")
print(data.duplicated().sum())

# Display summary statistics for numerical columns
print("\nSummary Statistics for Numerical Columns:")
print(data.describe())

# Pick out columns whose *name* suggests a date/time. This is a heuristic on
# the column label only; the values are validated in the steps below.
date_columns = [col for col in data.columns if 'date' in col.lower() or 'time' in col.lower()]
if date_columns:
    print("\nPotential Date Columns:", date_columns)
else:
    print("\nNo obvious date columns found. You may need to specify or create a time index.")

# Make sure each candidate column holds real datetimes, converting in place
# where possible. Looping over an empty list is a no-op, so no extra guard on
# `date_columns` is needed here or below.
for date_col in date_columns:
    if pd.api.types.is_datetime64_any_dtype(data[date_col]):
        print(f"\nColumn '{date_col}' is already in datetime format.")
        continue
    try:
        data[date_col] = pd.to_datetime(data[date_col])
    except Exception as e:
        # Best effort: report the failure and leave the column unchanged.
        print(f"\nColumn '{date_col}' could not be converted to datetime format. Error: {e}")
    else:
        print(f"\nColumn '{date_col}' successfully converted to datetime format.")

# Check if each date column is sorted chronologically.
# Only columns that really are datetimes are checked: for a column whose
# conversion failed, monotonic order of the raw values (e.g. strings) says
# nothing about chronological order and would give a misleading message.
for date_col in date_columns:
    if not pd.api.types.is_datetime64_any_dtype(data[date_col]):
        print(f"\nColumn '{date_col}' is not in datetime format; skipping the chronological order check.")
        continue
    sorted_check = data[date_col].is_monotonic_increasing
    if sorted_check:
        print(f"\nThe data in column '{date_col}' is sorted in chronological order.")
    else:
        print(f"\nThe data in column '{date_col}' is NOT sorted in chronological order. Consider sorting it.")

# Check for unique timestamps in each date column to determine if it is
# suitable as a time-series index. The row count does not change inside the
# loop, so it is computed once.
total_rows = len(data)
for date_col in date_columns:
    unique_dates = data[date_col].nunique()
    if unique_dates == total_rows:
        print(f"\nEach row in column '{date_col}' has a unique timestamp, suitable for time series.")
    else:
        print(f"\nColumn '{date_col}' has repeated timestamps, which may not be ideal for a time series.")

# Report how many distinct values each column holds, as a quick way to tell
# low-cardinality (categorical-looking) columns from high-cardinality ones.
print("\nUnique Values in Each Column:")
unique_values = dict(
    (name, data[name].nunique())
    for name in data.columns
)
print(unique_values)


First few rows of the dataset:
  FREQ  Frequency ISSUER_RES                   Issuer residence ISSUER_NAT  \
0    Q  Quarterly         3P  All countries excluding residents         SE   
1    Q  Quarterly         TW                     Chinese Taipei         3P   
2    Q  Quarterly         5R                Developed countries         3P   
3    Q  Quarterly         AM                            Armenia         3P   
4    Q  Quarterly         AM                            Armenia         3P   

                  Issuer nationality ISSUER_BUS_IMM  \
0                             Sweden              1   
1  All countries excluding residents              J   
2  All countries excluding residents              7   
3  All countries excluding residents              B   
4  All countries excluding residents              B   

  Issuer sector - immediate borrower ISSUER_BUS_ULT  \
0                        All issuers              J   
1         Non-financial corporations              1   
2   