## Read Data from CSV and Calculate DQI

**Description**: Read data from a CSV file, identify errors as missing values, and calculate the DQI.

In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import unittest

# Function to calculate the Data Quality Index (DQI)
def calculate_dqi(csv_file_path=None, data=None):
    try:
        # If file path is provided, read the CSV file into a DataFrame
        if csv_file_path:
            data = pd.read_csv(csv_file_path)
        
        # Check if data is empty
        if data.empty:
            return "Error: No data available in the file.", 0, 0  # If the DataFrame is empty
        
        # Calculate the total number of values
        total_values = data.size  # total number of elements in the DataFrame
        
        # Calculate the number of missing (NaN) values
        missing_values = data.isnull().sum().sum()  # sum of NaN values across the entire DataFrame
        
        # Calculate DQI
        if total_values == 0:
            return "Error: No data available in the file.", 0, 0  # Return an error if there are no values
        
        dqi = 100 - (missing_values / total_values * 100)
        
        return dqi, missing_values, total_values  # Return DQI, missing values, and total values
    
    except FileNotFoundError:
        return "Error: The specified file was not found.", 0, 0  # Return error with default values
    except pd.errors.ParserError:
        return "Error: There was an issue with parsing the CSV file.", 0, 0  # Return error with default values
    except Exception as e:
        return f"Error: {str(e)}", 0, 0  # Return any other error with default values


# Function to visualize DQI and errors
def visualize_dqi_and_errors(csv_file_path):
    try:
        # Calculate DQI
        dqi, missing_values, total_values = calculate_dqi(csv_file_path)
        
        if isinstance(dqi, str):  # If an error message was returned
            print(dqi)
            return
        
        # Create the bar plot
        fig, ax = plt.subplots(figsize=(8, 6))
        
        # Data for the plot
        labels = ['Data Quality Index (DQI)', 'Missing Values']
        values = [dqi, missing_values]
        
        ax.bar(labels, values, color=['green', 'red'])
        
        # Adding labels and title
        ax.set_ylabel('Percentage / Count')
        ax.set_title('Data Quality Index and Missing Values')
        
        # Display the DQI percentage on the plot
        ax.text(0, dqi + 5, f'{dqi:.2f}%', ha='center', color='black')
        ax.text(1, missing_values + 5, f'{missing_values}', ha='center', color='black')
        
        plt.show()
    
    except FileNotFoundError:
        print("Error: The specified file was not found.")
    except pd.errors.ParserError:
        print("Error: There was an issue with parsing the CSV file.")
    except Exception as e:
        print(f"Error: {str(e)}")


# Unit tests for the functions
class TestDQICalculation(unittest.TestCase):

    def test_valid_dqi(self):
        # Create a sample DataFrame with some missing values
        data = pd.DataFrame({
            'Name': ['John', 'Alice', None, 'Eve'],
            'Age': [29, None, 32, None],
            'Address': ['New York', 'California', 'Texas', 'Florida']
        })
        
        dqi, missing_values, total_values = calculate_dqi(data=data)
        self.assertEqual(dqi, 75.0)  # 3 out of 12 values are missing
        
    def test_missing_file(self):
        result = calculate_dqi(csv_file_path='nonexistent_file.csv')
        self.assertEqual(result[0], 'Error: The specified file was not found.')
        
    def test_empty_file(self):
        # Simulate an empty file by passing an empty DataFrame
        empty_data = pd.DataFrame(columns=['Name', 'Age', 'Address'])
        dqi, missing_values, total_values = calculate_dqi(data=empty_data)
        self.assertEqual(dqi, 'Error: No data available in the file.')

    def test_invalid_csv_format(self):
        # Simulate a bad CSV file (e.g., corrupted)
        result = calculate_dqi(csv_file_path='bad_file.csv')
        self.assertTrue('Error' in result[0])  # Should return an error related to reading the CSV


# Example usage
csv_file_path = 'data.csv'  # Path to your CSV file
dqi, missing_values, total_values = calculate_dqi(csv_file_path=csv_file_path)

if isinstance(dqi, str):  # If an error message was returned
    print(dqi)
else:
    print(f"Data Quality Index (DQI): {dqi}%")
    print(f"Missing values: {missing_values}")
    print(f"Total values: {total_values}")

    # Visualize the DQI and errors
    visualize_dqi_and_errors(csv_file_path)

# To handle the error in Jupyter/IPython, add this condition
if __name__ == '__main__':
    # In Jupyter or IPython, we don't use unittest.main() directly. Use the following:
    unittest.TextTestRunner().run(unittest.defaultTestLoader.loadTestsFromTestCase(TestDQICalculation))

....
----------------------------------------------------------------------
Ran 4 tests in 0.005s

OK


Error: The specified file was not found.


In [1]:
# Write your code from here
import pandas as pd

# Function to calculate the Data Quality Index (DQI)
def calculate_dqi(csv_file_path):
    # Read the CSV file into a DataFrame
    try:
        data = pd.read_csv(csv_file_path)
    except FileNotFoundError:
        return "Error: The specified file was not found."
    
    # Calculate the total number of values
    total_values = data.size  # total number of elements in the DataFrame
    
    # Calculate the number of missing (NaN) values
    missing_values = data.isnull().sum().sum()  # sum of NaN values across the entire DataFrame
    
    # Calculate DQI
    if total_values == 0:
        return "Error: No data available in the file."
    
    dqi = 100 - (missing_values / total_values * 100)
    
    return dqi

# Example usage
csv_file_path = 'data.csv'  # Path to your CSV file
dqi = calculate_dqi(csv_file_path)
print(f"Data Quality Index (DQI): {dqi}%")

Data Quality Index (DQI): Error: The specified file was not found.%


In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import unittest

# Function to calculate the Data Quality Index (DQI)
def calculate_dqi(data):
    try:
        # Calculate the total number of values
        total_values = data.size  # total number of elements in the DataFrame
        
        # Calculate the number of missing (NaN) values
        missing_values = data.isnull().sum().sum()  # sum of NaN values across the entire DataFrame
        
        # Calculate DQI
        if total_values == 0:
            return "Error: No data available in the file."
        
        dqi = 100 - (missing_values / total_values * 100)
        
        return dqi, missing_values, total_values
    except Exception as e:
        return f"Error: {str(e)}"

# Function to visualize DQI and errors
def visualize_dqi_and_errors(csv_file_path):
    try:
        # Read the CSV file into a DataFrame once
        data = pd.read_csv(csv_file_path)
        
        # Calculate DQI
        dqi, missing_values, total_values = calculate_dqi(data)
        
        if isinstance(dqi, str):  # If an error message was returned
            print(dqi)
            return
        
        # Create the bar plot
        fig, ax = plt.subplots(figsize=(8, 6))
        
        # Data for the plot
        labels = ['Data Quality Index (DQI)', 'Missing Values']
        values = [dqi, missing_values]
        
        ax.bar(labels, values, color=['green', 'red'])
        
        # Adding labels and title
        ax.set_ylabel('Percentage / Count')
        ax.set_title('Data Quality Index and Missing Values')
        
        # Display the DQI percentage on the plot
        ax.text(0, dqi + 5, f'{dqi:.2f}%', ha='center', color='black')
        ax.text(1, missing_values + 5, f'{missing_values}', ha='center', color='black')
        
        plt.show()
    
    except FileNotFoundError:
        print("Error: The specified file was not found.")
    except pd.errors.ParserError:
        print("Error: There was an issue with parsing the CSV file.")
    except Exception as e:
        print(f"Error: {str(e)}")

# Unit tests for the functions
class TestDQICalculation(unittest.TestCase):

    def test_valid_dqi(self):
        # Create a sample DataFrame with some missing values
        data = pd.DataFrame({
            'Name': ['John', 'Alice', None, 'Eve'],
            'Age': [29, None, 32, None],
            'Address': ['New York', 'California', 'Texas', 'Florida']
        })
        
        dqi, missing_values, total_values = calculate_dqi(data)
        self.assertEqual(dqi, 75.0)  # 3 out of 12 values are missing
        
    def test_missing_file(self):
        result = calculate_dqi('nonexistent_file.csv')
        self.assertEqual(result, 'Error: The specified file was not found.')
        
    def test_empty_file(self):
        empty_data = pd.DataFrame(columns=['Name', 'Age', 'Address'])
        dqi, missing_values, total_values = calculate_dqi(empty_data)
        self.assertEqual(dqi, 'Error: No data available in the file.')

    def test_invalid_csv_format(self):
        # Simulate a bad CSV file (e.g., corrupted)
        result = calculate_dqi('bad_file.csv')
        self.assertTrue('Error' in result)  # Should return an error related to reading the CSV

# To handle the error in Jupyter/IPython, add this condition
if __name__ == '__main__':
    # In Jupyter or IPython, we don't use unittest.main() directly. Use the following:
    unittest.TextTestRunner().run(unittest.defaultTestLoader.loadTestsFromTestCase(TestDQICalculation))

E.F.
ERROR: test_empty_file (__main__.TestDQICalculation)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_2359/1138308071.py", line 83, in test_empty_file
    dqi, missing_values, total_values = calculate_dqi(empty_data)
ValueError: too many values to unpack (expected 3)

FAIL: test_missing_file (__main__.TestDQICalculation)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_2359/1138308071.py", line 79, in test_missing_file
    self.assertEqual(result, 'Error: The specified file was not found.')
AssertionError: "Error: 'str' object has no attribute 'size'" != 'Error: The specified file was not found.'
- Error: 'str' object has no attribute 'size'
+ Error: The specified file was not found.


----------------------------------------------------------------------
Ran 4 tests in 0.007s

FAILED (failures=1, errors=1)


### Visualize Basic DQI with Bar Plot

**Description**: Create a bar plot for DQI and errors in a dataset.