In [17]:
import pandas as pd
from pandas.errors import ParserError
import csv

def robust_data_profiler(file_name, output_path='data_profile.txt'):
    """Profile a CSV file, save the report to disk, and return it as text.

    The file is loaded with ``pd.read_csv``. Load failures are reported with
    a printed message instead of being raised. On success the report is a
    ``Data Source`` header followed by the text built by ``data_profiler``.

    Parameters
    ----------
    file_name : str or path-like
        Location of the file to profile. If the name does not end in
        ``.csv`` (case-insensitive) the user is asked interactively whether
        to continue.
    output_path : str, default 'data_profile.txt'
        File the report is written to (overwritten if it already exists).

    Returns
    -------
    str or None
        The report text, or ``None`` when the user cancels at the prompt or
        the file cannot be loaded.
    """
    # Work on a string copy so path-like objects can be checked and printed.
    source_name = str(file_name)

    # check csv extension, option to cancel.
    # Case-insensitive and anchored on the dot: 'DATA.CSV' is accepted without
    # a prompt, while a name such as 'notcsv' is still questioned.
    if not source_name.lower().endswith('.csv'):
        print('The file extension is not .csv. Do you want to proceed?')
        response = input('(Y/N): ')
        # Tolerate surrounding whitespace and answers such as 'no'.
        if response.strip().lower().startswith('n'):
            return None

    # pandas read inside of try block
    try:
        df = pd.read_csv(file_name)
    except FileNotFoundError:
        print("File Not Found Error - Check that the filename is valid.")
        return None
    except ParserError:
        print("Parser Error - Check that file contents are in csv format.")
        return None
    except Exception as exc:
        # `Exception` instead of a bare `except:` so KeyboardInterrupt and
        # SystemExit still propagate; the detail line keeps the cause visible.
        print("Unknown Error - Check file contents.")
        print(f"  ({type(exc).__name__}: {exc})")
        return None

    # build output header, then append the profile built by data_profiler
    data_output = 'Data Source: ' + source_name + '\n\n'
    data_output += data_profiler(df)

    # write results to file, default path 'data_profile.txt'
    # Explicit UTF-8 so non-ASCII cell values do not depend on the platform's
    # default encoding.
    with open(output_path, "w", encoding="utf-8") as fs:
        fs.write(data_output)

    # return text
    return data_output

def _describe_or_note(df, empty_note, **describe_kwargs):
    """Return ``str(df.describe(**describe_kwargs))``, or ``empty_note`` on ValueError."""
    try:
        return str(df.describe(**describe_kwargs))
    except ValueError:
        # pandas raises ValueError when there is nothing to summarise, e.g.
        # include='object' on an all-numeric frame, or a frame with no columns.
        return empty_note


def data_profiler(df):
    """Build a plain-text profile of a DataFrame.

    The report contains, in order: shape, column dtypes, missing-value counts
    per column, ``describe()`` statistics with 10/25/50/75/90 percentiles,
    ``describe(include='object')`` statistics, and the first five rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile. It is only read, never modified.

    Returns
    -------
    str
        The profile text. A section whose ``describe`` call has nothing to
        summarise contains a short placeholder instead of raising.
    """
    data_output = ''

    # Prints shape (rows, columns)
    data_output += 'Shape\n'
    data_output += str(df.shape)

    # Lists column names and dtypes
    data_output += '\n\nColumns\n'
    data_output += str(df.dtypes)

    # Counts missing values per column
    data_output += '\n\nMissing Values\n'
    data_output += str(df.isnull().sum())

    # Shows basic statistics for numeric columns (.describe())
    data_output += '\n\nNumerical Statistics\n'
    data_output += _describe_or_note(
        df,
        '(no columns to describe)',
        percentiles=[.10, .25, .50, .75, .90],
    )

    # Shows unique counts for categorical columns
    data_output += '\n\nCategorical Statistics\n'
    data_output += _describe_or_note(
        df,
        '(no categorical columns)',
        include='object',
    )

    # Shows the first 5 rows
    data_output += '\n\nFirst 5 rows\n'
    data_output += str(df.head())

    # return text
    return data_output



In [18]:
# Test for missing file
# The name ends in 'csv', so the extension prompt is skipped and the call goes
# straight to pd.read_csv. With no such file on disk, robust_data_profiler
# prints its "File Not Found Error" message and returns None.
missing_file_name = 'does-not-exist.csv'
profile_output = robust_data_profiler(missing_file_name)

File Not Found Error - Check that the filename is valid.


In [19]:
# Test to generate parser error
# The header row has one field while the last row (",Line 2") has two --
# presumably the mismatch intended to make pd.read_csv fail; confirm.
# NOTE(review): the '.txt' name makes robust_data_profiler ask (Y/N) first.
# If the prompt is declined, the function returns None before the file is
# read, so the parser-error branch is only reached when the prompt is accepted.
bad_format_file_name = 'bad-data.txt'
bad_text = """\
Header
Line 1
,Line 2
"""

# Create the malformed file in the current working directory.
with open(bad_format_file_name, "w") as fs:
    fs.write(bad_text)

profile_output = robust_data_profiler(bad_format_file_name)

The file extension is not .csv. Do you want to proceed?


(Y/N):  N


In [20]:
# paths for testing
file_name = 'seaborn-data_tips.csv'
# NOTE(review): `url` is not used in this cell -- presumably the remote source
# the local file was saved from; confirm or remove.
url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv'

# working function call w/ default output filename.
# The report is written to 'data_profile.txt' (the default output_path) and
# the same text is returned, then printed below.
profile_output = robust_data_profiler(file_name)
print(profile_output)

Data Source: seaborn-data_tips.csv

Shape
(244, 7)

Columns
total_bill    float64
tip           float64
sex            object
smoker         object
day            object
time           object
size            int64
dtype: object

Missing Values
total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

Numerical Statistics
       total_bill         tip        size
count  244.000000  244.000000  244.000000
mean    19.785943    2.998279    2.569672
std      8.902412    1.383638    0.951100
min      3.070000    1.000000    1.000000
10%     10.340000    1.500000    2.000000
25%     13.347500    2.000000    2.000000
50%     17.795000    2.900000    2.000000
75%     24.127500    3.562500    3.000000
90%     32.235000    5.000000    4.000000
max     50.810000   10.000000    6.000000

Categorical Statistics
         sex smoker  day    time
count    244    244  244     244
unique     2      2    4       2
top     Male     No  Sat