# In [3]:  (Jupyter notebook cell prompt, kept as a comment so the file parses as Python)
import numpy as np
import pandas as pd
from collections import defaultdict
import getopt, sys


# --- Profiler configuration -------------------------------------------------
# A numeric column with fewer than `cth` distinct observed values is reported
# as categorical instead of numeric.
cth = 15
# A character column with more than `sth` distinct observed values is reported
# as "String/Text" instead of categorical.
sth = 60
# NOTE(review): presumably the report delimiter; it is not referenced anywhere
# in the visible code.
dlm = ","
# Raw field values handed to read_csv as `na_values` (parsed as missing).
MISSING_VALUES = ["", " ", "N/A", "#N/A", "nan"]

ifile_name = "./in_data/R4642-1-COHORT_PA_RECOMMENDER2_new.txt"
ofile_name = "./eda_results/R4642-1-COHORT_PA_RECOMMENDER2_new_profiler.csv"

# Alternative input/output pair:
# ifile_name = "./in_data/R4642-5-Call_Data.txt"
# ofile_name = "./eda_results/R4642-5-Call_Data_profiler.csv"

try:
    # Read the input first: if it cannot be loaded, the report file is neither
    # truncated nor left open.
    df = pd.read_csv(ifile_name, na_values=MISSING_VALUES, sep='|', low_memory=False)
    ofile = open(ofile_name, 'w')
except Exception as err:
    # Top-level boundary of the script: report what actually went wrong, then
    # exit with the same status code as before. KeyboardInterrupt/SystemExit
    # are deliberately not caught.
    print('Parameter Error\n')
    sys.stderr.write("%s: %s\n" % (type(err).__name__, err))
    sys.exit(2)

# --- Report header ----------------------------------------------------------
ofile.write("Input File Name," + ifile_name)
ofile.write("\nProfile File Name," + ofile_name)
ofile.write("\nNote\nAll blanks; N/A; #N/A will be treated as missing values")
ofile.write("\nAll statistics are computed on observed values")
# The classifier compares with `< cth`, so the note must say "fewer than"
# (the previous wording, "not more than", described `<=`).
ofile.write("\nNumeric columns with fewer than %d unique values will be considered as categorical" % cth)
ofile.write("\nCharacter columns with more than %d unique values will be considered as string" % sth)


class Numeric:
    """Descriptive statistics over the non-NaN values of a numeric sequence.

    Args:
        series: a sized iterable of numbers (e.g. a pandas Series); NaN
            entries are counted as missing and excluded from the statistics.

    Attributes:
        valid_list: the non-NaN values, in input order.
        missing: number of NaN entries.
        observed: number of non-NaN entries.
        mean, std, min, max: computed with the matching numpy functions
            (``np.std`` with its default ``ddof=0``).
        p5, p25, p50, p75, p95: percentiles from ``np.percentile``.

    When there are no observed values, every statistic is NaN instead of
    raising from numpy's empty-input reductions.
    """

    def __init__(self, series):
        # `not` rather than `~`: bitwise inversion is only a logical negation
        # for numpy booleans, while `not` is correct for any truth value.
        self.valid_list = [x for x in series if not np.isnan(x)]
        self.observed = len(self.valid_list)
        self.missing = len(series) - self.observed

        if not self.valid_list:
            # np.min / np.max / np.percentile raise on empty input.
            nan = float("nan")
            self.mean = self.std = self.min = self.max = nan
            self.p5 = self.p25 = self.p50 = self.p75 = self.p95 = nan
            return

        self.mean = np.mean(self.valid_list)
        self.std = np.std(self.valid_list)
        self.min = np.min(self.valid_list)
        self.max = np.max(self.valid_list)
        # One call computes all five percentiles.
        (self.p5, self.p25, self.p50, self.p75, self.p95) = np.percentile(
            self.valid_list, [5, 25, 50, 75, 95])


class Categorical:
    """Frequency summary of the non-null values of a sequence.

    Args:
        series: a sized iterable of values; entries for which
            ``pd.notnull`` is false are counted as missing.

    Attributes:
        valid_list: the non-null values, in input order.
        missing: number of null entries.
        observed: number of non-null entries.
        num_categ: number of distinct non-null values.
        cnt_categ: ``defaultdict(float)`` mapping each distinct value to
            its occurrence count (stored as a float).
    """

    def __init__(self, series):
        tally = defaultdict(float)
        kept = []
        # Single pass: collect the observed values and count them together.
        for value in series:
            if not pd.notnull(value):
                continue
            kept.append(value)
            tally[value] += 1

        self.valid_list = kept
        self.observed = len(kept)
        self.missing = len(series) - self.observed
        self.num_categ = len(tally)
        self.cnt_categ = tally
            

def _csv_row(fields):
    """Join *fields* into one comma-separated line, quoting where required.

    A field containing a comma, double quote or line break is wrapped in
    double quotes with embedded quotes doubled, so a column name or category
    value with such characters cannot shift the columns of the report.
    """
    cells = []
    for field in fields:
        text = str(field)
        if any(ch in text for ch in ',"\r\n'):
            text = '"' + text.replace('"', '""') + '"'
        cells.append(text)
    return ",".join(cells)


# The report handle is already open; the `finally` clause closes it even when
# profiling a column raises.
try:
    (rows, cols) = df.shape
    ofile.write("\n\nData Shape\nRows," + str(rows) + "\nColumns," + str(cols))

    # --- Classify every column ----------------------------------------------
    # object dtype : "String/Text" if more than `sth` distinct values,
    #                otherwise "Categorical".
    # other dtypes : "Categorical" if fewer than `cth` distinct values,
    #                otherwise "Numeric (<dtype>)".
    DATA_TYPE = {}
    for column_name in df.columns:
        if df[column_name].dtype == "object":
            stripped = df[column_name].str.strip()
            # Padding can hide a missing-value marker from read_csv (e.g.
            # "   " or "N/A "); after stripping, treat those as missing too so
            # they are not counted as a category.
            df[column_name] = stripped.mask(stripped.isin(MISSING_VALUES))
            if df[column_name].nunique() > sth:
                DATA_TYPE[column_name] = "String/Text"
            else:
                DATA_TYPE[column_name] = "Categorical"
        # nunique() ignores NaN, which is what the distinct-value count needs.
        elif df[column_name].nunique() < cth:
            DATA_TYPE[column_name] = "Categorical"
        else:
            DATA_TYPE[column_name] = "Numeric (int64)" if df[column_name].dtype == 'int64' else "Numeric (float64)"

    # --- Numeric section ----------------------------------------------------
    ofile.write(
        "\n\nNumeric variables\nVariable, #Records, #Missing, #Observed, Mean, StdDev, Min, Max, Percentile_5, Percentile_25, Percentile_50, Percentile_75, Percentile_95")
    for column_name, kind in DATA_TYPE.items():
        if kind.startswith("Numeric"):
            temp = Numeric(df[column_name])
            output = [column_name, rows, temp.missing, temp.observed, temp.mean, temp.std, temp.min, temp.max,
                      temp.p5, temp.p25, temp.p50, temp.p75, temp.p95]
            ofile.write("\n" + _csv_row(output))

    # --- Categorical section: the eight most frequent values with shares ----
    ofile.write(
        "\n\nCategorical variables\nVariable, #Records, #Missing, #Observed, #Categories, %C1, %C2, %C3, %C4, %C5, %C6, %C7, %C8")
    for column_name, kind in DATA_TYPE.items():
        if kind == "Categorical":
            temp = Categorical(df[column_name])
            output = [column_name, rows, temp.missing, temp.observed, temp.num_categ]
            # Rank by raw count, not by the rounded share: small categories
            # whose shares round to the same value still come out in true
            # frequency order.
            ranked = sorted(temp.cnt_categ.items(), key=lambda item: item[1], reverse=True)
            for value, count in ranked[:8]:
                output.append(str(value) + " # " + str(round(count / temp.observed, 4)))
            ofile.write("\n" + _csv_row(output))

    # --- String/Text section: counts only, no per-value breakdown -----------
    ofile.write(
        "\n\nString/Text variables\nVariable, #Records, #Missing, #Observed, #Categories")
    for column_name, kind in DATA_TYPE.items():
        if kind == "String/Text":
            temp = Categorical(df[column_name])
            output = [column_name, rows, temp.missing, temp.observed, temp.num_categ]
            ofile.write("\n" + _csv_row(output))
finally:
    ofile.close()