#### Examples for analytical/tabular methods provided by dsldPy

The goal is for users to apply analytical/tabular methods through a simple, intuitive interface. The following functions are included for Python:

1. dsldLinear, dsldLogit, and dsldML 
2. dsldTakeALookAround
3. dsldHunting (both C/O hunting functions)
4. dsldFrequencybyS  
5. dsldMatchedATE

In [None]:
## requires R and the dsld (R) package installed
# !pip install dsldPy

In [None]:
# Load necessary libraries

from dsldPy import (
# data reading and preprocessing
preprocess_data, read_data,

# linear/logistic/ML comparisons
dsldPyLinear, dsldPyLinearSummary, dsldPyLinearPredict, dsldPyLinearVcov, dsldPyLinearCoef, dsldPyLinearGetData,
dsldPyLogit, dsldPyLogitSummary, dsldPyLogitPredict, dsldPyLogitVcov, dsldPyLogitCoef, dsldPyLogitGetData,
dsldPyML,

# takeALookAround
dsldPyTakeALookAround, 

# hunting
dsldPyCHunting, dsldPyOHunting, 

# frequency table
dsldPyFrequencybyS,

# causal inference
dsldPyMatchedATE
)

In [None]:
### dsldLinear, dsldLogit, dsldML examples

### data preprocessing

### all dsldPy functions require an R data frame object as input (NOT a pandas dataframe)
### the conversion is done by the function preprocess_data
### the user must manually provide the lists of categorical and numerical feature names
### preprocess_data returns an R data.frame object -> required input for the dsldPy functions

# svcensus data
# Replace with your own path to the svcensus.RData file
svcensus_path = ""
# the load must actually run: every line below depends on `df` being defined
df = read_data(svcensus_path)

# preprocess data
cat_features = ['educ', 'occ', 'gender']
num_features = ['age', 'wageinc', 'wkswrkd']
svcensus = preprocess_data(df, cat_features, num_features)

# comparison points for predict(): despite the name `df_10`, only the first 2 rows are kept
# the response (wageinc) and the sensitive variable (gender) are left out
df_10 = df.head(2)
df_10 = df_10[['age', 'educ', 'occ', 'wkswrkd']]
cat_features = ['educ', 'occ']
num_features = ['age', 'wkswrkd']
svcensus_comparisons_points = preprocess_data(df_10, cat_features, num_features)

# compas1 data
# Replace with your own path to the compas1.RData file
compas1_path = ""
# rebinds `df`; the svcensus objects built above are unaffected
df = read_data(compas1_path)

# preprocess data
cat_features = ["sex", "two_year_recid", "race"]
num_features = ["age", "juv_fel_count", "decile_score", "juv_misd_count", "juv_other_count", "priors_count", "c_jail_in", "c_jail_out", "c_offense_date", "screening_date", "in_custody", "out_custody"]
compas1 = preprocess_data(df, cat_features, num_features)

# comparison points for predict(): first 2 rows, without the response (two_year_recid)
# and the sensitive variable (race)
df_10 = df.head(2)
df_10 = df_10[["sex", "age", "juv_fel_count", "decile_score", "juv_misd_count", "juv_other_count", "priors_count", "c_jail_in", "c_jail_out", "c_offense_date", "screening_date", "in_custody", "out_custody"]]
cat_features = ["sex"]
num_features = ["age", "juv_fel_count", "decile_score", "juv_misd_count", "juv_other_count", "priors_count", "c_jail_in", "c_jail_out", "c_offense_date", "screening_date", "in_custody", "out_custody"]
compas1_comparisons_points = preprocess_data(df_10, cat_features, num_features)

In [None]:
### 1. ------------------------------ dsldPyLinear/dsldPyLogit/dsldPyML ------------------------------

## dsldPyLinear - interactions = True
a = dsldPyLinear(data=svcensus, yName='wageinc', sName='gender', interactions=True)

### the object a is a list of R objects --- inspecting 'a' directly might not be helpful
### use the following functions to access the results and use them in python

# uncomment to see the results of the functions
# dsldPyLinearSummary(a)
# dsldPyLinearCoef(a)
# dsldPyLinearVcov(a)
# dsldPyLinearGetData(a)

# predict()
preds = dsldPyLinearPredict(a, svcensus_comparisons_points)
# print explicitly: a bare expression in the middle of a cell is never displayed
print(preds)

### interactions = False works as well
a2 = dsldPyLinear(data=svcensus, yName='wageinc', sName='gender', interactions=False)

# dsldPyLinearSummary(a2)
# dsldPyLinearCoef(a2)
# dsldPyLinearVcov(a2)
# dsldPyLinearGetData(a2)

## the predict() method requires newData to include S
## (not done here: svcensus_comparisons_points has no 'gender' column)
## NOTE(review): presumably this applies to the interactions = False model (a2) -- confirm

In [None]:
# dsldPyLogit - interactions = True

b = dsldPyLogit(data=compas1, yName='two_year_recid', sName='race', interactions=True, yesYVal="Yes")

### the object b is a list of R objects --- inspecting 'b' directly might not be helpful
### use the following functions to access the results and use them in python

# uncomment to see the results of the functions
# dsldPyLogitSummary(b)
# dsldPyLogitCoef(b)
# dsldPyLogitVcov(b)
# dsldPyLogitGetData(b)

# predict()
preds = dsldPyLogitPredict(b, compas1_comparisons_points)
# print explicitly: a bare expression in the middle of a cell is never displayed
print(preds)

### interactions = False works as well
b2 = dsldPyLogit(data=compas1, yName='two_year_recid', sName='race', interactions=False, yesYVal="Yes")

# dsldPyLogitSummary(b2)
# dsldPyLogitCoef(b2)
# dsldPyLogitVcov(b2)
# dsldPyLogitGetData(b2)

In [None]:
## dsldPyML - returns testAcc for each sLevel, plus a dataframe of predictions
## (the dataframe excludes yName and sName)
### reported by the author to work with several qeML functions
c = dsldPyML(
    data=svcensus,
    yName='wageinc',
    sName='gender',
    qeMLftnName='qeKNN',
    sComparisonPts='rand5',
)
print(c)

In [None]:
### 2. ------------------------------ dsldTakeALookAround ------------------------------
# kept as the last expression of the cell so the notebook displays the result
dsldPyTakeALookAround(
    data=svcensus,
    yName='wageinc',
    sName='gender',
    maxFeatureSize=4,
)

In [None]:
### 3. ------------------------------ dsldHunting ------------------------------
# NOTE: `a` and `b` are rebound here; the linear/logit model objects that used
# these names in earlier cells are no longer reachable through them

# dsldPyCHunting - C-Hunting
a = dsldPyCHunting(
    data=svcensus,
    yName='wageinc',
    sName='gender',
)

# dsldPyOHunting - O-Hunting
b = dsldPyOHunting(
    data=svcensus,
    yName='wageinc',
    sName='gender',
)

In [None]:
# show whatever `a` currently holds (the dsldPyCHunting result if the hunting cell ran last)
print(a)

In [None]:
# show whatever `b` currently holds (the dsldPyOHunting result if the hunting cell ran last)
print(b)

In [None]:
### 4. ------------------------------ dsldFrequencybyS ------------------------------
# kept as the last expression of the cell so the notebook displays the result
dsldPyFrequencybyS(
    data=svcensus,
    cName='educ',
    sName='gender',
)

In [None]:
### 5. ------------------------------ dsldMatchedAte ------------------------------
# kept as the last expression of the cell so the notebook displays the result
dsldPyMatchedATE(
    data=compas1,
    yName='two_year_recid',
    sName='race',
    yesSVal='Caucasian',
)