In [1]:
# This is an example of how SASPy can be used within Python to predict potential donors from US Census data.


# The dataset for this project originates from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Census+Income). The dataset was donated by Ron Kohavi and Barry Becker, after being published in the article _"Scaling Up the Accuracy of Naive-Bayes Classifiers: A Decision-Tree Hybrid"_. You can find the article by Ron Kohavi [online](https://www.aaai.org/Papers/KDD/1996/KDD96-033.pdf). The data we investigate here differs slightly from the original dataset: the `'fnlwgt'` feature and records with missing or ill-formatted entries have been removed.



In [2]:
#Import necessary libraries for this project
import os
from time import time

import pandas as pd
import saspy
from IPython.display import display
from IPython.display import HTML

In [3]:
# Start the SAS session that every later cell talks to through `sas`.
# The connection settings come from the saspy configuration entry named 'autogen_winlocal'.
sas = saspy.SASsession(cfgname='autogen_winlocal')

SAS Connection established. Subprocess id is 27036



In [4]:
# Loading the census data.

# Location of the census CSV file. It can be overridden with the CENSUS_CSV_PATH
# environment variable so the notebook is not tied to a single machine; when the
# variable is not set, the original author's local path is used.
census_csv_path = os.environ.get(
    "CENSUS_CSV_PATH",
    "C:\\Users\\negraj\\Documents\\Learning\\SAS and SASPy\\SASPy-Examples\\census.csv",
)

# Option 1 (not used): read the CSV file with pandas, then convert the resulting
# DataFrame into a SAS data object.
#cen_data0_pd = pd.read_csv(census_csv_path)
#cen_data0 = sas.df2sd(cen_data0_pd)  # df2sd is the short form of sas.dataframe2sasdata

# Option 2 (used here): read the CSV file directly into a SAS data object.
cen_data0 = sas.read_csv(census_csv_path)


In [5]:
# Sanity check: show the first observation to confirm the data was read in.
# (If the data was loaded through pandas instead, the equivalent check would be
# display(cen_data0_pd.head(n=1)).)
print("First value is:")
first_obs = cen_data0.head(obs=1)
display(first_obs)

First value is:


Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K


In [6]:
# Data exploration using SASPy.
# Store the number of observations in the SAS data object; it is reported
# further down as the total number of records.
records = cen_data0.obs()

In [7]:
# Number of records where an individual's income is more than $50,000.
# First convert the SAS data object into a pandas DataFrame so that value_counts()
# can be used; value_counts() makes counting distinct values very easy.
cen_data0_pd = cen_data0.to_df()

In [8]:
# Count how many records fall into each income class.
# The counts are looked up by label rather than by position: value_counts() orders
# its result by frequency, so positional access would silently swap the two groups
# if the class balance changed, and integer keys on a string-labelled Series are
# deprecated in pandas >= 2.1 (they will be treated as labels, not positions).
# NOTE(review): the labels are assumed to be '<=50K' and '>50K' ('<=50K' matches the
# first record displayed earlier) - confirm against the data. Surrounding whitespace
# is stripped first in case the CSV values are padded.
income_counts = cen_data0_pd['income'].str.strip().value_counts()
greater_50k = income_counts['>50K']
_50k_or_less = income_counts['<=50K']

In [9]:
# Share of individuals earning more than 50K, as a percentage of all counted records.
total_counted = greater_50k + _50k_or_less
greater_percent = (greater_50k / total_counted) * 100

In [10]:
# Report the exploration results: each entry pairs a message template with the
# value to insert into it, and the entries are printed in order.
summary_rows = (
    ("Total number of records: {}", records),
    ("Individuals making more than $50,000: {}", greater_50k),
    ("Individuals making at most $50,000: {}", _50k_or_less),
    ("Percentage of individuals making more than $50,000: {}%", greater_percent),
)
for template, value in summary_rows:
    print(template.format(value))


Total number of records: 45222
Individuals making more than $50,000: 11208
Individuals making at most $50,000: 34014
Percentage of individuals making more than $50,000: 24.78439697492371%


In [11]:
#----------------------Part 2------------------------------
# Featureset Exploration
# Transforming skewed continuous features
# Use the contents() method to look up each column's number, which is then used in the histogram function later.

In [12]:
# Show the metadata of the SAS data set; the result is a dict of tables
# (the first one shown is 'Attributes').
display(cen_data0.contents())

{'Attributes':       Member               Label1                     cValue1       nValue1  \
 0  WORK._CSV        Data Set Name                   WORK._CSV           NaN   
 1  WORK._CSV          Member Type                        DATA           NaN   
 2  WORK._CSV               Engine                          V9           NaN   
 3  WORK._CSV              Created         05/23/2019 13:47:52  1.874238e+09   
 4  WORK._CSV        Last Modified         05/23/2019 13:47:52  1.874238e+09   
 5  WORK._CSV           Protection                         NaN           NaN   
 6  WORK._CSV        Data Set Type                         NaN           NaN   
 7  WORK._CSV                Label                         NaN           NaN   
 8  WORK._CSV  Data Representation                  WINDOWS_64           NaN   
 9  WORK._CSV             Encoding  wlatin1  Western (Windows)           NaN   
 
                  Label2 cValue2  nValue2  
 0          Observations   45222  45222.0  
 1             V