In [12]:

%matplotlib inline

# import packages

import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import numpy as np
import sqlite3
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
from pandas.io import sql
from matplotlib.ticker import ScalarFormatter

# load the raw data

# Select only the relevant columns
pop_cols = ['AGEP','SEX','HISP','POBP','RAC1P','SCIENGP','SOCP']


In [13]:
df = pd.read_csv("/Master/SEM 4/Project/Illinois/csv_pil/ss13pil.csv")

In [14]:
if np.min(df['SEX']) > 0: #ensures that code won't be run if it's been recoded already
  df['SEX'] = df['SEX'] - 1
# Next, create a new column with Male/Female labels
oldNewMap = {0: "Male", 1: "Female"}
df['sex_recode'] = df['SEX'].map(oldNewMap)

# Recode race
# I will be using different categories than are used in the census data.
# All individuals of Hispanic origin will be categorized as Hispanic.
# Non-Hispanic White, Non-Hispanic Black and Asian will be included as categories.
# All other races are coded as "Other"
def race_recode(row):
  if row['HISP'] > 1:
    return "Hispanic"
  elif row['RAC1P'] == 1:
    return "White"
  elif row['RAC1P'] == 2:
    return "African/American"
  elif row['RAC1P'] == 6:
    return "Asian"
  else:
    return "Other"
df['race_recode'] = df.apply(race_recode, axis=1)

# recode the HISP variable for easy readability
oldNewMap = {1: "Not Spanish/Hispanic/Latino", 2: "Mexican", 3: "Puerto Rican", 4: "Cuban", 
             5: "Dominican", 6: "Costa Rican", 7: "Guatemalan", 8: "Honduran", 9: "Nicaraguan",
            10: "Panamanian", 11: "Salvadorian", 12: "Other Central American", 13: "Argentinian",
            14: "Bolivian", 15: "Chilean", 16: "Colombian", 17: "Ecuadorian", 18: "Paraguayan",
            19: "Peruvian", 20: "Uruguayan", 21: "Venezuelan", 22: "Other South American",
            23: "Spaniard", 24: "All Other Spanish/Hispanic/Latino"}
df['detailed_hispanic_origin'] = df['HISP'].map(oldNewMap)

In [15]:
oldNewMap = {1: 1, 2: 0}
df['science_degree'] = df['SCIENGP'].map(oldNewMap)
df['science_degree'].fillna(value=0,inplace=True) # map doesn't include NA values, so they must be filled with zeroes

# Create STEM occupation outcome variable

science_job_codes = ['113021','119041','119121','151111','151121','151122','151131','151132','151133',
                          '151134','151141','151142','151143','151151','151152','151199','152011','152021',
                          '152031','152041','152099','171021','171022','172011','172021','172031','172041',
                          '172051','172061','172071','172072','172081','172111','172112','172121','172131',
                          '172141','172151','172161','172171','172199','173012','173013','173019','173021',
                          '173022','173023','173024','173025','173026','173027','173029','173031','191011',
                          '191012','191012','191021','191022','191023','191029','191031','191032','191041',
                          '191042','191099','192011','192012','192021','192031','192032','192041','192042',
                          '192043','192099','194011','194021','194031','194041','194051','194091','194092',
                          '194093','251021','251022','251032','251041','251042','251043','251051','251052',
                          '251053','251054','414011','419031']
df['science_occupation'] = df['SOCP'].isin(science_job_codes).astype(int)

In [6]:
df.pivot_table(index='detailed_hispanic_origin',values='science_degree',aggfunc='count').sort_values(ascending=False)

detailed_hispanic_origin
Not Spanish/Hispanic/Latino          112598
Mexican                               12148
Puerto Rican                           1476
All Other Spanish/Hispanic/Latino       311
Guatemalan                              266
Cuban                                   217
Colombian                               196
Ecuadorian                              159
Spaniard                                126
Honduran                                 99
Peruvian                                 93
Salvadorian                              78
Dominican                                61
Argentinian                              46
Bolivian                                 43
Nicaraguan                               35
Venezuelan                               29
Chilean                                  25
Costa Rican                              20
Panamanian                               15
Other Central American                   14
Uruguayan                                 7
Other S

In [16]:

# compare Hispanic origins by rates of science degrees
df.pivot_table(index='detailed_hispanic_origin',values='science_degree',aggfunc='mean').sort_values(ascending=False)

detailed_hispanic_origin
Uruguayan                            0.285714
Chilean                              0.200000
Panamanian                           0.200000
Argentinian                          0.173913
Costa Rican                          0.150000
Colombian                            0.137755
Cuban                                0.133641
Peruvian                             0.118280
Bolivian                             0.116279
Ecuadorian                           0.088050
Spaniard                             0.087302
Not Spanish/Hispanic/Latino          0.081689
Venezuelan                           0.068966
All Other Spanish/Hispanic/Latino    0.064309
Salvadorian                          0.051282
Dominican                            0.049180
Guatemalan                           0.048872
Puerto Rican                         0.033875
Nicaraguan                           0.028571
Mexican                              0.019098
Honduran                             0.010101
Other Cen

In [18]:

# compare Hispanic origins by rates of science occupations
df.pivot_table(index='race_recode',values='science_occupation',aggfunc='mean').sort_values(ascending=False)

race_recode
Asian               0.047009
White               0.017908
Other               0.014843
African/American    0.007671
Hispanic            0.005041
Name: science_occupation, dtype: float64

In [20]:

# compare Hispanic origins by rates of science occupations
df.pivot_table(index='race_recode',values='science_degree',aggfunc='mean').sort_values(ascending=False)

race_recode
Asian               0.238387
White               0.079698
Other               0.066158
African/American    0.039104
Hispanic            0.028697
Name: science_degree, dtype: float64