In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats.stats import pearsonr
from scipy.stats import gaussian_kde

In [34]:
#loading up data frame and subsetting it to Texas and variable of interests
dfb = pd.read_csv('/home/minh/portfolio/census/pums/ss13pusb.csv')
data = dfb.loc[dfb['ST'] == 48, ['ST','WAGP','RAC1P', "JWMNP"]]

#renaming columns
data = data.rename(columns = {
    'ST': 'state',
    'WAGP': 'wage',
    'RAC1P': 'race',
    'JWMNP': 'commute'
})
#save file 
data.to_csv('/home/minh/portfolio/census/data.csv') 

In [3]:
#use this file for subsequent analysis
data = pd.read_csv('/home/minh/portfolio/census/data.csv')

In [4]:
#preprocessing and exploratory analysis

#removing missing data
data = data[np.isfinite(data['commute'])] 

#renaming the numeric code into strings
data['race'] = data['race'].astype('category')
race = {"White": [1], "Black": [2], "American or Alaskan Native": [3,4,5], "Asian/Pacific Islander": [6,7],
    "Other Race": [8], "Two or More Races": [9]}
race2 = {v: k for k,vv in race.items() for v in vv}
data['race'] = data['race'].map(race2).astype("category", categories = set(race2.values()))

#scaling data
data['commute'].describe()
data['wage'].describe()
data['wage_scaled'] = (data['wage'] - data['wage'].mean())/data['wage'].std()
data['wage_scaled'].describe()
data_scaled = data[(data['wage_scaled'] < 3) & (data['wage_scaled'] > -3)]

In [5]:
data_scaled['wage'].describe()

count    104682.000000
mean      39935.226744
std       36003.812477
min           0.000000
25%       14000.000000
50%       30000.000000
75%       55000.000000
max      211000.000000
Name: wage, dtype: float64

In [75]:
#geting histogram of incomes
plt.hist(data['wage'], 50, facecolor='green')
plt.xlabel('Wage/Income in the last 12 months')
plt.ylabel('Number of People')
plt.title('Histogram of Income')
plt.axis([data['wage'].min(), data['wage'].max(), 0, 200000)
plt.show()

#plotting wages and commute time
plt.plot(data_scaled['wage'], data_scaled['commute'], 'ro', alpha = 0.2)
plt.xlabel('Wage/Income in the Last 12 Months')
plt.ylabel('Commute Times (in minutes)')
plt.show()

# different view of plotting, looking at the density of where the data converges
xy = np.vstack([data_scaled['wage'], data_scaled['commute']])
z = gaussian_kde(xy)(xy)
fig, ax = plt.subplots()
ax.scatter(data_scaled['wage'], data_scaled['commute'], c=z, edgecolor='')
plt.show()

In [31]:
if (data['wage'] < 9225):
    data['incomeGroup'] = "Low"

SyntaxError: invalid syntax (<ipython-input-31-f8e10130d1be>, line 5)

In [35]:
#breaking income into US tax brackets
data['income_group'] = pd.cut(data['wage'], bins = [0,9225,37450,90750,189300,411500],
                             labels = ['lowest', 'low', 'middle','high','very high'])
data['income_group'].value_counts()

low          41768
middle       35011
lowest       12839
high          8752
very high      615
dtype: int64