# Load libraries and dataframe

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Location of the adult census income CSV, relative to the working directory.
ADULT_CSV_PATH = "../input/adult-census-income/adult.csv"

# Read the raw data set into a DataFrame and display it.
df = pd.read_csv(ADULT_CSV_PATH)
df

# Initial exploration of the data


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Preview the first rows of the frame (head() defaults to five rows).
df.head()

# Data Cleaning: Remove rows with question marks

In [None]:
# Drop every row in which any column holds the '?' placeholder.
# Comparing the whole frame at once covers all columns without having to list
# them one by one, so a column cannot be forgotten.
has_placeholder = df.eq('?').any(axis=1)

# .copy() makes the cleaned frame independent of the loaded one, so columns can
# be added to it later without triggering pandas' SettingWithCopyWarning.
df = df.loc[~has_placeholder].copy()


# Data Cleaning: Correcting column names to be more appropriate for Pandas

In [None]:
# Map the dotted / long column names to short names that are valid Python
# identifiers (usable as df.colname). Columns not listed keep their name.
# Renaming by name instead of overwriting df.columns positionally cannot
# mislabel a column if the order ever differs, and re-running this cell is a
# harmless no-op even after extra columns have been added to the frame.
COLUMN_RENAMES = {
    'education': 'edu',
    'education.num': 'edu_num',
    'marital.status': 'marital',
    'capital.gain': 'capgain',
    'capital.loss': 'caploss',
    'hours.per.week': 'hourspw',
    'native.country': 'country',
}
df = df.rename(columns=COLUMN_RENAMES)
df

# Data Cleaning: Making a new column that is the difference of capgain and caploss

In [None]:
# Net capital change per row: capital gain minus capital loss.
# assign() returns a new frame with the extra column, so nothing is written
# into a possible view of another frame and there is no need to switch off
# pandas' chained-assignment warning for the whole session.
df = df.assign(capagg=df["capgain"] - df["caploss"])
df

# Data Cleaning: Making a subset that only has people born outside of the US

In [None]:
# Keep only the rows whose country is anything other than "United-States".
is_foreign = df['country'] != "United-States"
foreignborn = df[is_foreign]
foreignborn

# Summarizing some aspect of the data


In [None]:
# Descriptive statistics for the full data set; kept for the comparison below.
df_summary = df.describe()
print("Descriptive statistics of entire data base")
df_summary

In [None]:
# Descriptive statistics for the foreign-born subset only.
fb_summary = foreignborn.describe()
print("Descriptive statistics of immigrants subset")
fb_summary

# Summarizing some aspect of the data: What are the differences between the general df and the immigrants df?

In [None]:
# Element-wise gap between the two describe() tables: whole data set minus the
# foreign-born subset. A positive value means the overall figure is higher.
ne = df_summary - fb_summary
ne

In [None]:
# (ggplot(mpg)
#  + aes(x='displ', y='hwy', color='class')
#  + geom_point()
#  + labs(title='Engine Displacement vs. Highway Miles per Gallon', x='Engine Displacement, in Litres', y='Highway Miles per Gallon')
# )

from pandas.api.types import CategoricalDtype
from plotnine import *
from plotnine.data import mpg
%matplotlib inline
(ggplot(foreignborn)
+ aes(x = 'country', fill = 'income') 
+ geom_bar(position = "fill") 
+ coord_flip()
+ labs(y = "Proportion"))

In [None]:
# Share of each workclass within every country of the foreign-born subset,
# drawn as horizontal bars normalised to the same length.
(
    ggplot(foreignborn, aes(x = 'country', fill = 'workclass'))
    + geom_bar(position = "fill")
    + coord_flip()
    + labs(y = "Proportion")
)

In [None]:
# Share of each income category within every workclass of the foreign-born
# subset, drawn as horizontal bars normalised to the same length.
(
    ggplot(foreignborn, aes(x = 'workclass', fill = 'income'))
    + geom_bar(position = "fill")
    + coord_flip()
    + labs(y = "Proportion")
)

# Data Visualization: Box plots of education and income vs relationship roles 

In [None]:
# Box plot of edu_num for each relationship value, split by income.
# Note: catplot returns a seaborn FacetGrid, even though it is stored as `ax`.
ax = sns.catplot(
    data=df,
    kind="box",
    x="edu_num",
    y="relationship",
    hue="income",
)


# Data Visualization: Box plots of education level and race

In [None]:
# Box plot of edu_num for each race value, over the whole data set.
sns.boxplot(data=df, x="edu_num", y="race")


# Data Visualization: Boxplots of education level and race for immigrants

In [None]:
# Box plot of edu_num for each race value, restricted to the foreign-born subset.
sns.boxplot(data=foreignborn, x="edu_num", y="race")


# Data Visualization: Education level, sex vs race

In [None]:
# Box plot of edu_num for each race value, split by sex.
# Note: catplot returns a seaborn FacetGrid, even though it is stored as `ax`.
ax = sns.catplot(
    data=df,
    kind="box",
    x="edu_num",
    y="race",
    hue="sex",
)


# Data Visualization: Marital status, sex vs education level

In [None]:
# Box plot of edu_num for each marital value, split by sex.
# Note: catplot returns a seaborn FacetGrid, even though it is stored as `ax`.
ax = sns.catplot(
    data=df,
    kind="box",
    x="edu_num",
    y="marital",
    hue="sex",
)


# Data Visualization: Marital status, sex vs hours of work per week

In [None]:
# Box plot of hourspw (hours worked per week) for each marital value, split by sex.
ax = sns.boxplot(
    data=df,
    x="hourspw",
    y="marital",
    hue="sex",
)


# Data Visualization: Marital Status, income, vs hours of work per week

In [None]:
# Box plot of hourspw (hours worked per week) for each marital value, split by income.
ax = sns.boxplot(
    data=df,
    x="hourspw",
    y="marital",
    hue="income",
)


# Data Visualization: Marital Status, income, vs education level

In [None]:
# Box plot of edu_num for each marital value, split by income.
ax = sns.boxplot(
    data=df,
    x="edu_num",
    y="marital",
    hue="income",
)


# Data Visualization: Age vs Education level

In [None]:
# Regression plot of edu_num against age. x_estimator=np.mean collapses the
# observations at each distinct age into their mean before plotting the points.
# The trailing semicolon suppresses the text repr of the returned object.
sns.lmplot(
    data=df,
    x="age",
    y="edu_num",
    x_estimator=np.mean,
);
