# Day 1 - Data viz with Python - Exercises with answers

## Exercise 1

#### Question 1
##### Import the required packages.
##### Set the working directory to the data directory.
##### Print the working directory.

#### Answer:

In [40]:
import numpy as np
import pandas as pd
import os
import pickle
import matplotlib.pyplot as plt

In [50]:
# `home_dir` is the current user's home directory (what "~" expands to).
home_dir = os.path.expanduser("~")

# The `skillsoft-data-viz-with-python` course folder is expected on the Desktop.
main_dir = os.path.join(home_dir, "Desktop", "skillsoft-data-viz-with-python")

# Paths of the `data` and `plots` sub-folders of the course folder.
# These are only path strings; nothing is created on disk here.
data_dir, plot_dir = (os.path.join(main_dir, sub_dir) for sub_dir in ("data", "plots"))

In [51]:
# Set working directory.
# Later cells read and write files by bare filename, so those names resolve
# against `data_dir` once this call has run.
os.chdir(data_dir)

In [None]:
# Check working directory.
# Prints the current working directory; after the `os.chdir` call it should
# correspond to `data_dir`.
print(os.getcwd())

#### Question 2
##### Load the `chicago_census.csv` dataset. 
##### Save it as `chicago_census`.
##### View the first few rows of `chicago_census`.

#### Answer:

In [46]:
# Read the census CSV; the bare filename is resolved against the current
# working directory.
census_file = "chicago_census.csv"
chicago_census = pd.read_csv(census_file)

# Preview the first five rows.
chicago_census.head(n=5)

Unnamed: 0,Community Area Number,COMMUNITY AREA NAME,PERCENT OF HOUSING CROWDED,PERCENT HOUSEHOLDS BELOW POVERTY,PERCENT AGED 16+ UNEMPLOYED,PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA,PERCENT AGED UNDER 18 OR OVER 64,PER CAPITA INCOME,HARDSHIP INDEX
0,1.0,Rogers Park,7.7,23.6,8.7,18.2,27.5,23939,39.0
1,2.0,West Ridge,7.8,17.2,8.8,20.8,38.5,23040,46.0
2,3.0,Uptown,3.8,24.0,8.9,11.8,22.2,35787,20.0
3,4.0,Lincoln Square,3.4,10.9,8.2,13.4,25.5,37524,17.0
4,5.0,North Center,0.3,7.5,5.2,4.5,26.2,57123,6.0


#### Question 3

##### Rename the columns as follows (and double-check that they were renamed by looking at `.columns`):

In [18]:
# Map each raw CSV header to a short snake_case column name.
# NOTE: the 'PER CAPITA INCOME ' key deliberately keeps its trailing space.
# The recorded `.columns` output shows this column renamed, so the raw header
# presumably carries that space -- verify against the CSV before "tidying" it.
column_names = {
    'Community Area Number': 'community_number',
    'COMMUNITY AREA NAME': 'community_area',
    'PERCENT OF HOUSING CROWDED': 'percent_house_crowded',
    'PERCENT HOUSEHOLDS BELOW POVERTY': 'percent_house_below_poverty',
    'PERCENT AGED 16+ UNEMPLOYED': 'percent_16_unemployed',
    'PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA': 'percent_25_without_diploma',
    'PERCENT AGED UNDER 18 OR OVER 64': 'percent_dependent',
    'PER CAPITA INCOME ': 'per_capita_income',
    'HARDSHIP INDEX': 'hardship_index',
}

# Rebind the renamed frame instead of using `inplace=True`. Labels that are
# not present in the frame are ignored by default, so re-running this cell
# after the rename is harmless.
chicago_census = chicago_census.rename(columns=column_names)

#### Answer:

In [19]:
# Check the columns were renamed.
# Every label in the displayed Index should be one of the new snake_case names.
chicago_census.columns

Index(['community_number', 'community_area', 'percent_house_crowded',
       'percent_house_below_poverty', 'percent_16_unemployed',
       'percent_25_without_diploma', 'percent_dependent', 'per_capita_income',
       'hardship_index'],
      dtype='object')

#### Question 4
##### Drop columns `community_number` and `community_area` from the dataframe.
##### Look for NAs in `chicago_census` and impute with the mean of the column.
##### Check for NAs again to make sure the data looks good.

#### Answer:

In [20]:
# Drop the two community identifier columns by label.
# `columns=` takes the label list directly, so no intermediate sub-frame is
# built just to name the columns to remove.
chicago_census = chicago_census.drop(columns=['community_number', 'community_area'])

In [21]:
# Count the missing values in each column.
chicago_census.isna().sum()

percent_house_crowded          0
percent_house_below_poverty    0
percent_16_unemployed          0
percent_25_without_diploma     0
percent_dependent              0
per_capita_income              0
hardship_index                 1
dtype: int64

In [22]:
# Impute missing `hardship_index` values with the mean of that column.
hardship_mean = chicago_census['hardship_index'].mean()
chicago_census['hardship_index'] = chicago_census['hardship_index'].fillna(hardship_mean)

# Re-count missing values per column to confirm none remain.
chicago_census.isna().sum()

percent_house_crowded          0
percent_house_below_poverty    0
percent_16_unemployed          0
percent_25_without_diploma     0
percent_dependent              0
per_capita_income              0
hardship_index                 0
dtype: int64

#### Question 5
##### Create a variable `mean_per_capita_income` which contains the mean of `per_capita_income` from `chicago_census`.
##### Convert the `per_capita_income` variable to a binary variable: set it to 0 if the row's `per_capita_income` is less than `mean_per_capita_income`, otherwise set it to 1.
##### Make a duplicate of the dataframe called `ex_viz`.

#### Answer:

In [23]:
# Mean of the `per_capita_income` column; used as the threshold below.
# Caution: this cell overwrites the column, so re-running it recomputes the
# mean from the already binarized values.
mean_per_capita_income = chicago_census['per_capita_income'].mean()

# Binarize: 0 where income is below the mean, 1 otherwise.
chicago_census['per_capita_income'] = np.where(
    chicago_census['per_capita_income'] < mean_per_capita_income, 0, 1
)

# `.copy()` makes `ex_viz` an independent duplicate. Plain assignment would
# only bind a second name to the same DataFrame object, so later changes to
# either name would show up in both.
ex_viz = chicago_census.copy()

#### Question 6
##### Create a pickle file for the cleaned dataframe `ex_viz`.

#### Answer:

In [24]:
# Serialize `ex_viz` to `ex_viz.sav` in the current working directory.
# The `with` block closes the file handle as soon as the dump finishes, even
# if pickling raises.
with open("ex_viz.sav", "wb") as pickle_file:
    pickle.dump(ex_viz, pickle_file)

## Exercise 2

#### Question 1

##### Group `ex_viz` data by the `per_capita_income` variable. Save as `ex_grouped`.
##### Then summarize the numeric variables `percent_house_crowded`, `percent_house_below_poverty`, `percent_16_unemployed`, `percent_25_without_diploma`, `percent_dependent`, and `hardship_index` by `per_capita_income` using their means (`community_number` was dropped in Exercise 1, so it is not included). Save as `ex_grouped_mean` and print.
##### Reset its index and print the result.

#### Answer:

In [25]:
# Build a GroupBy object keyed on the `per_capita_income` column.
ex_grouped = ex_viz.groupby(by='per_capita_income')

In [26]:
# Columns to summarize, in the order they should appear in the result.
summary_columns = [
    'percent_house_crowded',
    'percent_house_below_poverty',
    'percent_16_unemployed',
    'percent_25_without_diploma',
    'percent_dependent',
    'hardship_index',
]

# Per-group mean of each selected column.
ex_grouped_mean = ex_grouped[summary_columns].mean()
print(ex_grouped_mean)

                   percent_house_crowded  percent_house_below_poverty  \
per_capita_income                                                       
0                               6.288000                    26.780000   
1                               2.478571                    12.739286   

                   percent_16_unemployed  percent_25_without_diploma  \
per_capita_income                                                      
0                              18.810000                   26.092000   
1                               9.146429                   10.042857   

                   percent_dependent  hardship_index  
per_capita_income                                     
0                          38.834000       66.640000  
1                          30.153571       18.910946  


In [27]:
# Recompute the group means from `ex_grouped` (over all of its columns, not
# just a listed subset) and move the `per_capita_income` group labels from the
# index into a regular column.
ex_grouped_mean = ex_grouped.mean().reset_index()
print(ex_grouped_mean)

   per_capita_income  percent_house_crowded  percent_house_below_poverty  \
0                  0               6.288000                    26.780000   
1                  1               2.478571                    12.739286   

   percent_16_unemployed  percent_25_without_diploma  percent_dependent  \
0              18.810000                   26.092000          38.834000   
1               9.146429                   10.042857          30.153571   

   hardship_index  
0       66.640000  
1       18.910946  


#### Question 2

##### Notice the format of `ex_grouped_mean`. We wish to convert it from wide to long format.
##### Use the `pd.melt()` function and convert it to long format. Save as `ex_grouped_mean_long` and print the result.

#### Answer:

In [28]:
# Reshape from wide to long: one row per (income group, metric) pair.
ex_grouped_mean_long = pd.melt(
    frame=ex_grouped_mean,            # wide dataset
    id_vars=['per_capita_income'],    # identifier column, kept as-is
    var_name='metric',                # new column holding the former column names
    value_name='mean',                # new column holding the matching values
)
print(ex_grouped_mean_long)

    per_capita_income                       metric       mean
0                   0        percent_house_crowded   6.288000
1                   1        percent_house_crowded   2.478571
2                   0  percent_house_below_poverty  26.780000
3                   1  percent_house_below_poverty  12.739286
4                   0        percent_16_unemployed  18.810000
5                   1        percent_16_unemployed   9.146429
6                   0   percent_25_without_diploma  26.092000
7                   1   percent_25_without_diploma  10.042857
8                   0            percent_dependent  38.834000
9                   1            percent_dependent  30.153571
10                  0               hardship_index  66.640000
11                  1               hardship_index  18.910946


#### Question 3
##### Now use the `pd.pivot()` function to convert `ex_grouped_mean_long` to wide format.
##### Save as `ex_grouped_mean_wide` and print.

#### Answer:

In [29]:
# Pivot the long data back into wide format.
ex_grouped_mean_wide = ex_grouped_mean_long.pivot(
    index='per_capita_income',   # identifier: becomes the row index
    columns='metric',            # its values become the wide column names
    values='mean',               # fills the cells of the wide table
)
print(ex_grouped_mean_wide)

metric             hardship_index  percent_16_unemployed  \
per_capita_income                                          
0                       66.640000              18.810000   
1                       18.910946               9.146429   

metric             percent_25_without_diploma  percent_dependent  \
per_capita_income                                                  
0                                   26.092000          38.834000   
1                                   10.042857          30.153571   

metric             percent_house_below_poverty  percent_house_crowded  
per_capita_income                                                      
0                                    26.780000               6.288000  
1                                    12.739286               2.478571  


#### Question 4
##### Pickle the data frames `ex_grouped_mean_wide` and `ex_grouped_mean_long`.


#### Answer:

In [30]:
# Serialize each reshaped frame to its own `.sav` file in the current working
# directory. The `with` block closes every file handle as soon as its dump
# finishes, even if pickling raises.
for file_name, frame in (
    ("ex_grouped_mean_long.sav", ex_grouped_mean_long),
    ("ex_grouped_mean_wide.sav", ex_grouped_mean_wide),
):
    with open(file_name, "wb") as pickle_file:
        pickle.dump(frame, pickle_file)