In [None]:

#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## DAY1 DATA VIZ WITH PYTHON ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs



In [None]:
#=================================================-
#### Slide 19: Loading packages  ####

import pandas as pd
import numpy as np
import pickle
import os
import matplotlib.pyplot as plt



In [None]:
#=================================================-
#### Slide 20: Directory settings  ####

# `home_dir` is the current user's home directory (what "~" expands to),
# not the filesystem root.
home_dir = os.path.expanduser("~")
# `main_dir` is the `skillsoft-data-viz-with-python` course folder, expected on the Desktop.
main_dir = os.path.join(home_dir, "Desktop", "skillsoft-data-viz-with-python")
# Build the paths of the two sub-folders of `main_dir`:
#   `data_dir` -> where the datasets live
#   `plot_dir` -> where plots will be saved
# NOTE: this only builds the path strings; no folder is created on disk here.
data_dir, plot_dir = (os.path.join(main_dir, leaf) for leaf in ("data", "plots"))



In [None]:
#=================================================-
#### Slide 21: Working directory  ####

# Set working directory to the data folder so that later cells can
# read and write files using bare file names.
os.chdir(data_dir)
# Check working directory: print where we are now to confirm the switch.
print(os.getcwd())



In [None]:
#=================================================-
#### Slide 26: Load the dataset  ####

# Read the CSV by bare file name; the path is resolved relative to the
# current working directory.
household_poverty = pd.read_csv("costa_rica_poverty.csv")
# Preview the first rows to check that the file loaded as expected.
print(household_poverty.head())



In [None]:
#=================================================-
#### Slide 28: Subsetting data  ####

# Keep only the columns used for the visualizations (all rows are retained).
costa_viz = household_poverty.loc[:, [
    'household_id',
    'ppl_total', 'dependency_rate', 'num_adults',
    'rooms', 'age', 'monthly_rent',
    'Target',
]]
print(costa_viz.head())



In [None]:
#=================================================-
#### Slide 29: Remove labels  ####

# Prepare the data for visualizations: the `household_id` column is only an
# identifier, so drop it and keep every other variable.
# Note: re-running this cell on its own raises a KeyError, because the column
# is already gone after the first run.
costa_viz = costa_viz.drop(columns = ['household_id'])
print(costa_viz.head())



In [None]:
#=================================================-
#### Slide 30: Data prep: clean NAs  ####

# Count the missing values in each column.
print(costa_viz.isna().sum())



In [None]:
#=================================================-
#### Slide 31: Data cleaning: NAs  ####

# Replace missing values with the mean of their column and keep the imputed
# dataset under the same name.
# `numeric_only=True` restricts the means to numeric columns, so the call does
# not raise on pandas >= 2.0 if a non-numeric column is ever present; for an
# all-numeric dataframe the result is unchanged.
costa_viz = costa_viz.fillna(costa_viz.mean(numeric_only = True))
# Re-count missing values per column (this covers monthly_rent as well);
# the imputed numeric columns should now show 0.
print(costa_viz.isnull().sum())



In [None]:
#=================================================-
#### Slide 32: Converting the target variable  ####

# Collapse `Target` into two string labels: values of 3 or less become
# 'vulnerable', everything else becomes 'non_vulnerable'.
# This cell expects the original numeric `Target`; to redo this step, re-run
# the notebook from the data-loading cell instead of re-running this cell alone.
costa_viz['Target'] = np.where(costa_viz['Target'].le(3),
                               'vulnerable',
                               'non_vulnerable')
print(costa_viz['Target'].head())



In [None]:
#=================================================-
#### Slide 33: Data prep: target  ####

# Show the type of `Target` before the conversion.
print(costa_viz["Target"].dtype)
# Turn the labels into booleans: True for "non_vulnerable", False otherwise.
costa_viz["Target"] = costa_viz["Target"] == "non_vulnerable"

# Check the type again; it should now be boolean.
print(costa_viz["Target"].dtype)



In [None]:
#=================================================-
#### Slide 35: Pickle cleaned dataset  ####

# Save the cleaned dataset to `costa_viz.sav` in the current working directory.
# The `with` block closes the file as soon as the dump finishes, so the data is
# flushed to disk and the file handle is not left open.
with open("costa_viz.sav", "wb") as pickle_file:
    pickle.dump(costa_viz, pickle_file)



In [None]:
#=================================================-
#### Slide 37: Exercise 1  ####





In [None]:
#=================================================-
#### Slide 42: Splitting using groupby()  ####

# Split the rows into groups, one per distinct value of `Target`.
grouped = costa_viz.groupby(by = 'Target')



In [None]:
#=================================================-
#### Slide 45: Prepare data: group and summarize (cont'd)  ####

# Compute the per-group mean of the listed variables.
# The columns are selected BEFORE aggregating, so only the needed means are
# computed and unrelated columns cannot make the aggregation fail.
costa_grouped_mean = grouped[['ppl_total','dependency_rate','num_adults','rooms','age']].mean()
print(costa_grouped_mean)
# Reset the index so that `Target` becomes a regular column again
# instead of being the row index.
costa_grouped_mean = costa_grouped_mean.reset_index()
print(costa_grouped_mean)



In [None]:
#=================================================-
#### Slide 48: Wide to long format: melt (cont'd)  ####

# Reshape the wide table of group means into long format:
# one row per (Target, metric) pair.
costa_grouped_mean_long = costa_grouped_mean.melt(
    id_vars = ['Target'],    #<- column that identifies each group
    var_name = 'metric',     #<- new column holding the wide column names
    value_name = 'mean',     #<- new column holding the corresponding values
)
print(costa_grouped_mean_long)




In [None]:
#=================================================-
#### Slide 50: Long to wide format: pivot (cont'd)  ####

# Pivot the long data back into wide format (the reverse of the melt step):
# one row per `Target` value and one column per metric.
costa_grouped_mean_wide = costa_grouped_mean_long.pivot(
                                                    index = 'Target',   #<- identifying variable
                                                    columns = 'metric', #<- col names of wide data
                                                    values = 'mean')    #<- values from above columns
print(costa_grouped_mean_wide)



In [None]:
#=================================================-
#### Slide 51: Pickle grouped data frames  ####

# Save both grouped data frames to the current working directory.
# Each `with` block closes its file as soon as the dump finishes, so the data
# is flushed to disk and no file handle is left open.
with open("costa_grouped_mean_long.sav", "wb") as pickle_file:
    pickle.dump(costa_grouped_mean_long, pickle_file)
with open("costa_grouped_mean_wide.sav", "wb") as pickle_file:
    pickle.dump(costa_grouped_mean_wide, pickle_file)

