<a href="https://colab.research.google.com/github/mvince33/Coding-Dojo/blob/main/mock_belt_exam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [56]:
from google.colab import drive
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [57]:
# Mount Google Drive first: the dataset path below lives under /content/drive,
# which does not exist on a fresh Colab runtime until the drive is mounted.
drive.mount('/content/drive')

# Read the literacy-rate CSV into the working DataFrame and preview the first rows.
path = '/content/drive/MyDrive/Coding Dojo/03 Week 3: Exploratory Viz/literacy_rates_updated.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,Region,Country,Year,Age,Gender,Literacy rate
0,Central and Southern Asia,Afghanistan,2011,<15,female,0.1761206
1,Central and Southern Asia,Afghanistan,2011,<15,male,0.454171
2,,Afghanistan,2011,15-24,female,0.3211322
3,,Afghanistan,2011,15-24,male,0.6187907
4,Central and Southern Asia,Afghanistan,2011,25-64,female,0.0841276


In [58]:
# Duplicate handling: count, inspect, remove, re-count.
# Only the final expression of the cell is rendered as its output.

# How many rows are exact repeats of an earlier row?
df.duplicated().sum()

# Every row that takes part in a duplicate group (keep = False marks all copies).
df.loc[df.duplicated(keep = False), :]

# Remove the repeated rows from the working frame itself.
df.drop_duplicates(inplace = True)

# Should now be zero.
df.duplicated().sum()

0

In [59]:
# Check for missing values.
# df.info() lists the non-null count of every column next to the total number
# of entries, so any column whose count is lower has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3303 entries, 0 to 3312
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Region         3299 non-null   object
 1   Country        3303 non-null   object
 2   Year           3303 non-null   object
 3   Age            3303 non-null   object
 4   Gender         3303 non-null   object
 5   Literacy rate  3298 non-null   object
dtypes: object(6)
memory usage: 180.6+ KB


In [60]:
# Inspect the rows whose 'Region' is missing.
df.loc[df['Region'].isna(), :]

# Use the entry in 'Country' to determine the missing 'Region': for every row,
# look up the first non-missing region recorded for the same country and use it
# to fill the gaps. (For this dataset the affected rows were found to belong to
# 'Central and Southern Asia'.)
# Assumes each country is listed under a single region - TODO confirm.
region_by_country = df.groupby('Country')['Region'].transform('first')
df['Region'] = df['Region'].fillna(region_by_country)

# A country with no region recorded on any row cannot be resolved this way;
# fail loudly instead of carrying the gap into later cells.
assert df['Region'].notna().all(), "Some rows still have no 'Region'"

# Number of rows still missing a region (expected: 0).
df['Region'].isna().sum()

0

In [61]:
# Show the rows whose 'Literacy rate' is missing.
print(df.loc[df['Literacy rate'].isna(), :])

# Exploratory look-ups backing the observations below. Only the last expression
# of a cell is rendered, so run these on their own to see their results.
# 'Literacy rate' is still an object column at this point, so sort_values()
# orders the raw stored values rather than parsed numbers.

# Literacy rates of people in Spain younger than 65.
df.loc[(df['Country'] == 'Spain') & (df['Age'] != '65+'), 'Literacy rate'].sort_values()

# Literacy rates of 15-24 year olds in Europe and Northern America.
df.loc[(df['Region'] == 'Europe and Northern America') & (df['Age'] == '15-24'), 'Literacy rate'].sort_values()

# Literacy rates of people over 65 in Singapore.
df.loc[(df['Country'] == 'Singapore') & (df['Age'] == '65+'), :].sort_values(by = 'Literacy rate')

# Observations:
# - The literacy rate in Spain for those under 65 is between 97% and 100%.
# - The literacy rate in Europe and Northern America for people between 15 and 24
#   is also between 97% and 100%.
# - In 2010 the literacy rate for men over 65 in Singapore was reported as about
#   94.6%, and the 2015 value was reported as missing. The literacy rate for women
#   over 65 in Singapore increased by about 12%. We assume that "the rising tide
#   lifts all boats", so men over 65 also enjoyed a significant increase in literacy.
#   NOTE(review): the missing Singapore row printed above is dated 2010, not 2015 -
#   double-check the years quoted in this remark.
#
# Decision: every missing literacy rate, including the Singapore row, is imputed
# with the average literacy rate of the region 'Europe and Northern America'.

# Average literacy rate for 'Europe and Northern America'. The column holds
# non-numeric objects, so the non-missing values are cast to float first.
avg_lit_rate = (
    df.loc[df['Region'] == 'Europe and Northern America', 'Literacy rate']
    .dropna()
    .astype(float)
    .mean()
)

# Fill the missing values in 'Literacy rate' only. A frame-wide
# df.fillna(avg_lit_rate) would also write this number into any other column
# that happened to contain a missing value.
df['Literacy rate'] = df['Literacy rate'].fillna(avg_lit_rate)

                             Region    Country  Year    Age  Gender  \
498  Eastern and South-Eastern Asia  Singapore  2010    65+    male   
608     Europe and Northern America    Albania  2012  15-24  female   
753     Europe and Northern America   Portugal  2011  15-24    male   
855     Europe and Northern America      Spain  2010    <15  female   
884     Europe and Northern America      Spain  2013  25-64    male   

    Literacy rate  
498           NaN  
608           NaN  
753           NaN  
855           NaN  
884           NaN  



In [62]:
# Per-column count of missing entries; every count should be zero after the fills.
df.isnull().sum()

Region           0
Country          0
Year             0
Age              0
Gender           0
Literacy rate    0
dtype: int64

In [63]:
# We now ensure each column is of the correct data type.
df.info()

# 'Literacy rate' cannot be cast with .astype(float) directly: attempting it
# revealed an entry that contains a percent sign. Locate such entries.
# na = False matters here: entries that are not strings (such as the floats
# imputed earlier) make .str.contains return NaN, and casting NaN to bool gives
# True, which would wrongly flag rows that have no percent sign at all.
has_percent = df['Literacy rate'].str.contains('%', regex = False, na = False)
df[has_percent]

# Only one offending row was found (index label 12), so it is adjusted manually.
# NOTE(review): the replacement 0.4538 is hard-coded - confirm it against the raw
# value whenever the source file changes.
df.loc[12, 'Literacy rate'] = 0.4538

# Now convert the literacy rate column to float.
df['Literacy rate'] = df['Literacy rate'].astype(float)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3303 entries, 0 to 3312
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Region         3303 non-null   object
 1   Country        3303 non-null   object
 2   Year           3303 non-null   object
 3   Age            3303 non-null   object
 4   Gender         3303 non-null   object
 5   Literacy rate  3303 non-null   object
dtypes: object(6)
memory usage: 180.6+ KB


In [64]:
# Check that the 'Literacy rate' column has been fixed.
df.info()

# We need to fix the 'Year' column now. A direct df['Year'].astype(int)
# revealed a year that contains an underscore; locate the affected entries.
# The mask gets its own name so the builtin `filter` is not shadowed.
has_underscore = df['Year'].str.contains('_', regex = False, na = False)
df.loc[has_underscore, 'Year']

# Remove the underscore by replacing it with an empty string (literal match).
df['Year'] = df['Year'].str.replace('_', '', regex = False)

# Now convert the years to ints.
df['Year'] = df['Year'].astype(int)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3303 entries, 0 to 3312
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Region         3303 non-null   object 
 1   Country        3303 non-null   object 
 2   Year           3303 non-null   object 
 3   Age            3303 non-null   object 
 4   Gender         3303 non-null   object 
 5   Literacy rate  3303 non-null   float64
dtypes: float64(1), object(5)
memory usage: 309.7+ KB


In [65]:
# Re-inspect the frame summary (non-null counts and dtypes) after the 'Year' conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3303 entries, 0 to 3312
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Region         3303 non-null   object 
 1   Country        3303 non-null   object 
 2   Year           3303 non-null   int64  
 3   Age            3303 non-null   object 
 4   Gender         3303 non-null   object 
 5   Literacy rate  3303 non-null   float64
dtypes: float64(1), int64(1), object(4)
memory usage: 309.7+ KB
