<a href="https://colab.research.google.com/github/muyezhu/connectome/blob/master/cic_skills_pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# recap of three basic python data types: numeric, string and boolean
a = 123      # a bare number literal gives a numeric value (here an int)
b = '123'    # the same characters in quotes give a string (str)
c = True     # True / False are the two boolean values (bool)
# report the type of each variable, in the order a, b, c
for value in (a, b, c):
    print(type(value))

In [0]:
# multiplying a numeric value performs arithmetic: this prints 123 * 3
print(a * 3)

In [0]:
# multiplying a string by an integer repeats the string instead of doing arithmetic
d = b * 3
print(d)
# the repeated value is still a string
print(type(d))

In [0]:
# make the pandas available
import pandas as pd
# download location of the life expectancy csv file
life_expectancy_url = 'https://drive.google.com/uc?export=download&id=1lfeD_Y4F-pMarx-M_pJM9W6aCe6pSPnr'
# parse the csv into a "DataFrame", the core tabular object of the pandas library.
# header=2 takes the column names from the third parsed line; blank lines are
# skipped, and are therefore not counted when locating that header line.
df = pd.read_csv(
    life_expectancy_url,
    sep=',',
    quotechar='"',
    header=2,
    skip_blank_lines=True,
)


In [0]:
# take a quick look at the first 10 rows of the raw data frame
# (head() returns a new, smaller data frame; df itself is unchanged)
df.head(n=10)

In [0]:
# how many rows and columns does the data frame have?
# shape is a tuple: (number of rows, number of columns)
print(df.shape)

In [0]:
# each column has a name; df.columns holds them and .values exposes them as an array
print('column names: ')
print(df.columns.values)

In [0]:
# each row has an index label (and can be uniquely identified using it);
# df.index holds the labels and .values exposes them as an array
print('row index: ')
print(df.index.values)

In [0]:
# first cleaning step: discard every column that contains nothing but NaN.
# the result is stored under a new name, so the raw frame `df` stays available.
df_clean = df.dropna(axis='columns', how='all')
df_clean.head(10)

In [0]:
# new shape after the column clean up: (number of rows, number of columns)
print(df_clean.shape)

In [0]:
# which columns are dropped?
# set difference: column names present in df but absent from df_clean
print(set(df.columns.values) - set(df_clean.columns.values))

In [0]:
# preparation for removing rows where all life expectancy data is missing:
# no rows are removed in this cell, it only extracts the first four columns for inspection
first_four_col_names = df.columns.values[0:4]
# select the first four columns with the label-based indexer .loc[rows, columns]
# (':' keeps every row)
df_first_four_cols = df.loc[:, first_four_col_names]
df_first_four_cols.head(n=10)

In [0]:
# isna(): returns a data frame of boolean values with the same shape as the input.
# for each cell, True is returned if the value is NaN (aka missing value), False otherwise
df_first_four_cols.isna().head(n=10)

In [0]:
# are there any missing values in the first four columns?
# isna() flags every NaN cell; any(axis=None) reduces over both axes, so the
# whole frame collapses to a single True / False answer.
print('any missing values in the first four columns? ', 'yes' if df_first_four_cols.isna().any(axis=None) else 'no')

In [0]:
# keep a row if at least one life expectancy cell (the year columns, '1960' to
# '2017') has numeric data (aka is not NaN).
# the year columns are selected by name -- every column that is not one of the
# first four -- rather than by counting non-NaN cells per row (thresh=5). the
# count only gives the intended answer while exactly four fully populated
# non-year columns are present, so it breaks silently when that changes.
year_col_names = df_clean.columns[~df_clean.columns.isin(first_four_col_names)]
# how='all' + subset: a row is dropped only when every year cell is NaN
df_clean = df_clean.dropna(axis=0, how='all', subset=year_col_names)
df_clean.head(n=10)

In [0]:
# shape of the cleaned data frame: (number of rows, number of columns)
print(df_clean.shape)

In [0]:
# which rows are dropped?
# set difference: index labels present in df but absent from df_clean
print(set(df.index.values) - set(df_clean.index.values))

In [0]:
# look up the dropped rows in the raw frame to see which countries they are.
# the dropped index labels are the ones present in df but missing from df_clean;
# sorted() accepts the set directly and returns an ordered list of labels.
dropped_indices = sorted(set(df.index.values) - set(df_clean.index.values))
# label-based row selection; all columns are kept
df_dropped_countries = df.loc[dropped_indices]
print(df_dropped_countries.shape)
df_dropped_countries.head(10)

In [0]:
# drop the three columns in positions 1 to 3, keeping the first column and the
# year columns.
# the names are read from the raw frame `df` instead of from df_clean by
# position: a positional drop on df_clean removes three *more* columns (actual
# year data) every time this cell is executed again. with names plus
# errors='ignore', re-running the cell is a harmless no-op.
# assumes none of df's first four columns was removed by the earlier column
# clean up, so positions 1 to 3 are the same columns in both frames.
metadata_col_names = list(df.columns[1:4])
df_clean = df_clean.drop(columns=metadata_col_names, errors='ignore')
df_clean.head()

In [0]:
# reshape from wide to long format: 'Country Name' is kept as the identifier,
# every other column name goes into 'year' and its cell value into 'life expectancy',
# giving one row per (country, year) pair
df_melt = pd.melt(
    df_clean,
    id_vars=['Country Name'],
    var_name='year',
    value_name='life expectancy',
)
df_melt.head(n=20)


In [0]:
# sanity check on the reshape: the long frame should hold one row per
# (country, year) pair, so rows(long) / rows(wide) should equal the number of years
df_clean_rows, df_clean_cols = df_clean.shape
df_melt_rows = len(df_melt)
# every column of the wide frame except the identifier column is a year
print('number of years = ', df_clean_cols - 1)
print(df_melt_rows / df_clean_rows)

In [0]:
# the year labels were column names, so convert them to integers before
# using them as a numeric x axis
df_melt['year'] = df_melt['year'].astype('int32')
# one small dot (s=1) per (country, year) observation
df_melt.plot(kind='scatter', x='year', y='life expectancy', s=1)

In [0]:
# what happened at ~ 1978?
# idxmin() gives the index label of the row with the lowest value in the '1978' column
min_index_1978 = df_clean['1978'].idxmin()
# look up that row's country with a single .loc[row label, column name] access
print(df_clean.loc[min_index_1978, 'Country Name'])

In [0]:
# select the rows belonging to Cambodia (one row per year in the long format):
# eq() builds a boolean mask, and .loc keeps the rows where the mask is True
df_cambodia = df_melt.loc[df_melt['Country Name'].eq('Cambodia')]
df_cambodia.head()

In [0]:
# line plot of Cambodia's life expectancy over the years
df_cambodia.plot(x='year', y='life expectancy')

In [0]:
# Cambodian genocide (April 17, 1975 – January 7, 1979): the death toll is estimated at 1.5 to 2 million, nearly a quarter of the country's population at the time

In [0]:
# same view for Timor-Leste: keep only its rows of the long frame,
# then draw life expectancy against year as a line plot
df_timor_leste = df_melt.loc[df_melt['Country Name'].eq('Timor-Leste')]
df_timor_leste.plot(x='year', y='life expectancy')

In [0]:
# Indonesian occupation of East Timor (1975–1999)

In [0]:
#@title
# Rwandan genocide
# find the country with the lowest life expectancy in the '1992' column
min_index_1992 = df_clean['1992'].idxmin()
print(df_clean.loc[min_index_1992, 'Country Name'])
# then plot Rwanda's life expectancy over the years
df_rwanda = df_melt.loc[df_melt['Country Name'].eq('Rwanda')]
df_rwanda.plot(x='year', y='life expectancy')