# 1. Let's load needed libraries

In [12]:
# we import the pandas library and give it the short alias "pd"
import pandas as pd

# 2. Let's load the gapminder dataset

In [13]:
# Load the file "gapminder.tsv" with pandas.read_csv(). The file is not local:
# it is fetched over HTTPS from the URL below, so this cell needs network access.

# the remote location is: https://raw.githubusercontent.com/thousandoaks/BEMM458/master/data/

# sep='\t' tells read_csv that the columns are separated by tabs rather than commas
# (read_csv assumes commas unless told otherwise).

df = pd.read_csv('https://raw.githubusercontent.com/thousandoaks/BEMM458/master/data/gapminder.tsv', sep='\t')



### df is a DataFrame.
### DataFrames are core entities in data analytics

In [14]:
# check what kind of object read_csv gave us back
type(df)

pandas.core.frame.DataFrame

# 3. Let's observe our data

In [15]:
# head() called without an argument shows the first 5 rows of the DataFrame
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [16]:
# shape gives the size of our dataset as (number of rows, number of columns)
df.shape

(1704, 6)

In [17]:
# info() prints a summary of the dataset: the index range, and for every column
# its name, how many non-null values it holds and its data type, plus memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
country      1704 non-null object
continent    1704 non-null object
year         1704 non-null int64
lifeExp      1704 non-null float64
pop          1704 non-null int64
gdpPercap    1704 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


## 3.1. Let's extract some columns from our data

In [18]:
# a single column is extracted by putting its name inside square brackets
df['country']

0       Afghanistan
1       Afghanistan
2       Afghanistan
3       Afghanistan
4       Afghanistan
5       Afghanistan
6       Afghanistan
7       Afghanistan
8       Afghanistan
9       Afghanistan
10      Afghanistan
11      Afghanistan
12          Albania
13          Albania
14          Albania
15          Albania
16          Albania
17          Albania
18          Albania
19          Albania
20          Albania
21          Albania
22          Albania
23          Albania
24          Algeria
25          Algeria
26          Algeria
27          Algeria
28          Algeria
29          Algeria
           ...     
1674    Yemen, Rep.
1675    Yemen, Rep.
1676    Yemen, Rep.
1677    Yemen, Rep.
1678    Yemen, Rep.
1679    Yemen, Rep.
1680         Zambia
1681         Zambia
1682         Zambia
1683         Zambia
1684         Zambia
1685         Zambia
1686         Zambia
1687         Zambia
1688         Zambia
1689         Zambia
1690         Zambia
1691         Zambia
1692       Zimbabwe


In [19]:
# several columns are extracted at once by passing a list of column names;
# note the double brackets: the inner pair is the Python list
df[['country','lifeExp']]

Unnamed: 0,country,lifeExp
0,Afghanistan,28.801
1,Afghanistan,30.332
2,Afghanistan,31.997
3,Afghanistan,34.020
4,Afghanistan,36.088
5,Afghanistan,38.438
6,Afghanistan,39.854
7,Afghanistan,40.822
8,Afghanistan,41.674
9,Afghanistan,41.763


## 3.2. Let's extract some rows from our data

In [20]:
# iloc selects rows by position. Python starts counting from zero,
# so the first row is at position 0
df.iloc[0]

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap        779.445
Name: 0, dtype: object

In [21]:
# the 100th row sits at position 99, because Python starts counting from zero
df.iloc[99]

country      Bangladesh
continent          Asia
year               1967
lifeExp          43.453
pop            62821884
gdpPercap       721.186
Name: 99, dtype: object

In [22]:
# passing a list of positions to iloc selects multiple rows at once
# (here the 1st, 100th and 1000th rows)

df.iloc[[0,99,999]]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
99,Bangladesh,Asia,1967,43.453,62821884,721.186086
999,Mongolia,Asia,1967,51.253,1149500,1226.04113


# 6. Challenge yourself ! 

## 6.1. How did the maximum life expectancy evolve across time AND across countries?

### Hint 1: use the following command to display all rows in a pandas DataFrame

pd.set_option('display.max_rows', None)

### Hint 2: look in the pandas documentation for how to compute the maximum after a groupby operation

### https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html