In [1]:
# The usual preamble
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt

# Let pandas print many columns side by side without wrapping or truncating.
# Needed on pandas 0.12 only; pandas 0.13 does not require it.
pd.options.display.width = 5000
pd.options.display.max_columns = 60

# Default size applied to every matplotlib figure created in this notebook.
figure_size = (15, 5)
plt.rcParams['figure.figsize'] = figure_size

In [2]:
# Load the country table and report how many columns and rows it has.
df = pd.read_csv('countries.csv')
row_count, col_count = df.shape
print("Cols", col_count)
print("Rows", row_count)

Cols 5
Rows 5


In [3]:
# Smallest value of each column (reduction runs down the rows).
df.min(axis='index')

Name        Brazilia
People      36503097
Area          301338
BIP             1529
Currency         CAD
dtype: object

In [4]:
# Largest value of each column (reduction runs down the rows).
df.max(axis='index')

Name            Japan
People      208360000
Area          9984670
BIP              4938
Currency          YEN
dtype: object

In [5]:
# Mean of every numeric column.
# numeric_only=True is spelled out because "Name" and "Currency" hold text:
# pandas >= 2.0 no longer drops such columns silently and raises a TypeError
# when asked to average them.
df.mean(axis=0, numeric_only=True)

People    102786293.6
Area        3907399.6
BIP            2716.2
dtype: float64

In [6]:
# Display the final four rows of the data frame.
df.iloc[-4:]

Unnamed: 0,Name,People,Area,BIP,Currency
1,Japan,126045000,377835,4938,YEN
2,Canada,36503097,9984670,1529,CAD
3,Italy,60501718,301338,1850,EUR
4,Brazilia,208360000,8515770,1798,REAL


In [7]:
# Show every row whose country uses "EUR" as its currency.
uses_euro = df['Currency'] == "EUR"
df.loc[uses_euro]

Unnamed: 0,Name,People,Area,BIP,Currency
0,Germany,82521653,357385,3466,EUR
3,Italy,60501718,301338,1850,EUR


In [8]:
# Build a separate data frame that keeps only the "Name" and "Currency" columns.
new_df = df[['Name', 'Currency']].copy()
new_df

Unnamed: 0,Name,Currency
0,Germany,EUR
1,Japan,YEN
2,Canada,CAD
3,Italy,EUR
4,Brazilia,REAL


In [9]:
# Keep only the countries whose BIP is greater than 2000.
high_bip = df['BIP'] > 2000
df.loc[high_bip]

Unnamed: 0,Name,People,Area,BIP,Currency
0,Germany,82521653,357385,3466,EUR
1,Japan,126045000,377835,4938,YEN


In [10]:
# Countries with more than 50 million and fewer than 150 million inhabitants
# (both bounds are exclusive).
above_lower = df['People'] > 50000000
below_upper = df['People'] < 150000000
df.loc[above_lower & below_upper]

Unnamed: 0,Name,People,Area,BIP,Currency
0,Germany,82521653,357385,3466,EUR
1,Japan,126045000,377835,4938,YEN
3,Italy,60501718,301338,1850,EUR


In [11]:
# Rename the column "BIP" to "Bip".
# rename() returns a new frame; rebinding `df` to it avoids inplace=True, which
# brings no performance benefit and prevents method chaining.
df = df.rename(columns={'BIP': 'Bip'})
df

Unnamed: 0,Name,People,Area,Bip,Currency
0,Germany,82521653,357385,3466,EUR
1,Japan,126045000,377835,4938,YEN
2,Canada,36503097,9984670,1529,CAD
3,Italy,60501718,301338,1850,EUR
4,Brazilia,208360000,8515770,1798,REAL


In [12]:
# Sum of the "Bip" column over all rows.
# Call the pandas reduction directly: handing the Python builtin `sum` to
# aggregate() is deprecated since pandas 2.1 and emits a FutureWarning.
df[['Bip']].sum()


Bip    13581
dtype: int64

In [13]:
# Average number of inhabitants across all countries.
population = df[['People']]
population.mean()

People    102786293.6
dtype: float64

In [14]:
# Sort the rows alphabetically by "Name".
# sort_values() returns a new frame; rebinding `df` to it avoids inplace=True,
# which brings no performance benefit and prevents method chaining.
df = df.sort_values('Name')
df

Unnamed: 0,Name,People,Area,Bip,Currency
4,Brazilia,208360000,8515770,1798,REAL
2,Canada,36503097,9984670,1529,CAD
0,Germany,82521653,357385,3466,EUR
3,Italy,60501718,301338,1850,EUR
1,Japan,126045000,377835,4938,YEN


In [15]:
# Create a new data frame from the original in which "Area" is replaced by a
# size label: "BIG" for an area > 1000000 and "SMALL" for an area <= 1000000.
df2 = df.copy()

# Evaluate the threshold once, while "Area" still holds numbers. The previous
# approach wrote the string placeholders "1"/"0" into the numeric column and
# re-cast the mixed column, an incompatible-dtype assignment that pandas >= 2.1
# deprecates and that only worked because "1" casts to a value below the
# threshold.
is_big = df2['Area'].astype(int) > 1000000

# Give every row the default label, then overwrite the big countries.
df2['Area'] = "SMALL"
df2.loc[is_big, 'Area'] = "BIG"

df2

Unnamed: 0,Name,People,Area,Bip,Currency
4,Brazilia,208360000,BIG,1798,REAL
2,Canada,36503097,BIG,1529,CAD
0,Germany,82521653,SMALL,3466,EUR
3,Italy,60501718,SMALL,1850,EUR
1,Japan,126045000,SMALL,4938,YEN
