In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Make plots show in notebook without having to use plt.show()
%matplotlib inline


In [2]:
# Load the GDP-per-capita table and preview its first six rows.
gdp_df = pd.read_csv('gdp_percapita.csv')
print(gdp_df.head(n=6))

  Country or Area  Year        Value  Value Footnotes
0     Afghanistan  2016  1802.695566              NaN
1     Afghanistan  2015  1809.016488              NaN
2     Afghanistan  2014  1838.960244              NaN
3     Afghanistan  2013  1848.700026              NaN
4     Afghanistan  2012  1839.273579              NaN
5     Afghanistan  2011  1660.739856              NaN


In [3]:
# Load the internet-usage table and preview its first six rows.
internet_df = pd.read_csv('internet_use.csv')
print(internet_df.head(n=6))

  Country or Area  Year     Value  Value Footnotes
0     Afghanistan  2014  6.390000              NaN
1     Afghanistan  2013  5.900000              NaN
2     Afghanistan  2012  5.454545              NaN
3     Afghanistan  2011  5.000000              NaN
4     Afghanistan  2010  4.000000              NaN
5     Afghanistan  2009  3.550000              NaN


In [4]:
# (rows, columns) of the GDP frame, then of the internet-usage frame,
# one tuple per output line.
print(gdp_df.shape, internet_df.shape, sep='\n')

(6206, 4)
(4495, 4)


In [5]:
# Column dtypes and non-null counts for each frame.
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() only appends a stray "None" line after each report.
gdp_df.info()
internet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6206 entries, 0 to 6205
Data columns (total 4 columns):
Country or Area    6206 non-null object
Year               6206 non-null int64
Value              6206 non-null float64
Value Footnotes    1 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 194.0+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4495 entries, 0 to 4494
Data columns (total 4 columns):
Country or Area    4495 non-null object
Year               4495 non-null int64
Value              4495 non-null float64
Value Footnotes    948 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 140.5+ KB
None


In [6]:
# Preview the last ten rows of the GDP frame.
print(gdp_df.tail(n=10))

     Country or Area  Year        Value  Value Footnotes
6196        Zimbabwe  1999  2699.857521              NaN
6197        Zimbabwe  1998  2761.318537              NaN
6198        Zimbabwe  1997  2725.888701              NaN
6199        Zimbabwe  1996  2698.917300              NaN
6200        Zimbabwe  1995  2488.298028              NaN
6201        Zimbabwe  1994  2529.826671              NaN
6202        Zimbabwe  1993  2360.793284              NaN
6203        Zimbabwe  1992  2384.972026              NaN
6204        Zimbabwe  1991  2681.495089              NaN
6205        Zimbabwe  1990  2605.794944              NaN


In [7]:
# Preview the last ten rows of the internet-usage frame.
print(internet_df.tail(n=10))

     Country or Area  Year     Value  Value Footnotes
4485        Zimbabwe  2002  3.994356              NaN
4486        Zimbabwe  2001  0.799846              NaN
4487        Zimbabwe  2000  0.401434              NaN
4488        Zimbabwe  1999  0.161676              NaN
4489        Zimbabwe  1998  0.081648              NaN
4490        Zimbabwe  1997  0.033080              NaN
4491        Zimbabwe  1996  0.016790              NaN
4492        Zimbabwe  1995  0.007684              NaN
4493        Zimbabwe  1994  0.001739              NaN
4494        Zimbabwe  1990  0.000000              NaN


In [8]:
# Drop the 'Value Footnotes' column from both frames.
# errors='ignore' keeps this cell re-runnable: each frame is rebound to its
# own name, so on a second run the column is already gone and a plain drop()
# would raise KeyError.
gdp_df = gdp_df.drop('Value Footnotes', axis=1, errors='ignore')
internet_df = internet_df.drop('Value Footnotes', axis=1, errors='ignore')

In [9]:
# Report dtypes and non-null counts for both frames after the column drop.
for frame in (gdp_df, internet_df):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6206 entries, 0 to 6205
Data columns (total 3 columns):
Country or Area    6206 non-null object
Year               6206 non-null int64
Value              6206 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 145.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4495 entries, 0 to 4494
Data columns (total 3 columns):
Country or Area    4495 non-null object
Year               4495 non-null int64
Value              4495 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 105.4+ KB


In [11]:
# Give both frames short, analysis-friendly column names.
# Renaming by name (rather than assigning a positional list to .columns) cannot
# silently mislabel a column if the column order or count ever differs, and it
# is safe to re-run: keys that are no longer present are simply ignored.
# 'Year' already has the desired name, so it is not listed.
gdp_df = gdp_df.rename(
    columns={'Country or Area': 'Country', 'Value': 'GDP_Per_Capita'}
)
internet_df = internet_df.rename(
    columns={'Country or Area': 'Country', 'Value': 'Internet_Users_Pct'}
)

In [21]:
# Outer-join the two frames so that a (Country, Year) pair present in either
# source is kept; the measure missing on one side becomes NaN.
# The join keys are spelled out instead of relying on "all shared column
# names", so the merge can never silently key on a value column if the two
# frames happen to share another column name.
gdp_and_internet_use = pd.merge(
    gdp_df,
    internet_df,
    on=['Country', 'Year'],
    how='outer',
)

In [22]:
# Preview the first five rows of the merged frame.
gdp_and_internet_use.head(n=5)

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
0,Afghanistan,2016,1802.695566,
1,Afghanistan,2015,1809.016488,
2,Afghanistan,2014,1838.960244,6.39
3,Afghanistan,2013,1848.700026,5.9
4,Afghanistan,2012,1839.273579,5.454545


In [23]:
# Preview the last five rows of the merged frame.
gdp_and_internet_use.tail(n=5)

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
7172,Virgin Islands (U.S.),1997,,6.948369
7173,Virgin Islands (U.S.),1996,,4.647186
7174,Virgin Islands (U.S.),1995,,2.801958
7175,Virgin Islands (U.S.),1994,,0.940645
7176,Virgin Islands (U.S.),1990,,0.0


In [41]:
# Keep only the rows for the three years of interest.
# Series.isin() expresses the membership test directly instead of OR-ing one
# equality comparison per year, so adding or removing a year is a one-token edit.
mask = gdp_and_internet_use['Year'].isin([2004, 2009, 2014])
gdp_and_internet_use_pruned = gdp_and_internet_use[mask]

In [47]:
# Sanity check: the pruned frame should contain only 2004, 2009 and 2014.
gdp_and_internet_use_pruned['Year'].unique()

array([2014, 2009, 2004])

In [54]:
# Split the pruned frame into one sub-frame per year. A generator is unpacked
# straight into the three names so no helper variable is left in the namespace.
df2014, df2009, df2004 = (
    gdp_and_internet_use_pruned.loc[gdp_and_internet_use_pruned['Year'].eq(year)]
    for year in (2014, 2009, 2004)
)

In [69]:
# Row of the 2014 frame with the largest Internet_Users_Pct value, i.e. the
# country with the highest share of internet users that year.
df2014.loc[df2014['Internet_Users_Pct'].idxmax()]

Country               Iceland
Year                     2014
GDP_Per_Capita        41701.1
Internet_Users_Pct      98.16
Name: 2332, dtype: object