### Import packages

In [1]:
import pandas as pd 
import numpy as np  
import matplotlib.pyplot as plt  
import seaborn as sns

In [2]:
# Render matplotlib figures inline, directly beneath the cell that draws them.
%matplotlib inline

### Read the GDP CSV and view the first 6 rows of data

In [3]:
# Load the GDP-per-capita export into a DataFrame.
# The path uses forward slashes on purpose: the previous '..\data\...' form sat in a
# non-raw string, so '\d' and '\g' were invalid escape sequences (a warning on
# current Python versions) and the path only resolved on Windows. '/' is accepted
# by pandas on every platform, Windows included.
gdp_df = pd.read_csv('../data/gdp_percapita.csv')

In [4]:
# Preview the first six records to check that the file parsed into the expected columns.
gdp_df.head(6)

Unnamed: 0,Country or Area,Year,Value,Value Footnotes
0,Afghanistan,2018,1734.723214,
1,Afghanistan,2017,1758.465636,
2,Afghanistan,2016,1757.02349,
3,Afghanistan,2015,1766.593077,
4,Afghanistan,2014,1795.735834,
5,Afghanistan,2013,1807.762344,


### Repeat the above steps for internet_use.csv

In [5]:
# Load the internet-usage export into a DataFrame.
# - Forward slashes replace the '..\data\...' path, whose '\d' and '\i' were invalid
#   escape sequences in a non-raw string and which only resolved on Windows.
# - `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
#   `on_bad_lines='warn'` (pandas >= 1.3) is the replacement with the same effect:
#   a malformed row (more fields than the header declares) is skipped and reported
#   instead of aborting the read.
internet_df = pd.read_csv('../data/internet_use.csv', on_bad_lines='warn')

b'Skipping line 4675: expected 4 fields, saw 6\n'


In [6]:
# Preview the first six records to check that the file parsed into the expected columns.
internet_df.head(6)

Unnamed: 0,Country or Area,Year,Value,Value Footnotes
0,Afghanistan,2014,6.39,
1,Afghanistan,2013,5.9,
2,Afghanistan,2012,5.454545,
3,Afghanistan,2011,5.0,
4,Afghanistan,2010,4.0,
5,Afghanistan,2009,3.55,


### Look at the shape of each data frame

In [7]:
# (rows, columns). The row count still includes the non-data footnote rows
# that sit at the end of the file.
gdp_df.shape

(6731, 4)

In [8]:
# (rows, columns). The row count still includes the non-data footnote rows
# that sit at the end of the file.
internet_df.shape

(4675, 4)

### Look at the data types for each column

In [9]:
# Inspect the inferred dtypes. Year is read as object rather than an integer type,
# presumably because the footnote rows at the end of the file put text in that column.
gdp_df.dtypes

Country or Area     object
Year                object
Value              float64
Value Footnotes    float64
dtype: object

In [10]:
# Inspect the inferred dtypes. Year is read as object rather than an integer type,
# presumably because the footnote rows at the end of the file put text in that column.
internet_df.dtypes

Country or Area     object
Year                object
Value              float64
Value Footnotes    float64
dtype: object

### Look at the last ten rows of data for each data frame

In [11]:
# Inspect the end of the frame: the export finishes with footnote records
# ('footnoteSeqID' / 'Footnote' and the footnote text) rather than country data.
gdp_df.tail(10)

Unnamed: 0,Country or Area,Year,Value,Value Footnotes
6721,Zimbabwe,1997,3036.422224,
6722,Zimbabwe,1996,2985.856605,
6723,Zimbabwe,1995,2736.486436,
6724,Zimbabwe,1994,2768.309953,
6725,Zimbabwe,1993,2572.870395,
6726,Zimbabwe,1992,2591.007534,
6727,Zimbabwe,1991,2906.272849,
6728,Zimbabwe,1990,2819.549467,
6729,footnoteSeqID,Footnote,,
6730,2,"Excludes South Sudan after July 9, 2011.",,


In [12]:
# Inspect the end of the frame: the last rows are numbered footnote texts,
# not country observations.
internet_df.tail(10)

Unnamed: 0,Country or Area,Year,Value,Value Footnotes
4665,170,Refers to the total population.,,
4666,171,Internet Dial-up customers.,,
4667,172,Population age 16+ using the Internet in the l...,,
4668,173,Population age 16+ using internet in the last ...,,
4669,174,"U.S. Census Bureau, Table 2. Reported Internet...",,
4670,175,Includes individuals 3 years and older,,
4671,176,NTIA/CPS survey.,,
4672,178,Estimated based on Survey's results. Populatio...,,
4673,179,Preliminary. Country estimate.,,
4674,180,The methodology used to estimated the figure f...,,


### Drop 'Value Footnotes' and change column names for both data frames

In [13]:
# Discard the footnote-marker column, leaving country, year and value.
gdp_df = gdp_df.drop(columns='Value Footnotes')

In [14]:
# Discard the footnote-marker column, leaving country, year and value.
internet_df = internet_df.drop(columns='Value Footnotes')

In [15]:
# Relabel the three remaining columns (assigned by position) with short,
# code-friendly names shared with the internet-usage frame where they overlap.
gdp_df.columns = [
    'Country',         # was 'Country or Area'
    'Year',
    'GDP_Per_Capita',  # was 'Value'
]

In [16]:
# Relabel the three remaining columns (assigned by position) with short,
# code-friendly names shared with the GDP frame where they overlap.
internet_df.columns = [
    'Country',             # was 'Country or Area'
    'Year',
    'Internet_Users_Pct',  # was 'Value'
]

In [17]:
# Confirm that the column drop and the new column names took effect.
gdp_df.head()

Unnamed: 0,Country,Year,GDP_Per_Capita
0,Afghanistan,2018,1734.723214
1,Afghanistan,2017,1758.465636
2,Afghanistan,2016,1757.02349
3,Afghanistan,2015,1766.593077
4,Afghanistan,2014,1795.735834


In [18]:
# Confirm that the column drop and the new column names took effect.
internet_df.head()

Unnamed: 0,Country,Year,Internet_Users_Pct
0,Afghanistan,2014,6.39
1,Afghanistan,2013,5.9
2,Afghanistan,2012,5.454545
3,Afghanistan,2011,5.0
4,Afghanistan,2010,4.0


### Merge the two data frames into one

In [19]:
# Inner join: keep only the (Country, Year) pairs present in both frames.
# The keys are spelled out; they are exactly the columns the two frames share,
# which is what the merge falls back to when no `on` is given.
gdp_and_internet_use = gdp_df.merge(
    internet_df,
    on=['Country', 'Year'],
    how='inner',
)

In [24]:
# Renumber the rows 0..n-1 and discard the previous index (drop=True keeps it
# from being added back as a column).
# NOTE(review): a column-on-column merge should already return a fresh integer
# index, so this is presumably a safeguard - confirm whether it is still needed.
gdp_and_internet_use = gdp_and_internet_use.reset_index(drop = True)

In [25]:
# Remove footnote records that survived the merge.
# gdp_df ends with footnote rows whose Year column holds text ('Footnote', or the
# footnote sentence) instead of a year; the row previously removed by its hard-coded
# label 3519 is presumably the footnote header row matching in both frames - TODO confirm.
# Filtering on content rather than on a fixed label means that:
#   * re-running the cell is harmless (dropping a missing label raises KeyError),
#   * the right rows are removed even if the input files gain or lose records.
# Year itself is left untouched; the numeric conversion is only used as a mask.
year_is_numeric = pd.to_numeric(gdp_and_internet_use['Year'], errors='coerce').notna()
gdp_and_internet_use = gdp_and_internet_use[year_is_numeric]

In [26]:
# Check the end of the merged frame: the last rows should be country
# observations, with no footnote records left.
gdp_and_internet_use.tail()

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
3514,Zimbabwe,1997,3036.422224,0.03308
3515,Zimbabwe,1996,2985.856605,0.01679
3516,Zimbabwe,1995,2736.486436,0.007684
3517,Zimbabwe,1994,2768.309953,0.001739
3518,Zimbabwe,1990,2819.549467,0.0


In [27]:
# (rows, columns) of the merged, cleaned frame.
gdp_and_internet_use.shape

(3519, 4)