In [43]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
#releases removed the old names, so use whichever spelling this install provides
plt.style.use('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white')

### Introduction to Pandas and DataFrames

The Pandas library provides an important data structure for organizing our data.  This object is called a `DataFrame`, and it will be central to working with real data.  To begin, we create a DataFrame and examine some simple operations on it. 

In [44]:
#build a small example DataFrame with three columns (Year, Population, GDP),
#one list entry per row
df = pd.DataFrame({
    'Year': [2017, 2007, 1997, 1987],
    'Population': [343, 320, 287, 254],
    'GDP': [4.3, 3.8, 2.9, 2.5],
})

In [45]:
#Examine the first few rows of data
df.head()

Unnamed: 0,GDP,Population,Year
0,4.3,343,2017
1,3.8,320,2007
2,2.9,287,1997
3,2.5,254,1987


In [46]:
#examine a specific column of data by selecting it by name
#(the output below lists the values with the column's name and dtype)
df['GDP']

0    4.3
1    3.8
2    2.9
3    2.5
Name: GDP, dtype: float64

In [47]:
#Scatter plot drawn directly from two DataFrame columns,
#using an explicit figure/axes pair
fig, ax = plt.subplots()
ax.scatter(df['GDP'], df['Population'])
ax.set_title("GDP vs. Population\nan introductory example", loc = 'right', fontsize = 14)
ax.set_xlabel("Gross Domestic Product")
ax.set_ylabel("Population")

<IPython.core.display.Javascript object>

Text(0,0.5,'Population')

In [6]:
#find the line of best fit for Population as a function of GDP;
#a degree-1 fit returns the two coefficients as [slope, intercept]
np.polyfit(df['GDP'], df['Population'], 1)

array([  46.70776819,  143.36128237])

In [48]:
#keep the two coefficients of the degree-1 fit for later use
slope, intercept = np.polyfit(x = df['GDP'], y = df['Population'], deg = 1)

In [49]:
#evaluate the fitted line on 1000 evenly spaced points spanning the observed GDP range
gdp = df['GDP']
x = np.linspace(gdp.min(), gdp.max(), 1000)
y = slope*x + intercept

In [50]:
#draw the fitted line as a dashed curve on the current figure
#(presumably the GDP vs. Population scatter plot created above)
plt.plot(x, y, '--')

[<matplotlib.lines.Line2D at 0x1a1b37a0f0>]

### Loading Data Files

Usually, we will be creating DataFrames from real datasets.  We will use the gapminder data as our example here.  Make sure you have a folder named `data` and have placed the file `gapminder_all.csv` in this folder.  We will read the file in and examine the first few rows as before.

In [51]:
#read the gapminder CSV into a DataFrame; note that this rebinds `df`,
#replacing the small example DataFrame built earlier
df = pd.read_csv('data/gapminder_all.csv')

In [52]:
#examine the first few rows of the gapminder data
df.head()

Unnamed: 0,continent,country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,...,pop_1962,pop_1967,pop_1972,pop_1977,pop_1982,pop_1987,pop_1992,pop_1997,pop_2002,pop_2007
0,Africa,Algeria,2449.008185,3013.976023,2550.81688,3246.991771,4182.663766,4910.416756,5745.160213,5681.358539,...,11000948.0,12760499.0,14760787.0,17152804.0,20033753.0,23254956.0,26298373.0,29072015.0,31287142,33333216
1,Africa,Angola,3520.610273,3827.940465,4269.276742,5522.776375,5473.288005,3008.647355,2756.953672,2430.208311,...,4826015.0,5247469.0,5894858.0,6162675.0,7016384.0,7874230.0,8735988.0,9875024.0,10866106,12420476
2,Africa,Benin,1062.7522,959.60108,949.499064,1035.831411,1085.796879,1029.161251,1277.897616,1225.85601,...,2151895.0,2427334.0,2761407.0,3168267.0,3641603.0,4243788.0,4981671.0,6066080.0,7026113,8078314
3,Africa,Botswana,851.241141,918.232535,983.653976,1214.709294,2263.611114,3214.857818,4551.14215,6205.88385,...,512764.0,553541.0,619351.0,781472.0,970347.0,1151184.0,1342614.0,1536536.0,1630347,1639131
4,Africa,Burkina Faso,543.255241,617.183465,722.512021,794.82656,854.735976,743.387037,807.198586,912.063142,...,4919632.0,5127935.0,5433886.0,5889574.0,6634596.0,7586551.0,8878303.0,10352843.0,12251209,14326203


In [53]:
#list all column names (the head() preview elides the middle columns with '...')
df.columns

Index(['continent', 'country', 'gdpPercap_1952', 'gdpPercap_1957',
       'gdpPercap_1962', 'gdpPercap_1967', 'gdpPercap_1972', 'gdpPercap_1977',
       'gdpPercap_1982', 'gdpPercap_1987', 'gdpPercap_1992', 'gdpPercap_1997',
       'gdpPercap_2002', 'gdpPercap_2007', 'lifeExp_1952', 'lifeExp_1957',
       'lifeExp_1962', 'lifeExp_1967', 'lifeExp_1972', 'lifeExp_1977',
       'lifeExp_1982', 'lifeExp_1987', 'lifeExp_1992', 'lifeExp_1997',
       'lifeExp_2002', 'lifeExp_2007', 'pop_1952', 'pop_1957', 'pop_1962',
       'pop_1967', 'pop_1972', 'pop_1977', 'pop_1982', 'pop_1987', 'pop_1992',
       'pop_1997', 'pop_2002', 'pop_2007'],
      dtype='object')

In [54]:
#side-by-side scatter plots for 1952 and 1997;
#marker size (s) is the population divided by one million
fig, (ax_1952, ax_1997) = plt.subplots(1, 2, figsize = (12, 5))

ax_1952.scatter(df['gdpPercap_1952'], df['lifeExp_1952'], s = df['pop_1952']/10**6)
ax_1952.set_title("Gapminder Data from 1952\nGross Domestic Product vs. Life Expectancy", loc = 'left')

ax_1997.scatter(df['gdpPercap_1997'], df['lifeExp_1997'], s = df['pop_1997']/10**6)
ax_1997.set_title("Gapminder Data from 1997\nGross Domestic Product vs. Life Expectancy", loc = 'left')

<IPython.core.display.Javascript object>

Text(0,1,'Gapminder Data from 1997\nGross Domestic Product vs. Life Expectancy')

In [55]:
#same two panels as before, now with a logarithmic x-axis on each;
#marker size (s) is the population divided by one million
fig, (ax_1952, ax_1997) = plt.subplots(1, 2, figsize = (12, 5))

ax_1952.scatter(df['gdpPercap_1952'], df['lifeExp_1952'], s = df['pop_1952']/10**6)
ax_1952.set_xscale('log')
ax_1952.set_title("Gapminder Data from 1952\nGross Domestic Product vs. Life Expectancy", loc = 'left')

ax_1997.scatter(df['gdpPercap_1997'], df['lifeExp_1997'], s = df['pop_1997']/10**6)
ax_1997.set_xscale('log')
ax_1997.set_title("Gapminder Data from 1997\nGross Domestic Product vs. Life Expectancy", loc = 'left')

<IPython.core.display.Javascript object>

Text(0,1,'Gapminder Data from 1997\nGross Domestic Product vs. Life Expectancy')

In [30]:
#get summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,...,pop_1962,pop_1967,pop_1972,pop_1977,pop_1982,pop_1987,pop_1992,pop_1997,pop_2002,pop_2007
count,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,...,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0
mean,3725.276046,4299.408345,4725.812342,5483.653047,6770.082815,7313.166421,7518.901673,7900.920218,8158.608521,9090.175363,...,20421010.0,22658300.0,25189980.0,27676380.0,30207300.0,33038570.0,35990920.0,38839470.0,41457590.0,44021220.0
std,9321.064786,9869.662202,8667.362525,8095.315431,10614.383403,8362.48915,7733.845006,8288.281304,9031.84608,10171.493263,...,69788650.0,78375480.0,88646820.0,97481090.0,105098600.0,114756200.0,124502600.0,133417400.0,140848300.0,147621400.0
min,298.846212,335.997115,355.203227,349.0,357.0,371.0,424.0,385.0,347.0,312.188423,...,65345.0,70787.0,76595.0,86796.0,98593.0,110812.0,125911.0,145608.0,170372.0,199579.0
25%,864.752389,930.540819,1059.149171,1151.245103,1257.193853,1357.257252,1363.338985,1327.469823,1270.660958,1366.837958,...,1784362.0,2034768.0,2351192.0,2759717.0,3006286.0,3194990.0,3605992.0,3770150.0,4173506.0,4508034.0
50%,1968.528344,2173.220291,2335.439533,2678.334741,3339.129407,3798.609244,4216.228428,4280.300366,4386.085502,4781.825478,...,4686040.0,5170176.0,5877996.0,6404036.0,7007320.0,7774862.0,8688686.0,9735064.0,10372920.0,10517530.0
75%,3913.492777,4876.356362,5709.381428,7075.932943,9508.839303,11204.102423,12347.953723,11994.052795,10684.35187,12022.867188,...,10980080.0,12614580.0,14679200.0,16670230.0,18407320.0,20947540.0,22705380.0,24311370.0,26545560.0,31210040.0
max,108382.3529,113523.1329,95458.11176,80894.88326,109347.867,59265.47714,33693.17525,31540.9748,34932.91959,41283.16433,...,665770000.0,754550000.0,862030000.0,943455000.0,1000281000.0,1084035000.0,1164970000.0,1230075000.0,1280400000.0,1318683000.0


In [56]:
#plot the 2007 data with a logarithmic x-axis;
#marker size (s) is the population divided by one million

plt.figure()
plt.scatter(df['gdpPercap_2007'], df['lifeExp_2007'], s = df['pop_2007']/10**6)
plt.xscale('log')
plt.xlabel("GDP per capita (log scale)")
plt.ylabel("Life expectancy")
#the title used to say 1952, but every column plotted in this cell is from 2007
plt.title("Gapminder Data from 2007\nGross Domestic Product vs. Life Expectancy", loc = 'left')


<IPython.core.display.Javascript object>

Text(0,1,'Gapminder Data from 1952\nGross Domestic Product vs. Life Expectancy')

In [58]:
#add a column holding the natural log of 2007 GDP per capita and regress
#life expectancy on it, rather than relying on a logarithmic axis
from scipy import stats

df["LogGDP_2007"] = np.log(df['gdpPercap_2007'])
fit = stats.linregress(df["LogGDP_2007"], df["lifeExp_2007"])
m, b, r, p, stderr = fit

def mod(x):
    """Evaluate the fitted regression line, m*x + b, at x."""
    return m*x + b

print(
    "The slope is ", m,
    "\nThe Intercept is ", b,
    "\nand the r value is", r,
    "\np value is, ", p,
)

The slope is  7.20280157101 
The Intercept is  4.94961165106 
and the r value is 0.808980251485 
p value is,  4.11537000056e-34


In [59]:
#scatter of the log-transformed data with the fitted regression line on top
fig, ax = plt.subplots()
log_gdp = df["LogGDP_2007"]
ax.scatter(log_gdp, df["lifeExp_2007"], s = df["pop_2007"]/10**6, label = '')
ax.plot(log_gdp, mod(log_gdp), '-r', label = "Line of Best Fit")
ax.legend(loc = 'best', frameon = False)
ax.set_title("Transformed Data and its Line of Best Fit")

<IPython.core.display.Javascript object>

Text(0.5,1,'Transformed Data and its Line of Best Fit')

In [36]:
#keep only the rows whose continent is Africa
africa = df.loc[df["continent"] == "Africa"]

In [60]:
#preview the Africa-only rows
africa.head()

Unnamed: 0,continent,country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,...,pop_1967,pop_1972,pop_1977,pop_1982,pop_1987,pop_1992,pop_1997,pop_2002,pop_2007,LogGDP_2007
0,Africa,Algeria,2449.008185,3013.976023,2550.81688,3246.991771,4182.663766,4910.416756,5745.160213,5681.358539,...,12760499.0,14760787.0,17152804.0,20033753.0,23254956.0,26298373.0,29072015.0,31287142,33333216,8.736066
1,Africa,Angola,3520.610273,3827.940465,4269.276742,5522.776375,5473.288005,3008.647355,2756.953672,2430.208311,...,5247469.0,5894858.0,6162675.0,7016384.0,7874230.0,8735988.0,9875024.0,10866106,12420476,8.475794
2,Africa,Benin,1062.7522,959.60108,949.499064,1035.831411,1085.796879,1029.161251,1277.897616,1225.85601,...,2427334.0,2761407.0,3168267.0,3641603.0,4243788.0,4981671.0,6066080.0,7026113,8078314,7.27329
3,Africa,Botswana,851.241141,918.232535,983.653976,1214.709294,2263.611114,3214.857818,4551.14215,6205.88385,...,553541.0,619351.0,781472.0,970347.0,1151184.0,1342614.0,1536536.0,1630347,1639131,9.439057
4,Africa,Burkina Faso,543.255241,617.183465,722.512021,794.82656,854.735976,743.387037,807.198586,912.063142,...,5127935.0,5433886.0,5889574.0,6634596.0,7586551.0,8878303.0,10352843.0,12251209,14326203,7.104171


In [38]:
#now we can do the same thing for the other continents, or we could create a DataFrame
#with just the three continents using appropriate filters

In [39]:
#histogram of the log-transformed 2007 GDP values for the African countries
fig, ax = plt.subplots()
ax.hist(africa["LogGDP_2007"], color = 'grey')
ax.set_title("Histogram of African Countries GDP 2007(logarithm)", loc = 'left')

<IPython.core.display.Javascript object>

Text(0,1,'Histogram of African Countries GDP 2007(logarithm)')

In [41]:
import seaborn as sns

In [42]:
#joint plot of log GDP against life expectancy for all countries,
#using seaborn's "reg" (regression) kind
sns.jointplot(x="LogGDP_2007", y="lifeExp_2007", data = df, kind = "reg")

<IPython.core.display.Javascript object>

<seaborn.axisgrid.JointGrid at 0x1a197fa9b0>