# Lecture08 Some Pandas Operations

In [4]:
# Includes and Standard Magic...
### Standard Magic and startup initializers.

# Load Numpy
import numpy as np
# Load MatPlotLib
import matplotlib
import matplotlib.pyplot as plt
# Load Pandas
import pandas as pd

# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# PdfPages is imported so figures can also be written to a PDF file if wanted.
from matplotlib.backends.backend_pdf import PdfPages
# Apply matplotlib's 'fivethirtyeight' style sheet to every plot made after this point.
matplotlib.style.use('fivethirtyeight')

# Widen the notebook container to 95% of the browser window so that wide
# pandas tables are easier to read.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is a deprecated path.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Let pandas print very large tables before truncating:
# up to 500 rows, up to 500 columns, and lines up to 1000 characters wide.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

## Pandas Essential Functionality

Here are some implemented examples from the slides that show off some of the nice functionality of Pandas.

To set a **hierarchical index**, refer to the [documentation page](https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html).

In [5]:
# Hierarchical indexing, first attempt (the clumsy way): label every value
# with a plain (state, year) tuple.

# One label per state/year combination, states outermost.
index = [(state, year)
         for state in ('California', 'New York', 'Texas')
         for year in (2000, 2010)]

# Population counts, in the same order as the labels above.
populations = [
    33871648, 37253956,  # California: 2000, 2010
    18976457, 19378102,  # New York:   2000, 2010
    20851820, 25145561,  # Texas:      2000, 2010
]

pop = pd.Series(data=populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [6]:
# Slicing between two labels still works in a straightforward way: the slice
# runs from the first tuple label to the second, and (as the output shows)
# both endpoints are included.
pop[('California', 2010):('Texas', 2000)]

(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [7]:
# Selecting every 2010 entry is awkward with tuple labels: we have to walk the
# index ourselves, unpack each (state, year) label, and keep the matching ones.
pop[[(state, year) for state, year in pop.index if year == 2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

In [8]:
# Build the same data as a tidy DataFrame: one row per observation,
# one column per variable (state, year, pop).
df = pd.DataFrame({
    'state': ['California', 'California',
              'New York', 'New York',
              'Texas', 'Texas'],
    'year': [2000, 2010,
             2000, 2010,
             2000, 2010],
    'pop': [33871648, 37253956,
            18976457, 19378102,
            20851820, 25145561],
})
df

Unnamed: 0,state,year,pop
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [9]:
# Move 'state' and 'year' out of the columns and into a two-level
# (hierarchical) row index. The result is reassigned rather than modified with
# `inplace=True`, which keeps the data flow explicit.
df = df.set_index(['state', 'year'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,pop
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [10]:
# And now we can do cool stuff with slicing, though it gets complicated once
# tuples are involved. A single outer-level label selects all rows for that
# state. Note that ('California') is just a parenthesised string, not a tuple,
# so the label is written here as a plain string.
df.loc['California']

Unnamed: 0_level_0,pop
year,Unnamed: 1_level_1
2000,33871648
2010,37253956


In [11]:
# Inspect the row index: it is displayed below as a MultiIndex whose two
# levels are named 'state' and 'year'.
df.index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           names=['state', 'year'])

In [13]:
# Though this can get a bit complicated...
# A cross-section picks every row whose 'year' level equals 2010; that level
# is dropped from the result, leaving the states as the index.
df.xs(key=2010, level='year')

Unnamed: 0_level_0,pop
state,Unnamed: 1_level_1
California,37253956
New York,19378102
Texas,25145561


## Melting (down) Data Example

In [14]:
# Load the religion-by-income table. As the output below shows, the data is in
# "wide" form: one row per religion and one column per income bracket.
# NOTE: this rebinds `df`, replacing the population frame from the section above.
# NOTE(review): the path spells 'religon' -- presumably this matches the file
# on disk; confirm before renaming.
df = pd.read_csv('./data/religon.csv')
df

Unnamed: 0,religion,<$10k,$10-20k,$20-30k,$30-40k,$40-50k,$50-75k
0,Agnostic,27,34,60,81,76,137
1,Atheist,12,27,37,52,35,70
2,Buddhist,27,21,30,34,33,58
3,Catholic,418,617,732,670,638,1116
4,Dont know/refused,15,14,15,11,10,35
5,Evangelical Prot,575,869,1064,982,881,1486
6,Hindu,1,9,7,9,11,34
7,Historically Black Prot,228,244,236,238,197,223
8,Jehovahs Witness,20,27,24,24,21,30
9,Jewish,19,19,25,25,30,95


In [15]:
# Reshape wide -> long: "religion" stays as the identifier column and every
# other column is unpivoted into one (income, freq) row per religion/bracket.
# The rows are then ordered by the income label; the labels are text, so this
# is string order rather than numeric order.
f_df = (
    df.melt(id_vars=["religion"], var_name="income", value_name="freq")
      .sort_values(by=["income"])
)
f_df.head(10)


Unnamed: 0,religion,income,freq
14,Dont know/refused,$10-20k,14
15,Evangelical Prot,$10-20k,869
18,Jehovahs Witness,$10-20k,27
13,Catholic,$10-20k,617
12,Buddhist,$10-20k,21
11,Atheist,$10-20k,27
10,Agnostic,$10-20k,34
19,Jewish,$10-20k,19
17,Historically Black Prot,$10-20k,244
16,Hindu,$10-20k,9


In [21]:
# Keep only the rows for one religion and list them ordered by income label.
f_df.loc[f_df['religion'].eq('Atheist')].sort_values(by='income')

Unnamed: 0,religion,income,freq
11,Atheist,$10-20k,27
41,Atheist,$40-50k,35
1,Atheist,<$10k,12
21,Atheist,$20-30k,37
31,Atheist,$30-40k,52
51,Atheist,$50-75k,70


## More Complicated Example.

In [None]:
# Load the Billboard chart data and preview the first rows.
# The cells below treat it as "wide" data: a handful of identifier columns
# plus further columns that get melted into week/rank pairs.
b_df = pd.read_csv("./data/billboard.csv")
b_df.head()

In [None]:
# Columns that identify a track; these are carried through unchanged.
id_vars = [
    "year", "artist.inverted", "track", "time",
    "genre", "date.entered", "date.peaked",
]

# Unpivot every remaining column: its name goes into "week" and its value
# into "rank", giving one row per track per original column.
b_df = b_df.melt(id_vars=id_vars, var_name="week", value_name="rank")


In [None]:
# Show the first 20 melted rows, then the column dtypes (the last expression
# in the cell) to see what still needs cleaning.
display(b_df.head(20))
b_df.dtypes

In [None]:
# Let's fix the week column: after the melt it still holds the original wide
# column names, so keep only the digits and store them as integers.
# The pattern is a raw string so that `\d` reaches the regex engine untouched
# instead of being parsed as an (invalid) string escape sequence.
b_df["week"] = b_df["week"].str.extract(r'(\d+)', expand=False).astype(int)
# Why not ints? Gotcha! Presumably "rank" contains missing values (NaN) for
# weeks without a chart position, which a plain int column cannot hold, so it
# is cast to float instead -- TODO confirm against the data.
b_df["rank"] = b_df["rank"].astype(float)


In [None]:
#b_df['date.entered']

In [None]:
# Clean out unnecessary rows: drop every row that has a missing value in any
# column.
# NOTE(review): presumably the goal is to remove weeks without a rank;
# `dropna(subset=["rank"])` would say that more narrowly -- confirm that no
# other column has gaps before changing it.
b_df = b_df.dropna()

# Create the "date" column:
#   date = (date entered chart) + (week - 1) weeks
# Week 1 is the entry date itself, hence the "- 1" (fence-post problem).
# `.assign` returns a new frame instead of writing into the `dropna` result,
# and 'W' is the documented unit name for weeks.
b_df = b_df.assign(
    date=pd.to_datetime(b_df['date.entered'])
    + pd.to_timedelta(b_df['week'] - 1, unit='W')
)

In [None]:
# Preview the first 10 rows to check the newly added "date" column.
b_df.head(10)

In [None]:
# Keep only the columns we still need (equivalent to dropping the redundant,
# messy ones), then order the rows by track and week.
b_df = (
    b_df
    .loc[:, ["year", "artist.inverted", "track", "time",
             "genre", "week", "rank", "date"]]
    .sort_values(by=["year", "artist.inverted", "track", "week", "rank"],
                 ascending=True)
)

# Keep the tidy dataset under a second name for later use.
# (This is another reference to the same frame, not a copy.)
billboard = b_df

b_df.head(10)


In [None]:
# List the distinct values that appear in the "genre" column.
billboard['genre'].unique()