In [None]:
from IPython.display import YouTubeVideo

# Embed the YouTube player for this video id in the cell output.
video_id = "w0kfPhGau5c"
YouTubeVideo(video_id)



# [Pandas Data Wrangling](http://pandas.pydata.org/)

In [None]:
% matplotlib inline

In [None]:
import os
import sqlite3 as sqlite

# Every data file read below is looked up relative to this directory.
DATADIR = os.path.join(os.pardir, "Resources")
# Bare expression: the cell shows True when the directory is present.
os.path.exists(DATADIR)

In [None]:
import pandas as pd

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import numpy as np

## Reading in NaN/Missing Values, etc.

## Modifying Values 
### ``replace()``


In [None]:
# read_table splits fields on tabs by default.
elevation_path = os.path.join(DATADIR, "elevation.txt")
elevation = pd.read_table(elevation_path)

In [None]:
# Bare expression: the notebook renders the DataFrame as the cell output.
elevation

This data is easy to read, but hard to compute with. First off, we have numeric values except for "Sea level". We can use ``replace`` to replace "Sea level" with "0 feet" to match the other cells.

In [None]:
# replace() returns a new frame; `elevation` itself is not modified here.
elevation.replace(to_replace="Sea level", value="0 feet")


We can also use regular expressions to identify and change data.

In [None]:
import re

# Compiled pattern for the literal unit label "feet"; later cells pass it to
# DataFrame.replace to strip the label from the cells.
r2 = re.compile(r"feet")

In [None]:
# Normalise the text once: "Sea level" becomes "0 feet", then the "feet"
# label is stripped from every cell via the compiled pattern r2.
elevation2 = elevation.replace("Sea level", "0 feet").replace(r2, "")
# Save the cleaned table as tab-separated text so it can be re-read later.
elevation2.to_csv(os.path.join(DATADIR, "elevation2.txt"), sep="\t")
elevation2

## What is the data type of the elevation cells?

In [None]:
# Look up the cell at row label 0 once and report both its value and its type.
first_highest = elevation2.loc[0, "Highest elevation"]
print(first_highest)
print(type(first_highest))


## Can we convert elevations to numeric values?

In [None]:
# DataFrame.convert_objects was removed in pandas 1.0, so asking for its help
# raises AttributeError there; pd.to_numeric is the supported replacement.
help(pd.to_numeric)

In [None]:
# DataFrame.convert_objects(convert_numeric=True) was removed in pandas 1.0.
# The helper below reproduces its behaviour with pd.to_numeric.
def _coerce_numeric(column):
    """Return `column` converted to numbers, with unparseable cells as NaN.

    A column in which nothing parses is returned unchanged, so purely
    textual columns are not turned into all-NaN columns.
    """
    converted = pd.to_numeric(column, errors="coerce")
    return column if converted.isna().all() else converted


# Returns a new frame; elevation2 itself is not modified.
elevation2.apply(_coerce_numeric)

### Pandas didn't know how to deal with "," in numbers

* Two approaches
    1. When reading in numeric values we can specify the ``thousands`` keyword
    2. We can use the [``locale``](https://docs.python.org/3.5/library/locale.html#module-locale) package
* I saved the dataframe where we had replaced ``Sea level`` and ``feet``

In [None]:
# Re-read the saved table, telling the parser that "," is a thousands
# separator, and use the State column as the index.
elevation2_path = os.path.join(DATADIR, "elevation2.txt")
pd.read_table(elevation2_path, thousands=",", index_col='State')

In [None]:
import locale
# Show the locale setting currently in effect (getlocale() reports the
# LC_CTYPE category when called without arguments).
print(locale.getlocale())


In [None]:
# Print the built-in documentation for locale.setlocale.
help(locale.setlocale)

In [None]:
# An empty locale string selects the user's default locale from the
# environment. Until setlocale is called a Python program runs in the
# portable "C" locale, which defines no thousands separator, so
# locale.atof("1,234") would raise ValueError there.
locale.setlocale(locale.LC_NUMERIC, '')


def _parse_localized(value):
    """Parse a locale-formatted number string into a float.

    Non-string values are returned unchanged, so re-running this cell after
    the columns have already been converted does not fail.
    """
    return locale.atof(value) if isinstance(value, str) else value


# Convert each elevation column in place; later cells rely on these columns
# being numeric.
for column in ('Lowest elevation', 'Highest elevation', 'Average elevation'):
    elevation2[column] = elevation2[column].map(_parse_localized)
elevation2

In [None]:
# localeconv() returns a dict describing the active locale's numeric and
# monetary formatting conventions (e.g. 'thousands_sep', 'decimal_point').
conversion = locale.localeconv()
# Bare expression: displayed as the cell output.
locale.getlocale()

## Creating a new column

In [None]:
# Column arithmetic is vectorised: subtracting the two Series computes every
# row at once, with no need for a row-wise apply and lambda.
elevation2['Range elevation'] = (
    elevation2['Highest elevation'] - elevation2['Lowest elevation']
)
elevation2

## Dropping Rows based on values

In [None]:
# Keep only the rows whose highest elevation exceeds 8000; the result is a
# new frame and elevation2 is unchanged.
elevation2.loc[elevation2['Highest elevation'].gt(8000)]

In [None]:
# Load the reports file with default parsing options and display it.
radreports_path = os.path.join(DATADIR, "mimic2_radreports_100.txt")
pd.read_table(radreports_path)

### ``dropna()``

``dropna()`` allows us to drop rows and columns that have **any** or **all** NaN values

In [None]:
# Read the reports, then discard only those rows in which every column is NaN.
raw_reports = pd.read_table(os.path.join(DATADIR, "mimic2_radreports_100.txt"))
raw_reports.dropna(how="all")

In [None]:
# Restrict parsing to the three columns of interest; rows that are NaN in
# all of them are then dropped.
report_columns = ["subject_id", "charttime", "text"]
reports = pd.read_table(
    os.path.join(DATADIR, "mimic2_radreports_100.txt"),
    usecols=report_columns,
).dropna(how="all")

In [None]:
# Preview the first five rows (head()'s default).
reports.head()

In [None]:
# Full text of the first remaining report, selected by position.
reports["text"].iloc[0]