In [1]:
import pandas as pd
import numpy as np

In [2]:
# data frame, a primary way data scientists handle data
# may be created from csv files, querying databases, or explicitly
cindys_array = np.array([['Montgomery','Yellohammer state',52423],
                     ['Sacramento','Golden state',163707],
                     ['Oklahoma City','Sooner state',69960 ]])

df = pd.DataFrame(cindys_array)
df

Unnamed: 0,0,1,2
0,Montgomery,Yellohammer state,52423
1,Sacramento,Golden state,163707
2,Oklahoma City,Sooner state,69960


In [3]:
df.columns = ['Capital', 'Nickname', 'Area']
df.index = ['Alabama', 'California', 'Oklahoma']
df

Unnamed: 0,Capital,Nickname,Area
Alabama,Montgomery,Yellohammer state,52423
California,Sacramento,Golden state,163707
Oklahoma,Oklahoma City,Sooner state,69960


In [4]:
df2 = pd.DataFrame(
    cindys_array,
    columns=['Capital', 'Nickname', 'Area'],
    index=['Alabama', 'California', 'Oklahoma'])
    
df2

Unnamed: 0,Capital,Nickname,Area
Alabama,Montgomery,Yellohammer state,52423
California,Sacramento,Golden state,163707
Oklahoma,Oklahoma City,Sooner state,69960


In [5]:
# row names
names = ['George',
        'John',
        'Thomas',
        'James',
        'Andrew',
        'Martin',
        'William',
        'Zachary',
        'Millard',
        'Franklin']

# create an empty df with named rows
purchases = pd.DataFrame(index=names)

# add our columns to the df one at a time
purchases['country'] = ['US', 'CAN', 'CAN', 'US', 'CAN', 'US', 'US', 'US', 'CAN', 'US']
purchases['ad_views'] = [16, 42, 32, 13, 63, 19, 65, 23, 16, 77]
purchases['items_purchased'] = [2, 1, 0, 8, 0, 5, 7, 3, 0, 5]
purchases 

Unnamed: 0,country,ad_views,items_purchased
George,US,16,2
John,CAN,42,1
Thomas,CAN,32,0
James,US,13,8
Andrew,CAN,63,0
Martin,US,19,5
William,US,65,7
Zachary,US,23,3
Millard,CAN,16,0
Franklin,US,77,5


In [6]:
purchases.country

George       US
John        CAN
Thomas      CAN
James        US
Andrew      CAN
Martin       US
William      US
Zachary      US
Millard     CAN
Franklin     US
Name: country, dtype: object

In [7]:
# this and the cell above are equivalent, though bracket notation favored
# not sure why this doesn't work with 'names'
purchases['country']

George       US
John        CAN
Thomas      CAN
James        US
Andrew      CAN
Martin       US
William      US
Zachary      US
Millard     CAN
Franklin     US
Name: country, dtype: object

In [8]:
purchases['items_purch_per_ad'] = purchases['items_purchased'] / purchases['ad_views']
purchases

Unnamed: 0,country,ad_views,items_purchased,items_purch_per_ad
George,US,16,2,0.125
John,CAN,42,1,0.02381
Thomas,CAN,32,0,0.0
James,US,13,8,0.615385
Andrew,CAN,63,0,0.0
Martin,US,19,5,0.263158
William,US,65,7,0.107692
Zachary,US,23,3,0.130435
Millard,CAN,16,0,0.0
Franklin,US,77,5,0.064935


In [9]:
purchases['items_purchased'] / purchases['ad_views']

George      0.125000
John        0.023810
Thomas      0.000000
James       0.615385
Andrew      0.000000
Martin      0.263158
William     0.107692
Zachary     0.130435
Millard     0.000000
Franklin    0.064935
dtype: float64

## Selecting and grouping

In [10]:
purchases['country']

George       US
John        CAN
Thomas      CAN
James        US
Andrew      CAN
Martin       US
William      US
Zachary      US
Millard     CAN
Franklin     US
Name: country, dtype: object

In [11]:
# .loc indexes over rows and columns
purchases.loc['George']

country                  US
ad_views                 16
items_purchased           2
items_purch_per_ad    0.125
Name: George, dtype: object

In [12]:
purchases.loc[:, 'country'] # the : works as a list or string slice does

George       US
John        CAN
Thomas      CAN
James        US
Andrew      CAN
Martin       US
William      US
Zachary      US
Millard     CAN
Franklin     US
Name: country, dtype: object

In [13]:
purchases.loc['George', 'country']

'US'

In [14]:
# .iloc does integer indexing [index, column]
purchases.iloc[1:3, 1]

John      42
Thomas    32
Name: ad_views, dtype: int64

In [15]:
purchases.loc[lambda df: purchases['items_purchased'] > 1, :]

Unnamed: 0,country,ad_views,items_purchased,items_purch_per_ad
George,US,16,2,0.125
James,US,13,8,0.615385
Martin,US,19,5,0.263158
William,US,65,7,0.107692
Zachary,US,23,3,0.130435
Franklin,US,77,5,0.064935


In [16]:
purchases[purchases['items_purchased'] > 1]

Unnamed: 0,country,ad_views,items_purchased,items_purch_per_ad
George,US,16,2,0.125
James,US,13,8,0.615385
Martin,US,19,5,0.263158
William,US,65,7,0.107692
Zachary,US,23,3,0.130435
Franklin,US,77,5,0.064935


In [17]:
purchases.groupby('country')

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x00000201C9C6FB70>

In [18]:
purchases.groupby('country').aggregate(np.mean)

Unnamed: 0_level_0,ad_views,items_purchased,items_purch_per_ad
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CAN,38.25,0.25,0.005952
US,35.5,5.0,0.217767


In [20]:
# getting the mean of a single column, as opposed to all as above
purchases.groupby('country')['ad_views'].mean()

country
CAN    38.25
US     35.50
Name: ad_views, dtype: float64

# Working with files in pandas

In [21]:
df = pd.read_csv('purchases.csv')
print(df)

  Unnamed: 0 country  ad_views  items_purchased
0     George      US        16                2
1       John     CAN        42                1
2     Thomas     CAN        32                0
3      James      US        13                8
4     Andrew     CAN        63                0
5     Martin      US        19                5
6    William      US        65                7
7    Zachary      US        23                3
8    Millard     CAN        16                0
9   Franklin      US        77                5


In [22]:
df

Unnamed: 0.1,Unnamed: 0,country,ad_views,items_purchased
0,George,US,16,2
1,John,CAN,42,1
2,Thomas,CAN,32,0
3,James,US,13,8
4,Andrew,CAN,63,0
5,Martin,US,19,5
6,William,US,65,7
7,Zachary,US,23,3
8,Millard,CAN,16,0
9,Franklin,US,77,5


In [23]:
# copies data to new file 'my_data.csv'
df.to_csv('my_data.csv')

In [25]:
df = pd.read_json('purchases.json')
print(df)

  Unnamed: 0 country  ad_views  items_purchased
0     George      US        16                2
1       John     CAN        42                1
2     Thomas     CAN        32                0
3      James      US        13                8
4     Andrew     CAN        63                0
5     Martin      US        19                5
6    William      US        65                7
7    Zachary      US        23                3
8    Millard     CAN        16                0
9   Franklin      US        77                5


In [26]:
df.to_json('my_data.json')

In [27]:
serialized_purchases = df.to_json() # ?

In [29]:
pd.read_json('my_data.json')

Unnamed: 0.1,Unnamed: 0,country,ad_views,items_purchased
0,George,US,16,2
1,John,CAN,42,1
2,Thomas,CAN,32,0
3,James,US,13,8
4,Andrew,CAN,63,0
5,Martin,US,19,5
6,William,US,65,7
7,Zachary,US,23,3
8,Millard,CAN,16,0
9,Franklin,US,77,5


In [30]:
# Pandas doesn't have an equivalent read_xml, so will have to import a library
import xml.etree.ElementTree as ET

In [31]:
# loads and parses through the XML file into a 'tree'?
tree = ET.parse('purchases.xml')

In [32]:
# finding the root of the tree
# node of the start of iteration
root = tree.getroot()

In [35]:
# this fxn is specific to this XML file; will need to iterate differently in
# different XML structures
def xml_to_list(root):
    result = []
    for row in root:
        row_list = []
        for column in row:
            row_list.append(column.text)
        result.append(row_list)
    return result

In [37]:
df = pd.DataFrame(xml_to_list(root))
print(df)

          0    1   2  3
0    George   US  16  2
1      John  CAN  42  1
2    Thomas  CAN  32  0
3     James   US  13  8
4    Andrew  CAN  63  0
5    Martin   US  19  5
6   William   US  65  7
7   Zachary   US  23  3
8   Millard  CAN  16  0
9  Franklin   US  77  5


#### File formats
- often will not get to choose how we receive it, may have to deal with XML files or CSV or JSON
- preference to avoid XML if writing data yourself
- CSV ideal, though JSON better in situations needing semi-structure

#### Opening files in general in Python

In [38]:
with open('poem.txt') as poem_file:
    text = poem_file.readlines()
    print("This file is {} lines long".format(len(text)))
    for line in text:
        print(line)

This file is 19 lines long
Beautiful is better than ugly.

Explicit is better than implicit.

Simple is better than complex.

Complex is better than complicated.

Flat is better than nested.

Sparse is better than dense.

Readability counts.

Special cases aren't special enough to break the rules.

Although practicality beats purity.

Errors should never pass silently.

Unless explicitly silenced.

In the face of ambiguity, refuse the temptation to guess.

There should be one-- and preferably only one --obvious way to do it.

Although that way may not be obvious at first unless you're Dutch.

Now is better than never.

Although never is often better than *right* now.

If the implementation is hard to explain, it's a bad idea.

If the implementation is easy to explain, it may be a good idea.

Namespaces are one honking great idea -- let's do more of those!


In [42]:
# LOVE the with statement!! with this, Python makes it ok to forget close()
# after the open() statements, that would otherwise clog up resources
# the with statement auto-closes file after the statement exits
with open('purchases.csv') as purchases:
    text = purchases.readlines()
    print("This file is {} lines long".format(len(text)))
    for line in text:
        print(line)

This file is 11 lines long
,country,ad_views,items_purchased

George,US,16,2

John,CAN,42,1

Thomas,CAN,32,0

James,US,13,8

Andrew,CAN,63,0

Martin,US,19,5

William,US,65,7

Zachary,US,23,3

Millard,CAN,16,0

Franklin,US,77,5
