# 利用Python进行数据分析

McKinney著  
机械工业出版社  
2014年1月第1版  

本书代码及数据集下载地址：[GitHub - wesm/pydata-book: Materials and IPython notebooks for "Python for Data Analysis" by Wes McKinney, published by O'Reilly Media](https://github.com/wesm/pydata-book)

GitBook地址：[README - 利用Python进行数据分析·第2版](https://seancheney.gitbook.io/python-for-data-analysis-2nd/)

## 第2章 引言

### 1. 普通方法

In [8]:
import os
import json

# Load <parent of the current working directory>/files/example.txt, which is
# parsed as one JSON document per line.
# The ``with`` block closes the handle as soon as parsing finishes (the
# original left it open), and the explicit UTF-8 encoding avoids depending on
# the platform default when decoding the JSON text.
with open(
    os.path.join(os.path.dirname(os.getcwd()), 'files', 'example.txt'),
    encoding='utf-8',
) as usaDataFile:
    records = [json.loads(line) for line in usaDataFile]
# Notebook display value: the first parsed record.
records[0]

{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
 'c': 'US',
 'nk': 1,
 'tz': 'America/New_York',
 'gr': 'MA',
 'g': 'A6qOVH',
 'h': 'wfLQtf',
 'l': 'orofrog',
 'al': 'en-US,en;q=0.8',
 'hh': '1.usa.gov',
 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991',
 't': 1331923247,
 'hc': 1331822918,
 'cy': 'Danvers',
 'll': [42.576698, -70.954903]}

In [9]:
# Collect the 'tz' value of every record that has a 'tz' key; records without
# that key are skipped instead of raising KeyError.
timeZones = [rec['tz'] for rec in records if 'tz' in rec]

In [28]:
# 方法1
def getCount(sequence):
    """Count how many times each item occurs in ``sequence``.

    Args:
        sequence: An iterable of hashable items.

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences, with keys in first-seen order.
    """
    tally = {}
    for item in sequence:
        # ``get`` yields 0 for an item not seen yet, so the first occurrence
        # stores 1 and later ones increment the stored value.
        tally[item] = tally.get(item, 0) + 1
    return tally

# 方法2
def getCount2(sequence):
    """Count occurrences of each item in ``sequence`` using ``dict.setdefault``.

    Args:
        sequence: An iterable of hashable items.

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences, with keys in first-seen order.
    """
    tally = {}
    for item in sequence:
        # ``setdefault`` inserts 0 for a new item and returns the current
        # value either way, so a single assignment covers both cases.
        tally[item] = tally.setdefault(item, 0) + 1
    return tally

# 方法3
from collections import defaultdict
def getCount3(sequence):
    """Count occurrences of each item in ``sequence`` with a ``defaultdict``.

    Args:
        sequence: An iterable of hashable items.

    Returns:
        A ``defaultdict(int)`` mapping each distinct item to its number of
        occurrences. Looking up an item that was never seen yields 0 (and
        inserts it, as ``defaultdict`` does).
    """
    tally = defaultdict(int)
    for item in sequence:
        # A missing key starts at int() == 0, so no existence check is needed.
        tally[item] = tally[item] + 1
    return tally

In [29]:
from collections import Counter
# Tally every time zone string in one call; Counter is the standard-library
# counterpart of the hand-written getCount* helpers.
count = Counter(timeZones)
# Notebook display value: the ten most frequent time zones as
# (time zone, count) pairs, most common first.
count.most_common(10)

[('America/New_York', 1251),
 ('', 521),
 ('America/Chicago', 400),
 ('America/Los_Angeles', 382),
 ('America/Denver', 191),
 ('Europe/London', 74),
 ('Asia/Tokyo', 37),
 ('Pacific/Honolulu', 36),
 ('Europe/Madrid', 35),
 ('America/Sao_Paulo', 33)]

### 2. pandas

In [30]:
from pandas import DataFrame, Series
import pandas as pd
import numpy as np

# Build a DataFrame from the list of parsed JSON records (one dict per record).
frame = DataFrame(records)

In [32]:
# Notebook display value: the first ten entries of the 'tz' column.
frame['tz'][:10]

0     America/New_York
1       America/Denver
2     America/New_York
3    America/Sao_Paulo
4     America/New_York
5     America/New_York
6        Europe/Warsaw
7                     
8                     
9                     
Name: tz, dtype: object