In [1]:
from __future__ import print_function

In [3]:
import pandas as pd
pd.__version__

In [8]:
# pandas 中的主要数据结构被实现为以下两类：
# DataFrame，您可以将它想象成一个关系型数据表格，其中包含多个行和已命名的列。
# Series，它是单一列。DataFrame 中包含一个或多个 Series，每个 Series 均有一个名称。

In [10]:
#构建series对象
city_names=pd.Series(['SAN F','SAN J','SACR'])
population=pd.Series([123,234,236])

In [11]:
#创建DataFrame对象
pd.DataFrame({'CITY NAME':city_names, 'POPULATION':population})

Unnamed: 0,CITY NAME,POPULATION
0,SAN F,123
1,SAN J,234
2,SACR,236


In [12]:
#将整个csv文件加载到DataFrame中
california_housing_dataframe = pd.read_csv("https://download.mlcc.google.cn/mledu-datasets/california_housing_train.csv", sep=",")

In [13]:
#显示统计信息
california_housing_dataframe.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.562108,35.625225,28.589353,2643.664412,539.410824,1429.573941,501.221941,3.883578,207300.912353
std,2.005166,2.13734,12.586937,2179.947071,421.499452,1147.852959,384.520841,1.908157,115983.764387
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.79,33.93,18.0,1462.0,297.0,790.0,282.0,2.566375,119400.0
50%,-118.49,34.25,29.0,2127.0,434.0,1167.0,409.0,3.5446,180400.0
75%,-118.0,37.72,37.0,3151.25,648.25,1721.0,605.25,4.767,265000.0
max,-114.31,41.95,52.0,37937.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [14]:
#显示前几个记录
california_housing_dataframe.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [20]:
#了解一个列的中值分布
california_housing_dataframe.hist('housing_median_age')

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x10ecd5d30>]], dtype=object)

In [21]:
#用Python dict/list 指令访问 DataFrame 数据
cities = pd.DataFrame({ 'City name': city_names, 'Population': population })
print(type(cities['City name']))
cities['City name']

<class 'pandas.core.series.Series'>


0    SAN F
1    SAN J
2     SACR
Name: City name, dtype: object

In [22]:
print(type(cities['City name'][1]))
cities['City name'][1]

<class 'str'>


'SAN J'

In [23]:
print(type(cities[0:2]))
cities[0:2]

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,City name,Population
0,SAN F,123
1,SAN J,234


In [26]:
population / 1000.
pd.DataFrame({'CITY NAME':city_names, 'POPULATION':population})

Unnamed: 0,CITY NAME,POPULATION
0,SAN F,123
1,SAN J,234
2,SACR,236


In [28]:
import numpy as np 
np.log(population)

0    4.812184
1    5.455321
2    5.463832
dtype: float64

In [31]:
#Series.apply 将以参数形式接受 lambda 函数，而该函数会应用于每个值。
#lambda就是一个小函数
population.apply(lambda val: val > 200)

0    False
1     True
2     True
dtype: bool

In [33]:
#添加series
cities['Area square miles'] = pd.Series([46.87, 176.53, 97.92])
cities['Population density'] = cities['Population'] / cities['Area square miles']
cities

Unnamed: 0,City name,Population,Area square miles,Population density
0,SAN F,123,46.87,2.62428
1,SAN J,234,176.53,1.325554
2,SACR,236,97.92,2.410131


In [34]:
#练习一
cities['Is wide and has saint name'] = (cities['Area square miles'] > 50) & cities['City name'].apply(lambda name: name.startswith('San'))
cities

Unnamed: 0,City name,Population,Area square miles,Population density,Is wide and has saint name
0,SAN F,123,46.87,2.62428,False
1,SAN J,234,176.53,1.325554,False
2,SACR,236,97.92,2.410131,False


In [35]:
#在构造时，pandas 会赋可反映源数据顺序的索引值。索引值在创建后是稳定的；也就是说，它们不会因为数据重新排序而发生改变。
city_names.index

RangeIndex(start=0, stop=3, step=1)

In [36]:
cities.index

RangeIndex(start=0, stop=3, step=1)

In [37]:
#手动重新排列各行的顺序
cities.reindex([2, 0, 1])

Unnamed: 0,City name,Population,Area square miles,Population density,Is wide and has saint name
2,SACR,236,97.92,2.410131,False
0,SAN F,123,46.87,2.62428,False
1,SAN J,234,176.53,1.325554,False


In [44]:
#重建索引是一种随机排列 DataFrame 的绝佳方式。在下面的示例中，我们会取用类似数组的索引，
#然后将其传递至 NumPy 的 random.permutation 函数，该函数会随机排列其值的位置。
#如果使用此重新随机排列的数组调用 reindex，会导致 DataFrame 行以同样的方式随机排列。 尝试多次运行以下单元格！
cities.reindex(np.random.permutation(cities.index))

Unnamed: 0,City name,Population,Area square miles,Population density,Is wide and has saint name
0,SAN F,123,46.87,2.62428,False
1,SAN J,234,176.53,1.325554,False
2,SACR,236,97.92,2.410131,False


In [46]:
#如果reindex 输入数组包含原始 DataFrame 索引值中没有的值，reindex 会为此类“丢失的”索引添加新行，并在所有对应列中填充 NaN 值
cities.reindex([0, 4, 5, 2])

Unnamed: 0,City name,Population,Area square miles,Population density,Is wide and has saint name
0,SAN F,123.0,46.87,2.62428,False
4,,,,,
5,,,,,
2,SACR,236.0,97.92,2.410131,False


In [None]:
#这种行为是可取的，因为索引通常是从实际数据中提取的字符串（请参阅 pandas reindex 文档，查看索引值是浏览器名称的示例）。
#在这种情况下，如果允许出现“丢失的”索引，您将可以轻松使用外部列表重建索引，因为您不必担心会将输入清理掉。