In [1]:
# 本节讲的主要是数据转换相关的工作
# 合并数据集
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series

In [2]:
# 一、移除重复数据
data = DataFrame({'k1':['one'] * 3+ ['two'] *4,
                  'k2':[1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [3]:
data.duplicated() # DataFrame的duplicated方法返回一个布尔型Series、表示各行是否是重复行：默认会判断全部列

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [4]:
data.drop_duplicates()  # 用于返回一个移除了重复行的DataFrame,默认会判断全部列

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [5]:
data['v1'] = range(7) 
print(data)
data.drop_duplicates(['k1'])  #  可以以列表的形式，传入参数，指定部分列进行重复项判断


k1  k2  v1
0  one   1   0
1  one   1   1
2  one   2   2
3  two   3   3
4  two   3   4
5  two   4   5
6  two   4   6


Unnamed: 0,k1,k2,v1
0,one,1,0
3,two,3,3


In [6]:
# drop_duplicates函数和duplicated函数都是默认保留的第一个出现的值的组合，即参数取值为take_last=False
data.drop_duplicates(['k1', 'k2'],keep='last')

Unnamed: 0,k1,k2,v1
1,one,1,1
2,one,2,2
4,two,3,4
6,two,4,6


In [7]:
# 二、利用函数或映射进行数据转换
# 可能会有根据数组、Series或Dataframe列中的值来实现该转换工作

In [8]:
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
                          'Bacon', 'pastrami', 'honey ham', 'nova lox'],
                  'ounces':[4, 3, 12, 6, 7.5, 8, 3, 5,6]
    
                  })
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [9]:
# 要根据映射来添加一列来表示该肉类食物来源的动物类型
meat_to_animal = { 'bacon':'pig','pulled pork':'pig','pastrami':'cow','corned beef':'cow',
                  'honey ham':'pig','nova lox':'salmon'}
meat_to_animal

{'bacon': 'pig',
 'pulled pork': 'pig',
 'pastrami': 'cow',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon'}

In [10]:
# 首先将data的各个值先转为小写
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [11]:
# 也可以传入一个能够完成全部这些工作的函数
data['food'].map(lambda x:meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [12]:
g = lambda x:meat_to_animal[x.lower()]  # lambda的用法
g('bacon')

'pig'

In [13]:
# 三、替换值
# 1.fillna方法提供缺失值填充，可以看做值替换一种特殊的情况。
# 2.replace方法可以进行替换

In [14]:
data = Series([1.,-999.,2.,-999.,-1000.,3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [15]:
data.replace(-999,np.nan)


0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [16]:
data.replace([-999.,-1000.], np.nan)  # 一次性替换多个值

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [17]:
data.replace([-999,-1000],[np.nan, 0])  # 对不同的值进行不同的替换

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [18]:
# 传入的参数也可以是字典类型
data.replace({-999: np.nan, -1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [19]:
# 重命名轴索引
data = DataFrame(np.arange(12).reshape((3,4)),index=['Ohio','Colorade','New York']                 ,columns=['one','two','three','four'] )
data


Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorade,4,5,6,7
New York,8,9,10,11


In [20]:
# 跟series一样，轴标签也有一个map方法：
data.index.map(str.upper)


Index(['OHIO', 'COLORADE', 'NEW YORK'], dtype='object')

In [21]:
data.index = data.index.map(str.upper)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADE,4,5,6,7
NEW YORK,8,9,10,11


In [22]:
# 如果 想要创建数据集的转换版，而不是修改原始数据，比较实用的方法是rename
data.rename(index=str.title,columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorade,4,5,6,7
New York,8,9,10,11


In [23]:
#  另外需要说明一下的是，rename方法可以结合字典型对象实现对部分轴标签的更新
data.rename(index={'OHIO':'INDIANA'},columns={'three':'peekaboo'})
# rename帮我们实现了：复制DataFrame并对其索引和列标签进行赋值

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLORADE,4,5,6,7
NEW YORK,8,9,10,11


In [24]:
# 如果希望就地修改某个数据集，传入inplace=True就可以
_ = data.rename(index={'OHIO':'INDIANA'},inplace=False)
data


Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADE,4,5,6,7
NEW YORK,8,9,10,11


In [25]:
_ = data.rename(index={'OHIO':'INDIANA'},inplace=True)
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLORADE,4,5,6,7
NEW YORK,8,9,10,11


In [26]:
# 离散化和面元（bin）划分
# 连续数据常常被离散化或拆分为“面元”bin
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45,41, 32]
bins = [18,25,35,60,100]
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [27]:
# 返回的是一个特殊的Categorical对象，可以看做一组表示面元名称的字符串。含有一个不同分类名称的codes数# 组以及一个为年龄数据进行标号的categories属性  
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [28]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [29]:
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [30]:
pd.cut(ages, [18, 26, 36,61,100], right=False) # 哪边是闭端可以通过right=False进行修改


[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [31]:
# 也可以设置面元名称，将labels选项设置为一个列表或数组即可
group_names = ['Youth','YoungAdult','MiddleAged','Senior']
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [32]:
# 也可以不传入如何区分面元而只传入面元的数量，会自动进行分组
data = np.random.rand(20)
pd.cut(data,4 ,precision=2 )

[(0.071, 0.3], (0.071, 0.3], (0.74, 0.97], (0.3, 0.52], (0.74, 0.97], ..., (0.52, 0.74], (0.74, 0.97], (0.52, 0.74], (0.3, 0.52], (0.3, 0.52]]
Length: 20
Categories (4, interval[float64]): [(0.071, 0.3] < (0.3, 0.52] < (0.52, 0.74] < (0.74, 0.97]]

In [33]:
# qcut是一个类似于cut的函数，可以根据样本分位数对数据进行面元划分。根据数据的分布情况cut可能无法使各个面元中含有相同数量的数据点，而qcut由于使用样本分位数，可以得到大小相等的面元
data = np.random.rand(1000)
cats = pd.qcut(data,4)  # 按照四分位数进行分割
cats

[(0.224, 0.483], (0.731, 0.998], (0.224, 0.483], (-0.0007000000000000001, 0.224], (0.483, 0.731], ..., (0.731, 0.998], (-0.0007000000000000001, 0.224], (0.483, 0.731], (0.731, 0.998], (0.224, 0.483]]
Length: 1000
Categories (4, interval[float64]): [(-0.0007000000000000001, 0.224] < (0.224, 0.483] < (0.483, 0.731] < (0.731, 0.998]]

In [34]:
pd.value_counts(cats)

(0.731, 0.998]                     250
(0.483, 0.731]                     250
(0.224, 0.483]                     250
(-0.0007000000000000001, 0.224]    250
dtype: int64

In [35]:
# 跟cut类似，也可以设定自定义的分位数
pd.qcut(data, [0, 0.1, 0.5,0.9, 1])

[(0.0868, 0.483], (0.483, 0.899], (0.0868, 0.483], (-0.0007000000000000001, 0.0868], (0.483, 0.899], ..., (0.483, 0.899], (0.0868, 0.483], (0.483, 0.899], (0.483, 0.899], (0.0868, 0.483]]
Length: 1000
Categories (4, interval[float64]): [(-0.0007000000000000001, 0.0868] < (0.0868, 0.483] < (0.483, 0.899] < (0.899, 0.998]]

In [36]:
# 检测和过滤异常值
# 异常值的过滤和变换运算在很大程度上其实就是数组运算
np.random.seed(12345)    # 设置随机数种子，保证第二次运行这个代码也是完全相同的结果，
data = DataFrame(np.random.randn(1000,4))
data.head()

Unnamed: 0,0,1,2,3
0,-0.204708,0.478943,-0.519439,-0.55573
1,1.965781,1.393406,0.092908,0.281746
2,0.769023,1.246435,1.007189,-1.296221
3,0.274992,0.228913,1.352917,0.886429
4,-2.001637,-0.371843,1.669025,-0.43857


In [37]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.366626,2.653656,3.260383,3.927528


In [38]:
# 找到某列中绝对值大小超过3的值
col = data[3]
col.head()

0   -0.555730
1    0.281746
2   -1.296221
3    0.886429
4   -0.438570
Name: 3, dtype: float64

In [39]:
col[np.abs(col)>3]

97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64

In [40]:
# 如果要选出全部含有超过3或-3的行，可以选用布尔型DataFrame以及any方法
data[(np.abs(data)>3).any(1)]

Unnamed: 0,0,1,2,3
5,-0.539741,0.476985,3.248944,-1.021228
97,-0.774363,0.552936,0.106061,3.927528
102,-0.655054,-0.56523,3.176873,0.959533
305,-2.315555,0.457246,-0.025907,-3.399312
324,0.050188,1.951312,3.260383,0.963301
400,0.146326,0.508391,-0.196713,-3.745356
499,-0.293333,-0.242459,-3.05699,1.918403
523,-3.428254,-0.296336,-0.439938,-0.867165
586,0.275144,1.179227,-3.184377,1.369891
808,-0.362528,-3.548824,1.553205,-2.186301


In [41]:
# 下面的代码可以将值限制在区间-3到3以内：
data[np.abs(data)>3] = np.sign(data)*3 # sign是符号函数
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067623,0.068473,0.025153,-0.002081
std,0.995485,0.990253,1.003977,0.989736
min,-3.0,-3.0,-3.0,-3.0
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.0,2.653656,3.0,3.0


In [42]:
# 排列和随机采样 
# 利用numpy.random.permutation函数可以实现对series或DataFrame列的排列工作，通过需要排列的轴的长度
# 调用permutation，可产生一个表示新顺序的整数数组：
df = DataFrame(np.arange(5*4).reshape(5,4))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [43]:
sampler = np.random.permutation(5)
sampler

array([1, 0, 2, 3, 4])

In [44]:
df.take(sampler)

Unnamed: 0,0,1,2,3
1,4,5,6,7
0,0,1,2,3
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [45]:
df.take(np.random.permutation(len(df))[:3])   # 如果不想用替换的方式选取随机子集，可以使用permutation返回的数组切下前k个元素

Unnamed: 0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
4,16,17,18,19


In [46]:
# 如果是要用替换的方式产生样本，最快的方式是通过np.random.randint得到一组随机整数
bag = np.array([5, 7, -1, 6, 4])
bag

array([ 5,  7, -1,  6,  4])

In [47]:
sampler = np.random.randint(0, len(bag), size=10)
sampler

array([4, 4, 2, 2, 2, 0, 3, 0, 4, 1])

In [48]:
drams = bag.take(sampler)
drams

array([ 4,  4, -1, -1, -1,  5,  6,  5,  4,  7])

In [49]:
# 计算指标/哑变量
# 将分类变量（categorical variable）转换为哑变量矩阵（dummy matrix）或指标矩阵（indicator matrix）
df = DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [50]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [51]:
dummies = pd.get_dummies(df['key'], prefix='key')
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [52]:
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy    

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [53]:
# 下面介绍一种新的情况,如果DataFrame中的某行同属于多个分类，则事情就会有点复杂。
names = ['movie_id','title','genres']
movies = pd.read_csv('D:\\work_space_shareit\\study\\My_python_for_data_analyst\\data\\movielens\\movies.dat',sep='::',header=None,names=names,engine='python')
movies[:10]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [54]:
# 如果想要为每个genre添加指标变量的话，就需要做一些数据规整操作。
genre_iter = (set(x.split('|')) for x in movies.genres)
genre_iter

<generator object <genexpr> at 0x0000020777DCC930>

In [55]:
genres = sorted(set.union(*genre_iter))
genres 

['Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [62]:
# 接下来，迭代每一部电影并将dummies各行的项设置为1：（这部分没有调试成功）
""" 
for i,gen in enumerate(movies.genres):
    # print(i, gen)
    dummies.loc[i, gen.split('|')] = 1

# 然后与movies合并起来
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.ix[0]

# 对统计应用有用的秘诀是：结合get_dummies和诸如cut之类的离散化函数
values = np.random.rand(10)
values
bins = [0,0.2, 0.4,0.6,0.8,1]
pd.get_dummies(pd.cut(values, bins))

"""

" \nfor i,gen in enumerate(movies.genres):\n    # print(i, gen)\n    dummies.loc[i, gen.split('|')] = 1\n\n\n"

In [63]:
# 字符串操作 切分
val = 'a,b,  guido'
val.split(',')    # 用法类似sql中的split

['a', 'b', '  guido']

In [66]:
# 字符串操作 strip（修剪空白符，或者是换行符）
pieces = [x.strip() for x in val.split(',')]
pieces

['a', 'b', 'guido']

In [56]:
#