## Series

A Series is a one-dimensional array with axis labels (an index). It can be created from a list, a dictionary, or an external data source.

In [92]:
import pandas as pd
import numpy as np

Create a series via list and dictionary.  
pd.Series()

In [3]:
# Build a Series from a plain Python list.
fruits = [
    "Apple",
    "Orange",
    "Kiwi",
    "Grape",
    "Blueberry",
    "Watermelon",
]
f = pd.Series(data=fruits)
f

0         Apple
1        Orange
2          Kiwi
3         Grape
4     Blueberry
5    Watermelon
dtype: object

In [4]:
# Build a Series from a dictionary.
games = {
    'Monday': 'Elden Ring',
    'Tuesday': 'FIFA',
    'Wednesday': 'Pokemon',
    'Thursday': 'Zelda',
}
g = pd.Series(data=games)
g
# Note: when a dictionary is used, its keys become the index labels.
# This differs from how a DataFrame treats a dictionary (covered later).

Monday       Elden Ring
Tuesday            FIFA
Wednesday       Pokemon
Thursday          Zelda
dtype: object

## Series Attributes

In [21]:
# Print the index of each series: g was built from a dict, f from a list.
for s in [g,f]:
    print(s.index)

# When a Series is created without index information, pandas uses integer positions as the
# default index labels. Labels can be supplied when the Series is created or assigned afterwards.

Index(['Monday', 'Tuesday', 'Wednesday', 'Thursday'], dtype='object')
RangeIndex(start=0, stop=6, step=1)


In [27]:
# Replace the default integer labels of f with custom labels.
f.index = ['a','b','c','d','e','f']
# The new index must have the same length as the series, otherwise an error is raised.
f

a         Apple
b        Orange
c          Kiwi
d         Grape
e     Blueberry
f    Watermelon
dtype: object

In [13]:
# .values returns the data as a NumPy array, without the index labels.
g.values

array(['Elden Ring', 'FIFA', 'Pokemon', 'Zelda'], dtype=object)

In [14]:
g.dtype
# dtype('O') is the object dtype, which is what this Series of strings reports.
# A DataFrame offers the analogous .dtypes attribute to inspect each column's data type.

dtype('O')

In [28]:
# A Series has no notion of columns, only a name for the whole Series. Like the index,
# the name can be set at creation time or assigned afterwards.
# When Series are combined into a DataFrame, the Series name becomes the corresponding row or column name.
g.name = 'My games'
g.name

'My games'

In [29]:
# Display g: the repr now includes the assigned name.
g

Monday       Elden Ring
Tuesday            FIFA
Wednesday       Pokemon
Thursday          Zelda
Name: My games, dtype: object

In [30]:
# Example of setting both the name and the index labels while creating the Series with pd.Series().
pd.Series(
    data=fruits,
    index=['a', 'b', 'c', 'd', 'e', 'f'],
    name='My Fruit',
)

a         Apple
b        Orange
c          Kiwi
d         Grape
e     Blueberry
f    Watermelon
Name: My Fruit, dtype: object

## Series Methods

In [50]:
# Use pd.read_csv() to import the sample data.
# usecols: which columns pandas imports (a subset of the columns); by default every column is imported.
# index_col: which column becomes the index labels. It must also be listed in usecols.
#
# The path is relative so the notebook does not depend on one user's machine:
# place fortune1000.csv next to the notebook, or point FORTUNE_CSV at its location.
FORTUNE_CSV = 'fortune1000.csv'
fortune = pd.read_csv(
    FORTUNE_CSV,
    usecols=['Revenue', 'Company'],
    index_col='Company',
).squeeze('columns')
# Without squeezing, read_csv returns a DataFrame. Older pandas versions accepted
# squeeze=True inside read_csv(...) to get a Series directly; current versions instead
# call .squeeze('columns') on the result, as done above.

#### The .head() and .tail() Methods  
- 可以快速查询 top 5 rows 和 bottom 5 rows  
- 括号内不传入数字默认为5， 如果想查询最前面的20行，则括号内输入20即可。

In [51]:
# First five rows (no argument means the default of 5).
fortune.head()

Company
Walmart               482130
Exxon Mobil           246204
Apple                 233715
Berkshire Hathaway    210821
McKesson              181241
Name: Revenue, dtype: int64

In [52]:
# Last five rows (no argument means the default of 5).
fortune.tail()

Company
New York Community Bancorp    1902
Portland General Electric     1898
Portland General Electric     1898
Wendy’s                       1896
Briggs & Stratton             1895
Name: Revenue, dtype: int64

#### Descriptive Stats
- .describe() 会总结出常用的统计。 当然也可以专门用特定的method去计算特定的统计量

In [58]:
# Common summary statistics in one call: count, mean, std, min, quartiles and max.
fortune.describe()

count      1000.000000
mean      13535.525000
std       28820.519135
min        1895.000000
25%        2898.250000
50%        5113.000000
75%       11386.000000
max      482130.000000
Name: Revenue, dtype: float64

In [53]:
# Total of all values in the series.
fortune.sum()

13535525

In [54]:
# Mean of the values; matches the 'mean' row of describe().
fortune.mean()

13535.525

In [55]:
# 25th percentile; matches the '25%' row of describe().
fortune.quantile(0.25)

2898.25

In [59]:
# Standard deviation; matches the 'std' row of describe().
fortune.std()

28820.519134890506

In [57]:
# Smallest value; matches the 'min' row of describe().
fortune.min()

1895

In [115]:
# How many rows does the series have? For a Series, shape is a one-element tuple.
fortune.shape

(1000,)

In [116]:
# len() gives the same row count as a plain integer.
len(fortune)

1000

#### Sort the series
- sort_values()
- sort_index()

In [60]:
fortune.sort_values(ascending = True)
# For descending order, pass ascending = False.
# Note: this call returns a sorted result and does not change the original series;
# to change the order of the original series, pass inplace = True.

Company
Briggs & Stratton               1895
Wendy’s                         1896
Portland General Electric       1898
Portland General Electric       1898
New York Community Bancorp      1902
                               ...  
McKesson                      181241
Berkshire Hathaway            210821
Apple                         233715
Exxon Mobil                   246204
Walmart                       482130
Name: Revenue, Length: 1000, dtype: int64

In [62]:
# Sort by index label in descending order, modifying fortune in place (so the cell shows no output).
fortune.sort_index(ascending = False,inplace = True)
# String labels are compared character by character and the comparison is case-sensitive:
# with ascending = False, names starting with a lowercase letter come first and names
# starting with a digit come last, as the next output shows.

In [63]:
# Display fortune: the in-place sort_index call changed the series itself.
fortune

Company
salesforce.com             6667
inVentiv Health            2321
iHeartMedia                6242
hhgregg                    2129
eBay                       9496
                          ...  
A.O. Smith                 2537
A. Schulman                2392
A-Mark Precious Metals     6070
99 Cents Only Stores       1999
3M                        30274
Name: Revenue, Length: 1000, dtype: int64

In [64]:
# Re-sort in place by value, largest first, then display the result.
fortune.sort_values(ascending = False, inplace = True)
fortune

Company
Walmart                       482130
Exxon Mobil                   246204
Apple                         233715
Berkshire Hathaway            210821
McKesson                      181241
                               ...  
New York Community Bancorp      1902
Portland General Electric       1898
Portland General Electric       1898
Wendy’s                         1896
Briggs & Stratton               1895
Name: Revenue, Length: 1000, dtype: int64

#### Extract values from series
- by index position
- by index label

In [67]:
# By index position: rows at positions 2, 3 and 4 (the end of the slice is exclusive).
# .iloc makes it explicit that the integers are positions, not labels.
fortune.iloc[2:5]

Company
Apple                 233715
Berkshire Hathaway    210821
McKesson              181241
Name: Revenue, dtype: int64

In [68]:
# By index position with a list: rows at positions 0, 2 and 4.
# Use .iloc here: fortune is indexed by company name, and passing integers to plain []
# as positions is deprecated in recent pandas (they are to be treated as labels instead).
fortune.iloc[[0, 2, 4]]

Company
Walmart     482130
Apple       233715
McKesson    181241
Name: Revenue, dtype: int64

In [74]:
# By index label: .loc selects the rows whose labels are listed, in the order given.
fortune.loc[
    ['Walmart', "Wendy’s", 'McKesson']
]

Company
Walmart     482130
Wendy’s       1896
McKesson    181241
Name: Revenue, dtype: int64

#### Boolean Masks

In [80]:
# What if I only want to see the companies with revenue larger than 200K?
fortune[fortune>200000]
# The expression inside the square brackets is itself a Series whose values are only True or False
# (a boolean mask); only the rows where it is True are kept.

Company
Walmart               482130
Exxon Mobil           246204
Apple                 233715
Berkshire Hathaway    210821
Name: Revenue, dtype: int64

In [117]:
# The comparison on its own: a boolean Series with the same index as fortune.
fortune>200000

Company
Walmart                        True
Exxon Mobil                    True
Apple                          True
Berkshire Hathaway             True
McKesson                      False
                              ...  
New York Community Bancorp    False
Portland General Electric     False
Portland General Electric     False
Wendy’s                       False
Briggs & Stratton             False
Name: Revenue, Length: 1000, dtype: bool

In [83]:
# What if I only want to see the companies with revenue larger than 200K or revenues smaller than 2K?
fortune[(fortune>200000) | (fortune<2000)]
# Important: when combining boolean masks, the logical operators must be & and |,
# not the Python keywords `and` / `or`. Each comparison is wrapped in its own parentheses.

Company
Walmart                              482130
Exxon Mobil                          246204
Apple                                233715
Berkshire Hathaway                   210821
99 Cents Only Stores                   1999
Roadrunner Transportation Systems      1995
Super Micro Computer                   1991
First Republic Bank                    1989
Hill-Rom Holdings                      1988
Providence Service                     1987
Allison Transmission Holdings          1986
Spire                                  1976
WPX Energy                             1958
Century Aluminum                       1950
Adams Resources & Energy               1944
Nuance Communications                  1931
Primoris Services                      1929
Schnitzer Steel Industries             1924
Delta Tucker Holdings                  1923
Hospitality Properties Trust           1922
Cenveo                                 1921
F5 Networks                            1920
BlueLinx Holdings       

#### Check if the data is unique
- is_unique
- nunique()
- unique()
- duplicated()

In [84]:
# Quick check of whether every value in the series is unique.
fortune.is_unique

False

In [86]:
# Count how many distinct values the series contains.
fortune.nunique()

945

In [93]:
# List the distinct values themselves; the result is a NumPy array without index labels.
fortune.unique()

array([482130, 246204, 233715, 210821, 181241, 157107, 153290, 152356,
       149558, 146801, 140389, 135962, 131620, 131118, 116199, 110359,
       109830, 107006, 103444, 103355, 102531, 101752, 101006,  96114,
        93580,  93056,  90033,  88519,  88275,  87169,  82461,  81824,
        79157,  78756,  75697,  74989,  74510,  73785,  70074,  69951,
        67702,  64566,  63491,  63056,  61047,  60337,  59074,  58363,
        58327,  57119,  55355,  54289,  52465,  49161,  48851,  48778,
        48681,  47453,  47011,  46132,  45891,  44294,  43591,  43026,
        42126,  41373,  40990,  40704,  40222,  40204,  39745,  39498,
        39450,  39208,  38581,  38243,  38226,  37897,  37876,  37864,
        35653,  35181,  34693,  34582,  34441,  32639,  32619,  31469,
        30945,  30935,  30601,  30380,  30274,  29636,  29447,  28987,
        28863,  28150,  28118,  28111,  27940,  27925,  27079,  27028,
        26800,  26794,  26528,  26380,  25413,  25281,  25146,  25098,
      

In [89]:
# Show which entries are duplicates: duplicated() yields a boolean mask, used here to filter
# the series. NOTE(review): with the default keep='first', the first occurrence of a value
# is not flagged, only the later repeats.
fortune[fortune.duplicated()]

Company
Ameren                            6098
Barnes & Noble                    6070
Regions Financial                 5674
SanDisk                           5565
Host Hotels & Resorts             5387
Energy Future Holdings            5370
NiSource                          5308
First American Financial          5176
Robert Half International         5095
Activision Blizzard               4664
Electronic Arts                   4515
Bemis                             4071
Ingles Markets                    3779
Towers Watson                     3645
Sprouts Farmers Market            3593
CalAtlantic Group                 3540
Meritor                           3506
Tailored Brands                   3496
TreeHouse Foods                   3206
American Water Works              3159
Noble Energy                      3133
American National Insurance       3018
Northern Tier Energy              3002
Service Corp. International       2986
Sabre                             2986
Green Plains     

In [96]:
# Show the distinct values together with their index labels.
# '~' means NOT: it inverts the boolean mask, turning True into False and False into True.
# The result has 945 rows, the same number reported by nunique().
fortune[~fortune.duplicated()]

Company
Walmart                       482130
Exxon Mobil                   246204
Apple                         233715
Berkshire Hathaway            210821
McKesson                      181241
                               ...  
EP Energy                       1908
New York Community Bancorp      1902
Portland General Electric       1898
Wendy’s                         1896
Briggs & Stratton               1895
Name: Revenue, Length: 945, dtype: int64

#### Check and fill in Null value
- isnull()
- dropna()
- fillna()


In [98]:
# Let's revisit the fruits list and add a null value (np.nan) to it.
fruits = [
    "Apple",
    "Orange",
    "Kiwi",
    "Grape",
    "Blueberry",
    np.nan,
    "Watermelon",
]
fruits_index = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]
s = pd.Series(data=fruits, index=fruits_index)

In [99]:
# Display s: the missing Saturday entry shows as NaN.
s

Monday            Apple
Tuesday          Orange
Wednesday          Kiwi
Thursday          Grape
Friday        Blueberry
Saturday            NaN
Sunday       Watermelon
dtype: object

In [100]:
# Returns a boolean Series (like duplicated() does): True where the value is null.
s.isnull()

Monday       False
Tuesday      False
Wednesday    False
Thursday     False
Friday       False
Saturday      True
Sunday       False
dtype: bool

In [109]:
# Booleans count as 0 and 1, so summing the mask gives the number of null values.
s.isnull().sum()

1

In [102]:
# Remove NA values from the series. Note: s itself is unchanged by this call;
# to make the change take effect on the original series, add inplace = True.
s.dropna()

Monday            Apple
Tuesday          Orange
Wednesday          Kiwi
Thursday          Grape
Friday        Blueberry
Sunday       Watermelon
dtype: object

In [104]:
# fillna replaces null values with a given value.
s.fillna(value = 'Peach')

Monday            Apple
Tuesday          Orange
Wednesday          Kiwi
Thursday          Grape
Friday        Blueberry
Saturday          Peach
Sunday       Watermelon
dtype: object

In [105]:
# Backward fill: replace each null value with the next valid value.
# Older code wrote this as s.fillna(method='bfill'); the `method` argument of fillna
# is deprecated in recent pandas, and bfill() is the direct equivalent.
s.bfill()

Monday            Apple
Tuesday          Orange
Wednesday          Kiwi
Thursday          Grape
Friday        Blueberry
Saturday     Watermelon
Sunday       Watermelon
dtype: object

In [106]:
# Forward fill: replace each null value with the previous valid value.
# Older code wrote this as s.fillna(method='ffill'); the `method` argument of fillna
# is deprecated in recent pandas, and ffill() is the direct equivalent.
s.ffill()
# As with the other fill calls, s itself is unchanged; to make the change take effect
# on the original series, add inplace = True.

Monday            Apple
Tuesday          Orange
Wednesday          Kiwi
Thursday          Grape
Friday        Blueberry
Saturday      Blueberry
Sunday       Watermelon
dtype: object

#### Set Series Value

In [107]:
# Use the index label to locate the value to replace, then assign with '='.
s['Monday'] = 'Taro'

In [108]:
# Display s: Monday now holds 'Taro'.
s

Monday             Taro
Tuesday          Orange
Wednesday          Kiwi
Thursday          Grape
Friday        Blueberry
Saturday            NaN
Sunday       Watermelon
dtype: object

In [110]:
# Assignment by label also overwrites an entry that was null (Saturday was NaN).
s['Saturday'] = 'It was a null value'

In [111]:
# Display s: Saturday now holds the assigned string instead of NaN.
s

Monday                      Taro
Tuesday                   Orange
Wednesday                   Kiwi
Thursday                   Grape
Friday                 Blueberry
Saturday     It was a null value
Sunday                Watermelon
dtype: object