# Pandas Series

In [3]:
import pandas as pd
# Load the Titanic passenger data; the relative path is resolved against the working directory.
titanic = pd.read_csv("titanic.csv")
titanic  # a bare last expression is displayed; long frames are shown truncated

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.2500,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.9250,S,
3,1,1,female,35.0,1,0,53.1000,S,C
4,0,3,male,35.0,0,0,8.0500,S,
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,
887,1,1,female,19.0,0,0,30.0000,S,B
888,0,3,female,,1,2,23.4500,S,
889,1,1,male,26.0,0,0,30.0000,C,C


In [5]:
titanic.info() #dtypes and non-null counts per column; age, embarked and deck have missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
 8   deck      203 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [7]:
titanic["age"] #selecting one column returns its values labelled with the same index labels as the data frame

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [9]:
titanic["age"].equals(titanic.age) #bracket access and attribute access select the same column

True

In [11]:
age = titanic["age"]

In [15]:
# PANDAS SERIES AND DATAFRAMES SHARE MOST OF THE METHODS AND ATTRIBUTES!!

In [13]:
age.head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: age, dtype: float64

In [19]:
age.dtype #data type of the series

dtype('float64')

In [23]:
age.shape #one-element tuple: a Series has only one dimension

(891,)

In [25]:
len(age)

891

In [29]:
age.index

RangeIndex(start=0, stop=891, step=1)

In [33]:
age.to_frame() #this converts a pandas series into a pandas dataframe with only one column!

Unnamed: 0,age
0,22.0
1,38.0
2,26.0
3,35.0
4,35.0
...,...
886,27.0
887,19.0
888,
889,26.0


## Analysing Numerical Series

In [37]:
age.describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: age, dtype: float64

In [39]:
type(age)

pandas.core.series.Series

In [41]:
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [51]:
age.count() #does not include missing values 

714

In [53]:
age.size #includes missing values

891

In [55]:
age.sum() #pandas will ignore N/A when calculating the sum

21205.17

In [59]:
sum(age) #the built-in sum() function does not skip NaN, so the missing values turn the result into nan

nan

In [61]:
age.mean()

29.69911764705882

In [65]:
age.median()

28.0

In [67]:
age.std()

14.526497332334044

In [83]:
age.unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [79]:
len(age.unique()) #number of distinct values; unique() keeps NaN, so the missing value is counted too (age.nunique() excludes it by default)

89

In [89]:
age.value_counts() #the most frequent age is 24, shared by 30 passengers

24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: age, Length: 88, dtype: int64

In [93]:
age.value_counts(sort = False) #by default the result is sorted by count; with sort = False the values keep their order of first appearance

22.00    27
38.00    11
26.00    18
35.00    18
54.00     8
         ..
0.67      1
30.50     2
0.42      1
34.50     1
74.00     1
Name: age, Length: 88, dtype: int64

In [97]:
age.value_counts(dropna = False) #also counts the missing values: NaN gets its own row

NaN      177
24.00     30
22.00     27
18.00     26
28.00     25
        ... 
36.50      1
55.50      1
0.92       1
23.50      1
74.00      1
Name: age, Length: 89, dtype: int64

In [99]:
age.value_counts(ascending = True) #now we go from least frequent to most frequent

74.0     1
14.5     1
70.5     1
12.0     1
36.5     1
        ..
30.0    25
19.0    25
18.0    26
22.0    27
24.0    30
Name: age, Length: 88, dtype: int64

In [101]:
age.value_counts(normalize = True) #relative frequencies instead of counts (shares of the non-missing values)

24.00    0.042017
22.00    0.037815
18.00    0.036415
19.00    0.035014
28.00    0.035014
           ...   
36.50    0.001401
55.50    0.001401
0.92     0.001401
23.50    0.001401
74.00    0.001401
Name: age, Length: 88, dtype: float64

In [138]:
#lets say we want to get the most common age! then we could do...
#count each age, sorted by frequency with the highest count first...
age.value_counts(sort = True).index[0]
#...and take the first index label, which is the most frequent age

24.0

In [103]:
age.value_counts(bins = 5) #groups the ages into 5 intervals and counts the values falling into each

(16.336, 32.252]    346
(32.252, 48.168]    188
(0.339, 16.336]     100
(48.168, 64.084]     69
(64.084, 80.0]       11
Name: age, dtype: int64

In [108]:
type(age.value_counts(normalize = True))
#NOTE THAT value_counts() RETURNS A SERIES AS WELL!

pandas.core.series.Series

In [114]:
series = age.value_counts(bins = 5)
series.index #with bins, the labels of the result are intervals (an IntervalIndex)

IntervalIndex([(16.336, 32.252], (32.252, 48.168], (0.339, 16.336], (48.168, 64.084], (64.084, 80.0]], dtype='interval[float64, right]')

## Series have a single index because they are one-dimensional

![image.png](attachment:7ba989b6-0d15-4267-84f8-1354999e99ee.png)

## DataFrames have a row index and column labels (which act like an index for the columns)

![image.png](attachment:f1048c47-5a02-43c0-94cf-e6945860c9fd.png)

## Analysing non-numerical Series

In [141]:
summer = pd.read_csv("summer.csv")
summer.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver


In [145]:
summer.info() #Country is the only column with missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31165 entries, 0 to 31164
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Year        31165 non-null  int64 
 1   City        31165 non-null  object
 2   Sport       31165 non-null  object
 3   Discipline  31165 non-null  object
 4   Athlete     31165 non-null  object
 5   Country     31161 non-null  object
 6   Gender      31165 non-null  object
 7   Event       31165 non-null  object
 8   Medal       31165 non-null  object
dtypes: int64(1), object(8)
memory usage: 2.1+ MB


In [151]:
athlete = summer["Athlete"]
type(athlete)

pandas.core.series.Series

In [155]:
#Note that describe() gives a different result here than for a numerical series,
#because we are now working with non-numerical data:
#count, number of unique values, most frequent value (top) and its frequency (freq)
athlete.describe()

count               31165
unique              22762
top       PHELPS, Michael
freq                   22
Name: Athlete, dtype: object

In [157]:
#31165 medals but only 22762 unique athletes: this shows us that many athletes won more than 1 medal

In [161]:
athlete.min() #for strings, min() returns the value that comes first in alphabetical (lexicographic) order

'AABYE, Edgar'

In [165]:
len(athlete.unique())

22762

In [169]:
athlete.nunique() #same as len(athlete.unique()) here, because Athlete has no missing values

22762

In [175]:
athlete.value_counts(normalize = True)

PHELPS, Michael          0.000706
LATYNINA, Larisa         0.000578
ANDRIANOV, Nikolay       0.000481
ONO, Takashi             0.000417
MANGIAROTTI, Edoardo     0.000417
                           ...   
ZAKA, Uddin              0.000032
ZAFAR, Hayat             0.000032
MUHAMMAD, Rashid         0.000032
MANNA, Muhammad Afzal    0.000032
LIDBERG, Jimmy           0.000032
Name: Athlete, Length: 22762, dtype: float64

## The Copy Method

In [216]:
# Reload the data set for this section.
titanic = pd.read_csv("titanic.csv")
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [218]:
age = titanic.age #no copy() here: the cells below show that writing through age also changes titanic

In [220]:
#what if we want to play around with the age column but we don't want
#to modify the original dataframe?

In [222]:
age.head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: age, dtype: float64

In [224]:
age.iloc[2] = 29 #age was selected from titanic, so pandas warns (see output) that this write targets data taken from the DataFrame

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age.iloc[2] = 29


In [226]:
age.head()

0    22.0
1    38.0
2    29.0
3    35.0
4    35.0
Name: age, dtype: float64

In [228]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,29.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [230]:
#NOTE HOW, EVEN THOUGH WE ONLY ASSIGNED THROUGH age (A COLUMN TAKEN FROM THE DATAFRAME),
#THE ORIGINAL DATAFRAME CHANGED AS WELL!!! ROW 2: 26-->29
#TO FIX THIS WE NEED TO COPY!

In [232]:
#without copy(), age was not independent from the dataframe;
#age_copy is created with copy(), so changing it does not affect titanic
age_copy = titanic.age.copy()
age_copy.head()

0    22.0
1    38.0
2    29.0
3    35.0
4    35.0
Name: age, dtype: float64

In [234]:
age_copy.iloc[2] = 22 #changes only the copy
age_copy.head()

0    22.0
1    38.0
2    22.0
3    35.0
4    35.0
Name: age, dtype: float64

In [236]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,29.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [238]:
#NOTICE THAT THE DATAFRAME DIDN'T CHANGE THIS TIME: ROW 2 IS STILL 29, NOT 22!!

## Sorting data series

In [241]:
dic = {1:10, 3:35, 2:6, 4:36, 5:2, 6:0, 7:None}
dic

{1: 10, 3: 35, 2: 6, 4: 36, 5: 2, 6: 0, 7: None}

In [243]:
sales = pd.Series(dic) #dict keys become the index labels; the None value becomes NaN, so the dtype is float64
sales

1    10.0
3    35.0
2     6.0
4    36.0
5     2.0
6     0.0
7     NaN
dtype: float64

In [247]:
#sort by index label; this returns a new sorted Series and leaves sales itself unchanged
sales.sort_index(ascending = True)

1    10.0
2     6.0
3    35.0
4    36.0
5     2.0
6     0.0
7     NaN
dtype: float64

In [249]:
#sort_index() returned a new Series in the previous cell, so sales itself is
#still unsorted. Assigning the result back keeps the sorted version; the effect
#is the same as passing inplace = True, but the rebinding is explicit and the
#call can still be used in a method chain.
sales = sales.sort_index(ascending = True)

In [251]:
sales

1    10.0
2     6.0
3    35.0
4    36.0
5     2.0
6     0.0
7     NaN
dtype: float64

In [257]:
#we can also sort our series by values: largest first, with missing values
#placed at the top. The sorted result is assigned back to sales instead of
#mutating it with inplace = True.
sales = sales.sort_values(ascending = False, na_position = "first")
sales

7     NaN
4    36.0
3    35.0
1    10.0
2     6.0
5     2.0
6     0.0
dtype: float64