# Series

In [2]:
import pandas as pd

## Simple series

In [3]:
# Build a Series from a plain list; the values are passed explicitly as `data`.
pd.Series(data=["Matt", "Simon", "Jen"])

0     Matt
1    Simon
2      Jen
dtype: object

## Getting a value

In [4]:
series = pd.Series(data=["Matt", "Simon", "Jen"])
# Explicit label-based lookup: with the default integer index, label 0 is the first entry.
series.loc[0]

'Matt'

## Getting the index of a value

In [5]:
series = pd.Series(data=["Matt", "Simon", "Jen"])
# Keep only the rows equal to "Jen", then read the index label of the first match.
series.loc[series.eq("Jen")].index[0]

2

### We can also swap the values and the indices and look the value up that way

In [6]:
series = pd.Series(data=["Matt", "Simon", "Jen"])
# Invert the mapping: the names become the index, the original index labels become the values.
lookup = pd.Series(data=series.index, index=series)
lookup.loc["Jen"]

2

## Operators are applied by index label, not by position

In [7]:
# Same labels in both series, but listed in opposite order.
data1 = pd.Series({"a": 5, "b": 2, "c": 3, "d": 7})
data2 = pd.Series({"d": 10, "c": 11, "b": 12, "a": 13})

# Values are paired up by index label before being added.
data1.add(data2)

a    18
b    14
c    14
d    17
dtype: int64

## Sorting

### Ascending

In [8]:
data1 = pd.Series({"a": 5, "b": 2, "c": 3, "d": 7})
# Smallest value first; each value keeps its original label.
data1.sort_values(ascending=True)

b    2
c    3
a    5
d    7
dtype: int64

### Descending

In [9]:
data1 = pd.Series({"a": 5, "b": 2, "c": 3, "d": 7})
# Largest value first; each value keeps its original label.
data1.sort_values(ascending=False)

d    7
a    5
c    3
b    2
dtype: int64

## Value Counts

### Raw counts

In [10]:
data = pd.Series(data=["m", "f", "m", "f", "f"])
# Absolute number of occurrences per distinct value.
data.value_counts(normalize=False)

f    3
m    2
dtype: int64

### Percentages

In [11]:
data = pd.Series(data=["m", "f", "m", "f", "f"])
# normalize=True returns each value's share of the total instead of its raw count.
data.value_counts(normalize=True, sort=True)

f    0.6
m    0.4
dtype: float64

### Determining the most common value

In [12]:
data = pd.Series(data=["m", "f", "m", "f", "f"])
# mode() returns a Series (there can be several modes); take its first entry.
data.mode().iloc[0]

'f'

### Counting unique values

In [13]:
data = pd.Series(data=["m", "f", "m", "f", "f"])
# Number of distinct values; missing values are excluded (the default, spelled out).
data.nunique(dropna=True)

2

### Casting boolean to int

In [14]:
data = pd.Series(data=[True, False, True])
# True becomes 1 and False becomes 0.
data.astype(dtype=int)

0    1
1    0
2    1
dtype: int64

### Replacing strings

In [15]:
series = pd.Series(["10000", "12,000"])
# Strip the thousands separator, then convert the cleaned strings to floats.
# regex=False makes the literal (non-regex) replacement explicit, so the result
# does not depend on the pandas version's default for `regex`.
series = series.str.replace(",", "", regex=False).astype(float)
series

0    10000.0
1    12000.0
dtype: float64

### Determining the largest values (`nlargest` returns the top 5 by default)

In [16]:
s = pd.Series(data=[5, 10, 3, 20, 100, 48, 3, 2, 1000])
# The five largest values (the default count, spelled out), largest first.
s.nlargest(n=5)

8    1000
4     100
5      48
3      20
1      10
dtype: int64

## Finding the indices of the n largest values

In [28]:
# Sample values, and how many of the largest entries to locate.
s = pd.Series(data=[2, 8, 10, 6, 1])
n = 3

A complicated way seen in a [DataCamp video](https://campus.datacamp.com/courses/feature-engineering-for-nlp-in-python/tf-idf-and-similarity-scores?ex=10):

In [18]:
# Pair every value of `s` with its position: (position, value).
indices = list(enumerate(s))
# Order the pairs by value, largest first, and keep only the first n of them.
indices_sorted = sorted(indices, key=lambda pair: pair[1], reverse=True)[:n]
# Drop the values and keep just the positions.
top_indices = [position for position, _value in indices_sorted]
top_indices

[2, 1, 3]

[StackOverflow solution 1](https://stackoverflow.com/a/6910672/156835):

In [19]:
# Operate on `s` (the series defined for this section), not on the unrelated
# `series` left over from an earlier cell.
# argsort() orders positions from smallest to largest value, so the last n
# positions belong to the n largest values; reversing puts the largest first.
list(s.argsort()[-n:][::-1])

[2, 1, 3]

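[StackOverflow solution 2](https://stackoverflow.com/questions/6910641/how-do-i-get-indices-of-n-maximum-values-in-a-numpy-array#comment79177553_6910672) that also handles n = 0 (slicing with `[-n:]` returns every element when `n` is 0, whereas reversing first and slicing with `[:n]` correctly returns none):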

In [20]:
# Reverse the argsort order so the largest value's position comes first, then keep the first n.
s.argsort()[::-1][:n].tolist()

[2, 1, 3]

Or [a more readable solution](https://twitter.com/FlorianDreher/status/1305994131630587907):

In [31]:
# Sort the values from largest to smallest and read off the first n index labels.
s.sort_values(ascending=False).index[:n].tolist()

[2, 1, 3]

### Measuring percent changes

In [22]:
s = pd.Series(data=[10, 12, 15, 30])
# Relative change of each element versus the one immediately before it (periods=1 is the default).
s.pct_change(periods=1)

0     NaN
1    0.20
2    0.25
3    1.00
dtype: float64

### Measuring correlation

In [23]:
s1 = pd.Series(data=[1, 2, 3, 4])
s2 = pd.Series(data=[10, 20, 30, 40])

# Pearson correlation (the default method, spelled out) between the two series.
s1.corr(s2, method="pearson")

1.0