In [1]:
import pandas as pd
import numpy as np

### Series

It is a one-dimensional of data. It is like a single column in a table.

You can have different types of data in a series, but be carefull if you want
only numerical data to be able to do some calculations.

In [2]:
# Let's create a series
dogs_breed_series = pd.Series(['French poodle', 'Bulldog', 'Labrador retriever', 789, 'Dachshund'])

In [3]:
dogs_breed_series

0         French poodle
1               Bulldog
2    Labrador retriever
3                   789
4             Dachshund
dtype: object

In [4]:
dogs_weigth_series = pd.Series([3, 4, 67, 23, 39, 19, 10, 9])

In [5]:
dogs_weigth_series


0     3
1     4
2    67
3    23
4    39
5    19
6    10
7     9
dtype: int64

Notice here the dtype is an integer here, while on the previous one it was an object (string will be see as an object in pandas).

The indexes (0, 1, 2, 3, ...) will be automatically assigned. BUt you can change that, like this:

In [6]:

# We can assign different indexes
dogs_weigth_serie = pd.Series(
    [3, 4, 67, 23, 39, 19, 10, 9],
    index = ['Cookie', 'Biscuit', 'Pepper', 'Apollo', 'Ginger', 'Ruby',
             "Spark", 'Peach'])

In [7]:
dogs_weigth_serie

Cookie      3
Biscuit     4
Pepper     67
Apollo     23
Ginger     39
Ruby       19
Spark      10
Peach       9
dtype: int64

## Statistics

Let's calculate the mean, median and mode.

These are function to study central tendency.

In [8]:
dogs_weigth_serie.mean()

21.75

In [9]:
dogs_weigth_serie.median()

14.5

In [10]:
dogs_weigth_serie.mode()

0     3
1     4
2     9
3    10
4    19
5    23
6    39
7    67
dtype: int64

In [11]:
# If we want to study the spread let use the standard deviation and the variance
dogs_weigth_serie.std()

21.783020910791965

In [12]:
dogs_weigth_serie.var()

474.5

# Dataframes

Dataframes are two-dimensional structures (rows, cols).

It has operations that allow you to manipulate numerical tables and time series.

Dataframes will be like you see in excel, but it will be more powerful.

There are multiple ways to define a dataframe.

In [13]:
# You can defined the dataframe by defining each series
d = {'col1': [1, 2], 'col2': [3, 4]}
df = pd.DataFrame(data = d)

In [14]:
df, df.dtypes

(   col1  col2
 0     1     3
 1     2     4,
 col1    int64
 col2    int64
 dtype: object)

In [15]:
# You can define the type of all the columns at once
df = pd.DataFrame(data=d, dtype=np.int8)
df, df.dtypes

(   col1  col2
 0     1     3
 1     2     4,
 col1    int8
 col2    int8
 dtype: object)

In [16]:
# You can define in the same ways, but directly including a Series
d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
df = pd.DataFrame(data=d, index=[0, 1, 2, 3])
df

Unnamed: 0,col1,col2
0,0,
1,1,
2,2,2.0
3,3,3.0


In [17]:
# Building from a numpy ndarray
df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
    columns=['a', 'b', 'c'])
df2

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [18]:
# But the more practical approach is to build it from a list of dict
# This way you are sure that the data in each row is correct
df3 = pd.DataFrame.from_records(
    [{'points': 50, 'time': '5:00', 'year': 2010}, 
     {'points': 25, 'time': '6:00', 'month': "february"}, 
     {'points':90, 'time': '9:00', 'month': 'january'}, 
     {'points_h1':20, 'month': 'june'}]
)
df3

Unnamed: 0,points,time,year,month,points_h1
0,50.0,5:00,2010.0,,
1,25.0,6:00,,february,
2,90.0,9:00,,january,
3,,,,june,20.0


In [19]:
#dtypes to see the types of each columns
df3.dtypes

points       float64
time          object
year         float64
month         object
points_h1    float64
dtype: object

In [20]:
# columns to see the name of all the columns. Some times you have
# so many columns that you do not see all in a screen.
df3.columns

Index(['points', 'time', 'year', 'month', 'points_h1'], dtype='object')

In [21]:
#shape
df3.shape

(4, 5)

In [22]:
#info
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   points     3 non-null      float64
 1   time       3 non-null      object 
 2   year       1 non-null      float64
 3   month      3 non-null      object 
 4   points_h1  1 non-null      float64
dtypes: float64(3), object(2)
memory usage: 292.0+ bytes


In [23]:
#describe, to show some statistic 
df3.describe()

Unnamed: 0,points,year,points_h1
count,3.0,1.0,1.0
mean,55.0,2010.0,20.0
std,32.787193,,
min,25.0,2010.0,20.0
25%,37.5,2010.0,20.0
50%,50.0,2010.0,20.0
75%,70.0,2010.0,20.0
max,90.0,2010.0,20.0


In [24]:
#Head if you want to display just the first row, tail to display the last rows
df3.head(2)

Unnamed: 0,points,time,year,month,points_h1
0,50.0,5:00,2010.0,,
1,25.0,6:00,,february,


In [25]:
 df3.tail(3)

Unnamed: 0,points,time,year,month,points_h1
1,25.0,6:00,,february,
2,90.0,9:00,,january,
3,,,,june,20.0


In [26]:
df3[1:3]

Unnamed: 0,points,time,year,month,points_h1
1,25.0,6:00,,february,
2,90.0,9:00,,january,


In [27]:
# You can select data from one column
df3['points']

0    50.0
1    25.0
2    90.0
3     NaN
Name: points, dtype: float64

In [28]:
# from multiple columns
df3[['points', 'time']]

Unnamed: 0,points,time
0,50.0,5:00
1,25.0,6:00
2,90.0,9:00
3,,


In [29]:
# You can use some conditions to filter the dataframe
df3[df3['points'] < 55]


Unnamed: 0,points,time,year,month,points_h1
0,50.0,5:00,2010.0,,
1,25.0,6:00,,february,


In [30]:
df3[(df3['points'] < 55) & (df3['time'] == '6:00')]

Unnamed: 0,points,time,year,month,points_h1
1,25.0,6:00,,february,


In [31]:
df3[(df3['points'] < 55) | (df3['month'] == 'january')]

Unnamed: 0,points,time,year,month,points_h1
0,50.0,5:00,2010.0,,
1,25.0,6:00,,february,
2,90.0,9:00,,january,


In [32]:
# You can set a column as the index
df3.set_index('points')

Unnamed: 0_level_0,time,year,month,points_h1
points,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
50.0,5:00,2010.0,,
25.0,6:00,,february,
90.0,9:00,,january,
,,,june,20.0


In [34]:
# You can reset the index if you need to
df3 = df3.reset_index()
df3


Unnamed: 0,level_0,index,points,time,year,month,points_h1
0,0,0,50.0,5:00,2010.0,,
1,1,1,25.0,6:00,,february,
2,2,2,90.0,9:00,,january,
3,3,3,,,,june,20.0
