# Intro to pandas

Pandas is a Python library, and most likely one of the most popular libraries when it comes to data science.

In [27]:
# Importing libraries: import [library name] as [the alias you want to use for it in this file].
import pandas as pd

# Load the CSV file that sits next to this notebook into a DataFrame;
# the rest of the notebook explores pandas through this `df` object.
df = pd.read_csv(filepath_or_buffer="./BBAS3.SA.csv")

In [28]:
# A notebook cell only auto-displays the value of its LAST expression, so
# each result is passed to display() explicitly to make both of them visible.

# .head() returns the top 5 rows of the dataset.
display(df.head())

# .tail() returns the bottom 5 rows.
display(df.tail())

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
241,2019-09-16,47.040001,47.560001,46.299999,46.650002,46.650002,8175500
242,2019-09-17,46.389999,47.450001,46.119999,47.299999,47.299999,6744200
243,2019-09-18,47.459999,48.25,47.16,48.02,48.02,8528300
244,2019-09-19,48.41,48.700001,46.77,46.77,46.77,9661800
245,2019-09-20,47.0,47.98,46.900002,47.509998,47.509998,15284500


In [29]:
# We can explore the different characteristics of the DataFrame.
# A cell only auto-displays its last expression (and this cell ends with an
# assignment, which displays nothing), so each result is shown with display().

# This tells us the type of the object itself
display(type(df))

# This tells us the data type (dtype) of each Series in the DataFrame
display(df.dtypes)

# This prints a summary table: index range, columns, non-null counts,
# dtypes and memory usage. info() prints by itself, so no display() is needed.
df.info()

# This displays the names of the Series (columns) in the DataFrame
display(df.columns)

# We can store the column names in a variable for later use.
# Note: df.columns is a pandas Index object, not a plain Python list.
name_of_cols = df.columns


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246 entries, 0 to 245
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       246 non-null    object 
 1   Open       246 non-null    float64
 2   High       246 non-null    float64
 3   Low        246 non-null    float64
 4   Close      246 non-null    float64
 5   Adj Close  246 non-null    float64
 6   Volume     246 non-null    int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 13.6+ KB


In [30]:
# We can select a single Series (column) by its name.
# display() is used because only a cell's last expression is shown automatically.
display(df['High'])

# We can also find out what the type of a Series is.
display(type(df['Date']))

# Remember the variable storing the column names? Let's print it.
print(f"Name of columns is :{name_of_cols}")

Name of columns is :Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')


In [31]:
# Descriptive statistics are available directly on a Series.
# Here: the median of the "Low" column and the mean of the "High" column.
low_median = df["Low"].median()
high_mean = df["High"].mean()
print(f"Median is {low_median} and mean is {high_mean}")

Median is 47.955 and mean is 48.152113682926824


In [32]:
# .describe() reports count, mean, std, min, quartiles and max of a Series in one call.
df["High"].describe()

count    246.000000
mean      48.152114
std        5.340762
min       29.760000
25%       45.450001
50%       49.270000
75%       52.169998
max       55.910000
Name: High, dtype: float64

In [33]:
# Dates live in their own column; as loaded from the CSV its dtype is `object`.
df["Date"]

0      2018-09-21
1      2018-09-24
2      2018-09-25
3      2018-09-26
4      2018-09-27
          ...    
241    2019-09-16
242    2019-09-17
243    2019-09-18
244    2019-09-19
245    2019-09-20
Name: Date, Length: 246, dtype: object

In [34]:
# Parse the date column into real datetimes so that the `.dt` accessor
# (year, month, day) can be used on it.
# NOTE(review): the parsed Series is stored as an attribute named `Date` on the
# imported pandas module (`pd`), not on `df` or in a notebook variable. Later
# cells read it back as `pd.Date`, so it is kept here; a regular variable would
# be the cleaner choice.
pd.Date = pd.to_datetime(df["Date"], format="%Y-%m-%d")

# Year component of every parsed date
pd.Date.dt.year

0      2018
1      2018
2      2018
3      2018
4      2018
       ... 
241    2019
242    2019
243    2019
244    2019
245    2019
Name: Date, Length: 246, dtype: int32

In [35]:
# Getting the day.
# The dates are parsed straight from df["Date"] so this cell does not rely on
# an attribute that another cell attached to the pandas module.
pd.to_datetime(df["Date"], format="%Y-%m-%d").dt.day

0      21
1      24
2      25
3      26
4      27
       ..
241    16
242    17
243    18
244    19
245    20
Name: Date, Length: 246, dtype: int32

In [36]:
# Getting the month.
# The dates are parsed straight from df["Date"] so this cell does not rely on
# an attribute that another cell attached to the pandas module.
pd.to_datetime(df["Date"], format="%Y-%m-%d").dt.month

0      9
1      9
2      9
3      9
4      9
      ..
241    9
242    9
243    9
244    9
245    9
Name: Date, Length: 246, dtype: int32