In [2]:
import pandas as pd

In [3]:
# Build a small 3x3 DataFrame. With no labels supplied, pandas numbers
# both the rows and the columns 0, 1, 2.
df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(df)

# Provide the row (index) and column labels directly at construction time.
df = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    index=['X', 'Y', 'Z'],
    columns=['A', 'B', 'C'],
)
print(df)

# Labels can also be (re)assigned afterwards through the `index` and
# `columns` attributes of an existing DataFrame.
df.index = ['X', 'Y', 'Z']
df.columns = ['A', 'B', 'C']
print(df)


   0  1  2
0  1  2  3
1  4  5  6
2  7  8  9
   A  B  C
X  1  2  3
Y  4  5  6
Z  7  8  9
   A  B  C
X  1  2  3
Y  4  5  6
Z  7  8  9


In [4]:
df.info() # Prints a concise summary of the DataFrame: index range, each column's non-null count and dtype, and memory usage.

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, X to Z
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       3 non-null      int64
 1   B       3 non-null      int64
 2   C       3 non-null      int64
dtypes: int64(3)
memory usage: 96.0+ bytes


In [5]:
df.describe() # Descriptive statistics for the numerical columns: count, mean, std, min, quartiles (the 50% row is the median), and max.

Unnamed: 0,A,B,C
count,3.0,3.0,3.0
mean,4.0,5.0,6.0
std,3.0,3.0,3.0
min,1.0,2.0,3.0
25%,2.5,3.5,4.5
50%,4.0,5.0,6.0
75%,5.5,6.5,7.5
max,7.0,8.0,9.0


In [6]:
df.nunique() # Number of distinct values in each column, returned as a Series indexed by column name

A    3
B    3
C    3
dtype: int64

In [7]:
df['A'].unique() # The distinct values of a single column, returned as a NumPy array

array([1, 4, 7])

In [8]:
df.shape # Dimensions of the DataFrame as a (rows, cols) tuple

(3, 3)

In [9]:
df.size # Total number of elements in the DataFrame (rows * cols)

9

In [10]:
coffee = pd.read_csv('./warmup_data/coffee.csv') # Load a CSV file into a DataFrame (relative path, resolved against the current working directory)

In [11]:
coffee.head() # Show the first 5 rows of the DataFrame (5 is the default)

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35


In [12]:
coffee.head(10) # Show the first 10 rows of the DataFrame

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35


In [13]:
# tail() works the same way as head(), but from the end: the last 5 rows by default
coffee.tail()

Unnamed: 0,Day,Coffee Type,Units Sold
9,Friday,Latte,35
10,Saturday,Espresso,45
11,Saturday,Latte,35
12,Sunday,Espresso,45
13,Sunday,Latte,35


In [14]:
coffee.tail(10) # Show the last 10 rows of the DataFrame

Unnamed: 0,Day,Coffee Type,Units Sold
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35
10,Saturday,Espresso,45
11,Saturday,Latte,35
12,Sunday,Espresso,45
13,Sunday,Latte,35


In [15]:
coffee.sample(10) # Show 10 randomly chosen rows; the selection changes each time this cell is executed

Unnamed: 0,Day,Coffee Type,Units Sold
13,Sunday,Latte,35
3,Tuesday,Latte,20
9,Friday,Latte,35
8,Friday,Espresso,45
11,Saturday,Latte,35
12,Sunday,Espresso,45
6,Thursday,Espresso,40
1,Monday,Latte,15
4,Wednesday,Espresso,35
5,Wednesday,Latte,25


In [16]:
coffee.sample(10, random_state=1) # Fixing random_state seeds the sampling, so the same rows are returned on every run

Unnamed: 0,Day,Coffee Type,Units Sold
3,Tuesday,Latte,20
7,Thursday,Latte,30
6,Thursday,Espresso,40
2,Tuesday,Espresso,30
10,Saturday,Espresso,45
4,Wednesday,Espresso,35
1,Monday,Latte,15
12,Sunday,Espresso,45
0,Monday,Espresso,25
13,Sunday,Latte,35


In [17]:
coffee.loc[0, 'Day'] # Label-based access with .loc[rows, columns]; a single row label and column label returns one value

'Monday'

In [18]:
coffee.loc[[0,3,6]] # A list of row labels selects exactly those rows (all columns)

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
3,Tuesday,Latte,20
6,Thursday,Espresso,40


In [19]:
coffee.loc[5:9] # Slice notation works too, but with .loc both endpoints are inclusive (rows 5 through 9)

Unnamed: 0,Day,Coffee Type,Units Sold
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35


In [20]:
coffee.loc[5:8, ['Day', 'Units Sold']] # Combine a row slice (end label inclusive) with a list of column labels

Unnamed: 0,Day,Units Sold
5,Wednesday,25
6,Thursday,40
7,Thursday,30
8,Friday,45


In [21]:
coffee.iloc[5:8, [0,2]] # .iloc selects by integer position instead of by label; the slice end is exclusive, so this returns rows 5-7

Unnamed: 0,Day,Units Sold
5,Wednesday,25
6,Thursday,40
7,Thursday,30


In [22]:
# Important: .iloc always works, because it selects by integer position. .loc selects by label,
# so if your DataFrame has a non-numeric index (e.g., dates, or strings like weekdays) you must
# pass the exact label values; using integer row selectors with .loc then results in an error.

# Example (uncomment to run). Afterwards, restore the default integer index by re-running
# the pd.read_csv cell that loads the coffee file.

# coffee.index = coffee['Day']
# coffee.loc[5:8] # This will result in an error

In [23]:
coffee.loc[1, 'Units Sold'] = 10 # Assign a new value to one cell (row label 1, column 'Units Sold'); this modifies coffee in place

coffee

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,10
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35


In [24]:
# Set 'Units Sold' to 10 for the rows labelled 1 through 3; .loc slices include both endpoints.
# The index here is the default integer one, so labels and positions coincide, but .loc still
# matches labels while .iloc matches positions. With a non-numeric index (e.g., 'Monday'),
# .loc needs those labels and only .iloc accepts integer positions.
coffee.loc[1:3, 'Units Sold'] = 10
coffee

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,10
2,Tuesday,Espresso,10
3,Tuesday,Latte,10
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35


In [25]:
# at/iat: accessors for reading or writing exactly one value; they are
# optimized for that case and do not accept multiple rows or columns.
# 'at' uses label-based indexing (like .loc), while 'iat' uses position-based indexing (like .iloc).
coffee.at[3, 'Units Sold'] = 12 # Row label 3, column label 'Units Sold'

coffee

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,10
2,Tuesday,Espresso,10
3,Tuesday,Latte,12
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35


In [26]:
coffee.iat[3, 2] = 10 # Position-based single-value assignment: row position 3, column position 2 ('Units Sold')

coffee

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,10
2,Tuesday,Espresso,10
3,Tuesday,Latte,10
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35


In [27]:
coffee['Day'] # Bracket access by column name; works for any name (even ones with spaces, e.g. 'Coffee Type'), and a list of names selects several columns

0        Monday
1        Monday
2       Tuesday
3       Tuesday
4     Wednesday
5     Wednesday
6      Thursday
7      Thursday
8        Friday
9        Friday
10     Saturday
11     Saturday
12       Sunday
13       Sunday
Name: Day, dtype: object

In [35]:
coffee.Day # Dot notation: only works if the column name is a valid Python identifier (so not 'Coffee Type') and does not clash with an existing DataFrame attribute or method

0        Monday
1        Monday
2       Tuesday
3       Tuesday
4     Wednesday
5     Wednesday
6      Thursday
7      Thursday
8        Friday
9        Friday
10     Saturday
11     Saturday
12       Sunday
13       Sunday
Name: Day, dtype: object

In [28]:
# Note on file formats:
#   - CSV: Easy to use and share, but inefficient in terms of storage and data
#          processing because it has no compression and does not preserve types.
#
#   - Feather: Best for scenarios where the speed of read/write operations is critical,
#              particularly in memory-constrained environments.
#
#   - Parquet: The go-to format for large-scale data storage and analytics, offering
#              both compression and efficient querying, which makes it well suited
#              to long-term, cost-efficient data storage.

results = pd.read_parquet('./data/results.parquet') # Read a parquet file into a DataFrame

In [29]:
results.head() # Preview the first 5 rows of the parquet data

Unnamed: 0,year,type,discipline,event,as,athlete_id,noc,team,place,tied,medal
0,1912.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,17.0,True,
1,1912.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jean Montariol,,False,
2,1920.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,32.0,True,
3,1920.0,Summer,Tennis,"Doubles, Mixed (Olympic)",Jean-François Blanchy,1,FRA,Jeanne Vaussard,8.0,True,
4,1920.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jacques Brugnon,4.0,False,


In [30]:
olympics_data = pd.read_excel('./data/olympics-data.xlsx') # Read an xlsx file (the first sheet is read when sheet_name is not given); takes longer to load

In [31]:
olympics_data.head() # Preview the first 5 rows of the loaded sheet

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
0,1,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02
1,2,Arnaud Boetsch,1969-04-01,Meulan,Yvelines,FRA,France,183.0,76.0,
2,3,Jean Borotra,1898-08-13,Biarritz,Pyrénées-Atlantiques,FRA,France,183.0,76.0,1994-07-17
3,4,Jacques Brugnon,1895-05-11,Paris VIIIe,Paris,FRA,France,168.0,64.0,1978-03-20
4,5,Albert Canet,1878-04-17,Wandsworth,England,GBR,France,,,1930-07-25


In [32]:
olympics_data = pd.read_excel('./data/olympics-data.xlsx', sheet_name='results') # Read a specific sheet by name. Takes significantly more time. Note: this rebinds olympics_data, replacing the previously loaded frame

In [33]:
olympics_data.head() # Preview the first 5 rows of the 'results' sheet

Unnamed: 0,year,type,discipline,event,as,athlete_id,noc,team,place,tied,medal
0,1912.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,17.0,True,
1,1912.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jean Montariol,,False,
2,1920.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,32.0,True,
3,1920.0,Summer,Tennis,"Doubles, Mixed (Olympic)",Jean-François Blanchy,1,FRA,Jeanne Vaussard,8.0,True,
4,1920.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jacques Brugnon,4.0,False,


In [34]:
bios = pd.read_csv('./data/bios.csv') # Load the bios CSV file into a DataFrame