In [2]:
import pandas as pd

In [3]:
# Create a simple DataFrame. With no labels given, pandas numbers both
# the rows and the columns 0, 1, 2.
sample_values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
df = pd.DataFrame(sample_values)

print(df)

# Name the rows and columns directly in the constructor.
df = pd.DataFrame(sample_values, index=['X', 'Y', 'Z'], columns=['A', 'B', 'C'])

print(df)

# Alternative: start again from the default labels and assign the names
# afterwards. Rebuilding first matters: assigning the same labels to the
# already-labelled frame above would change nothing.
df = pd.DataFrame(sample_values)
df.columns = ['A', 'B', 'C']
df.index = ['X', 'Y', 'Z']

print(df)


   0  1  2
0  1  2  3
1  4  5  6
2  7  8  9
   A  B  C
X  1  2  3
Y  4  5  6
Z  7  8  9
   A  B  C
X  1  2  3
Y  4  5  6
Z  7  8  9


In [4]:
df.info() # Prints a summary of the DataFrame: index range, column names, non-null counts, dtypes, and memory usage.

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, X to Z
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       3 non-null      int64
 1   B       3 non-null      int64
 2   C       3 non-null      int64
dtypes: int64(3)
memory usage: 96.0+ bytes


In [5]:
df.describe() # Descriptive statistics per numeric column: count, mean, std, min, the 25%/50%/75% quartiles (50% is the median), and max.

Unnamed: 0,A,B,C
count,3.0,3.0,3.0
mean,4.0,5.0,6.0
std,3.0,3.0,3.0
min,1.0,2.0,3.0
25%,2.5,3.5,4.5
50%,4.0,5.0,6.0
75%,5.5,6.5,7.5
max,7.0,8.0,9.0


In [6]:
df.nunique() # Number of distinct values in each column, returned as a Series indexed by column name

A    3
B    3
C    3
dtype: int64

In [7]:
df['A'].unique() # Distinct values of column 'A', returned as an array

array([1, 4, 7])

In [8]:
df.shape # Number of rows and columns in the DataFrame, as a (rows, cols) tuple

(3, 3)

In [9]:
df.size # Total number of elements in the DataFrame (rows * columns)

9

In [10]:
coffee = pd.read_csv('./warmup_data/coffee.csv') # Load a CSV file into a DataFrame

In [11]:
coffee.head() # First 5 rows of the DataFrame (5 is the default when no count is given)

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35


In [12]:
coffee.head(10) # First 10 rows of the DataFrame

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35


In [13]:
# tail() is the counterpart of head(): it returns the last rows (5 when no count is given)
coffee.tail()

Unnamed: 0,Day,Coffee Type,Units Sold
9,Friday,Latte,35
10,Saturday,Espresso,45
11,Saturday,Latte,35
12,Sunday,Espresso,45
13,Sunday,Latte,35


In [14]:
coffee.tail(10) # Last 10 rows of the DataFrame

Unnamed: 0,Day,Coffee Type,Units Sold
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35
10,Saturday,Espresso,45
11,Saturday,Latte,35
12,Sunday,Espresso,45
13,Sunday,Latte,35


In [15]:
coffee.sample(10) # 10 randomly chosen rows; the selection changes every time this cell is executed

Unnamed: 0,Day,Coffee Type,Units Sold
10,Saturday,Espresso,45
7,Thursday,Latte,30
8,Friday,Espresso,45
3,Tuesday,Latte,20
13,Sunday,Latte,35
12,Sunday,Espresso,45
4,Wednesday,Espresso,35
6,Thursday,Espresso,40
5,Wednesday,Latte,25
1,Monday,Latte,15


In [16]:
coffee.sample(10, random_state=1) # Fixing random_state makes the sample deterministic: the same 10 rows on every run

Unnamed: 0,Day,Coffee Type,Units Sold
3,Tuesday,Latte,20
7,Thursday,Latte,30
6,Thursday,Espresso,40
2,Tuesday,Espresso,30
10,Saturday,Espresso,45
4,Wednesday,Espresso,35
1,Monday,Latte,15
12,Sunday,Espresso,45
0,Monday,Espresso,25
13,Sunday,Latte,35


In [17]:
coffee.loc[0, 'Day'] # Label-based access with .loc[rows, columns]; a single row label and a single column name return one value

'Monday'

In [18]:
coffee.loc[[0,3,6]] # Select specific rows by passing a list of index labels

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
3,Tuesday,Latte,20
6,Thursday,Espresso,40


In [19]:
coffee.loc[5:9] # Slice notation also works, but .loc slices by label: unlike regular Python slicing, the end label (9) is included

Unnamed: 0,Day,Coffee Type,Units Sold
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35


In [20]:
coffee.loc[5:8, ['Day', 'Units Sold']] # Combine a row selection (labels 5 through 8, inclusive) with a list of column names

Unnamed: 0,Day,Units Sold
5,Wednesday,25
6,Thursday,40
7,Thursday,30
8,Friday,45


In [21]:
coffee.iloc[5:8, [0,2]] # Position-based indexing: uses integer positions instead of labels. The slice end is exclusive (rows 5, 6, 7), as in regular Python slicing

Unnamed: 0,Day,Units Sold
5,Wednesday,25
6,Thursday,40
7,Thursday,30


In [22]:
# Important: .iloc always works with integer positions, whatever the index looks like.
# .loc, however, is label-based: if the DataFrame has a non-numeric index (e.g., dates, or strings like weekdays),
# it needs those exact labels, and integer labels such as 5:8 result in an error.

# Example (uncomment to run). Afterwards, restore the default integer index by re-running the read_csv cell for the coffee file.

# coffee.index = coffee['Day']
# coffee.loc[5:8] # This will result in an error

In [23]:
coffee.loc[1, 'Units Sold'] = 10 # Set one cell in place: row label 1, column 'Units Sold'

coffee

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,10
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35


In [24]:
coffee.loc[1:3, 'Units Sold'] = 10 # Set 'Units Sold' to 10 for the rows with index labels 1 through 3 (both ends inclusive, because .loc is label-based).
# The index of this DataFrame is numeric, so both .loc (label-based) and .iloc (position-based) accept integers here. With a non-numeric index (e.g., 'Monday'), .loc would need those labels, while .iloc would still take integer positions.
coffee

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,10
2,Tuesday,Espresso,10
3,Tuesday,Latte,10
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35


In [25]:
# at/iat: accessors for reading or setting a single value, optimized for
# speed in that case. They cannot select multiple values.
# 'at' is label-based (row label, column name); 'iat' is position-based.
coffee.at[3, 'Units Sold'] = 12

coffee

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,10
2,Tuesday,Espresso,10
3,Tuesday,Latte,12
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35


In [26]:
coffee.iat[3, 2] = 10 # Position-based counterpart of .at: row position 3, column position 2 (the 'Units Sold' column)

coffee

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,10
2,Tuesday,Espresso,10
3,Tuesday,Latte,10
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35


In [27]:
coffee['Day'] # Access a column by name (returns a Series). A list of names selects several columns. More robust than dot notation.

0        Monday
1        Monday
2       Tuesday
3       Tuesday
4     Wednesday
5     Wednesday
6      Thursday
7      Thursday
8        Friday
9        Friday
10     Saturday
11     Saturday
12       Sunday
13       Sunday
Name: Day, dtype: object

In [28]:
coffee.Day # Dot notation: only works if the column name is a valid Python identifier (so not for 'Units Sold', which contains a space)

0        Monday
1        Monday
2       Tuesday
3       Tuesday
4     Wednesday
5     Wednesday
6      Thursday
7      Thursday
8        Friday
9        Friday
10     Saturday
11     Saturday
12       Sunday
13       Sunday
Name: Day, dtype: object

In [29]:
coffee.sort_values('Units Sold') # Sort the rows by the given column; ascending order is the default

Unnamed: 0,Day,Coffee Type,Units Sold
1,Monday,Latte,10
2,Tuesday,Espresso,10
3,Tuesday,Latte,10
0,Monday,Espresso,25
5,Wednesday,Latte,25
7,Thursday,Latte,30
4,Wednesday,Espresso,35
9,Friday,Latte,35
13,Sunday,Latte,35
11,Saturday,Latte,35


In [30]:
coffee.sort_values('Units Sold', ascending=False) # Sort the rows by the given column in descending order

Unnamed: 0,Day,Coffee Type,Units Sold
10,Saturday,Espresso,45
8,Friday,Espresso,45
12,Sunday,Espresso,45
6,Thursday,Espresso,40
4,Wednesday,Espresso,35
11,Saturday,Latte,35
13,Sunday,Latte,35
9,Friday,Latte,35
7,Thursday,Latte,30
0,Monday,Espresso,25


In [31]:
coffee.sort_values(['Units Sold', 'Coffee Type'], ascending=False) # Sort by 'Units Sold' in descending order.
# Rows that tie on 'Units Sold' are then ordered by 'Coffee Type';
# the single ascending=False applies to both keys, so that is descending too.

Unnamed: 0,Day,Coffee Type,Units Sold
8,Friday,Espresso,45
10,Saturday,Espresso,45
12,Sunday,Espresso,45
6,Thursday,Espresso,40
9,Friday,Latte,35
11,Saturday,Latte,35
13,Sunday,Latte,35
4,Wednesday,Espresso,35
7,Thursday,Latte,30
5,Wednesday,Latte,25


In [32]:
# Sort by 'Units Sold' in descending order; rows that tie on 'Units Sold'
# are then ordered by 'Coffee Type' in ascending order.
# `ascending` takes one flag per sort key. Booleans are the documented form;
# the integers 0 and 1 are accepted as well and mean False and True.
coffee.sort_values(['Units Sold', 'Coffee Type'], ascending=[False, True])

Unnamed: 0,Day,Coffee Type,Units Sold
8,Friday,Espresso,45
10,Saturday,Espresso,45
12,Sunday,Espresso,45
6,Thursday,Espresso,40
4,Wednesday,Espresso,35
9,Friday,Latte,35
11,Saturday,Latte,35
13,Sunday,Latte,35
7,Thursday,Latte,30
0,Monday,Espresso,25


In [33]:
# Walk the 'coffee' DataFrame row by row with iterrows(), which yields
# (index label, row) pairs. Each row is a Series, so a column is read by
# its label (e.g., row['Units Sold']).
# Per row this prints the index label, the 'Units Sold' value, and then
# blank lines as a separator.
# iterrows() is slower than vectorized operations; keep it for cases that
# genuinely need row-wise processing.

for index, row in coffee.iterrows():
    print(index, row['Units Sold'], '\n\n', sep='\n')

0
25



1
10



2
10



3
10



4
35



5
25



6
40



7
30



8
45



9
35



10
45



11
35



12
45



13
35





In [34]:
# Note:
#   - CSV: Easy to use and share, but inefficient in terms of storage and data
#          processing due to lack of compression and type preservation.
#
#   - Feather: Best for scenarios where speed of read/write operations is critical,
#              particularly in memory-constrained environments.
#
#   - Parquet: The go-to format for large-scale data storage and analytics, offering
#              both compression and efficient querying capabilities, making it ideal for long-term,
#              cost-efficient data storage.

results = pd.read_parquet('./data/results.parquet') # Read a Parquet file into a DataFrame

In [35]:
results.head() # Preview the first 5 rows

Unnamed: 0,year,type,discipline,event,as,athlete_id,noc,team,place,tied,medal
0,1912.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,17.0,True,
1,1912.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jean Montariol,,False,
2,1920.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,32.0,True,
3,1920.0,Summer,Tennis,"Doubles, Mixed (Olympic)",Jean-François Blanchy,1,FRA,Jeanne Vaussard,8.0,True,
4,1920.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jacques Brugnon,4.0,False,


In [36]:
olympics_data = pd.read_excel('./data/olympics-data.xlsx') # Read an Excel (.xlsx) file; with no sheet_name given, the first sheet is loaded. Takes longer to load.

In [37]:
olympics_data.head() # Preview the first 5 rows of the sheet that was loaded

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
0,1,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02
1,2,Arnaud Boetsch,1969-04-01,Meulan,Yvelines,FRA,France,183.0,76.0,
2,3,Jean Borotra,1898-08-13,Biarritz,Pyrénées-Atlantiques,FRA,France,183.0,76.0,1994-07-17
3,4,Jacques Brugnon,1895-05-11,Paris VIIIe,Paris,FRA,France,168.0,64.0,1978-03-20
4,5,Albert Canet,1878-04-17,Wandsworth,England,GBR,France,,,1930-07-25


In [38]:
olympics_data = pd.read_excel('./data/olympics-data.xlsx', sheet_name='results') # Read the sheet named 'results'. Note: this rebinds olympics_data, replacing the previously loaded frame. Takes significantly more time.

In [39]:
olympics_data.head() # Preview the first 5 rows of the 'results' sheet

Unnamed: 0,year,type,discipline,event,as,athlete_id,noc,team,place,tied,medal
0,1912.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,17.0,True,
1,1912.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jean Montariol,,False,
2,1920.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,32.0,True,
3,1920.0,Summer,Tennis,"Doubles, Mixed (Olympic)",Jean-François Blanchy,1,FRA,Jeanne Vaussard,8.0,True,
4,1920.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jacques Brugnon,4.0,False,


In [40]:
bios = pd.read_csv('./data/bios.csv') # Load the bios CSV file into a DataFrame

In [41]:
bios.info() # Inspect the column dtypes and non-null counts first: knowing whether a column is numerical or text (object) tells us how to filter it properly.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145500 entries, 0 to 145499
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   athlete_id    145500 non-null  int64  
 1   name          145500 non-null  object 
 2   born_date     143693 non-null  object 
 3   born_city     110908 non-null  object 
 4   born_region   110908 non-null  object 
 5   born_country  110908 non-null  object 
 6   NOC           145499 non-null  object 
 7   height_cm     106651 non-null  float64
 8   weight_kg     102070 non-null  float64
 9   died_date     33940 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 11.1+ MB


In [45]:
bios # Evaluating the bare name displays the DataFrame; a large frame is shown truncated, with '...' marking the omitted rows

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
0,1,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02
1,2,Arnaud Boetsch,1969-04-01,Meulan,Yvelines,FRA,France,183.0,76.0,
2,3,Jean Borotra,1898-08-13,Biarritz,Pyrénées-Atlantiques,FRA,France,183.0,76.0,1994-07-17
3,4,Jacques Brugnon,1895-05-11,Paris VIIIe,Paris,FRA,France,168.0,64.0,1978-03-20
4,5,Albert Canet,1878-04-17,Wandsworth,England,GBR,France,,,1930-07-25
...,...,...,...,...,...,...,...,...,...,...
145495,149222,Polina Luchnikova,2002-01-30,Serov,Sverdlovsk,RUS,ROC,167.0,61.0,
145496,149223,Valeriya Merkusheva,1999-09-20,Moskva (Moscow),Moskva,RUS,ROC,168.0,65.0,
145497,149224,Yuliya Smirnova,1998-05-08,Kotlas,Arkhangelsk,RUS,ROC,163.0,55.0,
145498,149225,André Foussard,1899-05-19,Niort,Deux-Sèvres,FRA,France,166.0,,1986-03-18


In [44]:
bios.loc[bios['height_cm'] > 215, ['name', 'height_cm']] # Boolean-mask filter: rows where height_cm > 215, keeping only the 'name' and 'height_cm' columns.

Unnamed: 0,name,height_cm
5089,Viktor Pankrashkin,220.0
5583,Paulinho Villas Boas,217.0
5673,Gunther Behnke,221.0
5716,Uwe Blab,218.0
5781,Tommy Burleson,223.0
5796,Andy Campbell,218.0
6223,Lars Hansen,216.0
6270,Hu Zhangbao,216.0
6409,Sergey Kovalenko,216.0
6420,Jānis Krūmiņš,218.0


In [51]:
# Athletes born in Germany (born_country == 'GER') who are taller than 215 cm.
# Each comparison is parenthesised because & binds more tightly than > and ==.
bios.loc[
    (bios['height_cm'] > 215)
    & (bios['born_country'] == 'GER')
]

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
5673,5696,Gunther Behnke,1963-01-19,Leverkusen,Nordrhein-Westfalen,GER,Germany,221.0,114.0,
5716,5739,Uwe Blab,1962-03-26,München (Munich),Bayern,GER,Germany West Germany,218.0,110.0,


In [52]:
bios.loc[bios['name'].str.contains('Petros')] # Athletes with 'Petros' anywhere in their name (substring match, so 'Stefanos-Petros Santa' is included). Case sensitive by default.

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
3911,3924,Petros Nazarbegian,1927-05-06,Tehran,Tehran,IRI,Islamic Republic of Iran,,,2015-11-13
15197,15299,Petros Leonidis,,,,,Greece,,,
22206,22372,Petros Manos,1871-04-07,Athina (Athens),Attiki,GRE,Greece,,,1918-04-04
30112,30342,Petros Persakis,1879-01-01,Athina (Athens),Attiki,GRE,Greece,,,
41402,41719,Petros Kyritsis,1953-10-15,,,,Cyprus,174.0,76.0,
42334,42657,Petros Pappas,1953-05-06,,,,Greece,172.0,68.0,
54247,54637,Stefanos-Petros Santa,1975-05-21,Cluj-Napoca,Cluj,ROU,Greece Romania,,,
58052,58460,Petros Bourntoulis,1969-07-27,,,,Greece,193.0,118.0,
58059,58467,Petros Galaktopoulos,1945-06-07,Athina (Athens),Attiki,GRE,Greece,172.0,74.0,
58094,58506,Petros Triantafyllidis,1947-01-01,Bad Berleburg,Nordrhein-Westfalen,GER,Greece,151.0,52.0,


In [53]:
bios.loc[bios['name'].str.contains('Petros|Leonidas')] # The pattern is treated as a regular expression, so 'Petros|Leonidas' matches names containing either 'Petros' or 'Leonidas'.

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
1321,1328,Leonidas Asprilla,1952-07-31,,,,Colombia,164.0,57.0,
3911,3924,Petros Nazarbegian,1927-05-06,Tehran,Tehran,IRI,Islamic Republic of Iran,,,2015-11-13
4429,4444,Leonidas Maleckis,1966-03-08,Vilnius,Vilnius,LTU,Lithuania,178.0,71.0,
8197,8241,Leonidas Njunwa,1952-08-02,,,,United Republic of Tanzania,168.0,71.0,
15197,15299,Petros Leonidis,,,,,Greece,,,
22206,22372,Petros Manos,1871-04-07,Athina (Athens),Attiki,GRE,Greece,,,1918-04-04
24365,24554,Leonidas Flores,1965-01-24,Palma,Guanacaste,CRC,Costa Rica,,,
30112,30342,Petros Persakis,1879-01-01,Athina (Athens),Attiki,GRE,Greece,,,
30114,30344,Leonidas Tsiklitiras,,,,,Greece,,,
41402,41719,Petros Kyritsis,1953-10-15,,,,Cyprus,174.0,76.0,


In [57]:
# Athletes born in one of the listed countries whose name starts with 'Petros',
# keeping only the 'name' and 'born_country' columns.
countries_i_am_interested_in = ['GRE', 'GER', 'GBR', 'USA']
bios.loc[
    (bios['born_country'].isin(countries_i_am_interested_in))
    & (bios['name'].str.startswith('Petros')),
    ['name', 'born_country']
]

Unnamed: 0,name,born_country
22206,Petros Manos,GRE
30112,Petros Persakis,GRE
58059,Petros Galaktopoulos,GRE
58094,Petros Triantafyllidis,GER
58095,Petros Triantafyllidis,GRE


In [62]:
bios.query('born_country == "GRE" and height_cm >= 180') # .query() takes the filter as an expression string, which reads much like SQL for simple conditions. It has limitations with more complex operations or non-standard column names.

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
174,175,Tasos Bavelas,1968-02-27,Athina (Athens),Attiki,GRE,Greece,180.0,76.0,
175,176,Konstantinos Efremoglou,1962-12-04,Athina (Athens),Attiki,GRE,Greece,187.0,74.0,
176,177,Georgios Kalovelonis,1959-08-23,Athina (Athens),Attiki,GRE,Greece,192.0,82.0,
3716,3728,Georgios Stefanopoulos,1962-03-31,Peristeri,Attiki,GRE,Greece,180.0,91.0,
5641,5664,Efthymios Bakatsias,1968-01-14,Athina (Athens),Attiki,GRE,Greece,196.0,85.0,
...,...,...,...,...,...,...,...,...,...,...
140030,143527,Konstantinos Gkiouvetsis,1999-11-19,Chania,Kriti,GRE,Greece,191.0,90.0,
140031,143528,Marios Kapotsis,1991-09-13,Chios,Voreio Aigaio,GRE,Greece,183.0,87.0,
140032,143529,Stylianos Argyropoulos Kanakakis,1996-08-02,Athina (Athens),Attiki,GRE,Greece,190.0,96.0,
143998,147662,Stefanos Tsitsipas,1998-08-12,Athina (Athens),Attiki,GRE,Greece,195.0,90.0,
