# Intro to Pandas

Pandas is a high-level data manipulation package which was built on top of Numpy. The key structures within pandas include Series and Dataframes.

## Series

A series is a one-dimensional array with axis labels (an index).

In [2]:
# Importing libraries and packages

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [3]:
# Creating a Series from a list
x = pd.Series([10,20,30,40,50])
x

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [4]:
# We can access different components separately:

In [5]:
#Accessing the index
x.index

RangeIndex(start=0, stop=5, step=1)

In [6]:
# Accessing values

In [7]:
x.values

array([10, 20, 30, 40, 50])

In [8]:
# Accessing the dtype

In [9]:
# A Series is an ndarray, thus it's homogenous and CANNOT store multiple dtypes

In [10]:
x.dtype

dtype('int64')

In [11]:
# Creating a Series with an Index
data = [450, 650, 870]
Sales = Series(data, index=["Don", "Mike", "Edwin"])
Sales

Don      450
Mike     650
Edwin    870
dtype: int64

In [12]:
# Check the type
type(Sales)

pandas.core.series.Series

In [13]:
# If we check the index of Sales, we will get the values, rather than the range. Because it's a string.
Sales.index

Index(['Don', 'Mike', 'Edwin'], dtype='object')

## Accessing Values

In [14]:
# You can access values using the index name
Sales["Don"]

450

In [15]:
#You can still use traditional indexing

In [16]:
Sales[0]

  Sales[0]


450

## Checking for conditions

In [17]:
# You can filter based on conditions.

In [18]:
Sales > 500
#This will usually return booleans

Don      False
Mike      True
Edwin     True
dtype: bool

In [19]:
# We can use these booleans

In [20]:
Sales[[False, True, True]]

Mike     650
Edwin    870
dtype: int64

In [21]:
# If we want to see values greater than 500, we can use booleans

In [22]:
Sales[Sales>500]

Mike     650
Edwin    870
dtype: int64

In [23]:
# False example

In [24]:
"Sally" in Sales

False

In [25]:
# What about this?

In [26]:
450 in Sales

False

In [27]:
#450 is not an index, it's a value. Thus it will return False.

# Working with Dictionaries

In [28]:
#Converting a Series to a dictionary
sales_dict = Sales.to_dict()

In [29]:
sales_dict

{'Don': 450, 'Mike': 650, 'Edwin': 870}

In [30]:
#Converting a dict to a series

In [31]:
sales_ser = Series(sales_dict)

In [32]:
sales_ser

Don      450
Mike     650
Edwin    870
dtype: int64

### Adding entries and working with NaN/null values

In [33]:
# We can create a new Series from an existing Series

In [34]:
#If we specify names in the index that were NOT there already, NaN values will be asigned
new_sales = Series(Sales, index=["Don", "Mike", "Sally", "Edwin", "Lucy"])

In [35]:
new_sales

Don      450.0
Mike     650.0
Sally      NaN
Edwin    870.0
Lucy       NaN
dtype: float64

In [36]:
# We can check if there are any NaN values in a Series
# For this we use Numpy!

In [37]:
np.isnan(new_sales)

Don      False
Mike     False
Sally     True
Edwin    False
Lucy      True
dtype: bool

In [38]:
#To check for null values, use Pandas!
pd.isnull(new_sales)

Don      False
Mike     False
Sally     True
Edwin    False
Lucy      True
dtype: bool

### Naming components

In [39]:
# Name an index
Sales.index.name = "Sales person"
Sales

Sales person
Don      450
Mike     650
Edwin    870
dtype: int64

In [40]:
# Naming a Series
Sales.name = "Total tv sales"
Sales

Sales person
Don      450
Mike     650
Edwin    870
Name: Total tv sales, dtype: int64

## DataFrames

DataFrames are two-dimensional, size-mutable, potentially heterogeneous tabular data structures...

## Creating a DataFrame

In [41]:
#Creating a DataFrame from a list
data = [["Adrian", 20], ["Bethany", 23], ["Chloe", 41]]

#When we create a DataFrame, we can specify what the column names are and the data type is

df = pd.DataFrame(data, columns=["Name", "Age"])
df

Unnamed: 0,Name,Age
0,Adrian,20
1,Bethany,23
2,Chloe,41


In [42]:
#Creating a DF from a dictionary 
# Dictionary with sample data
data = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [30, 25, 35],
    "City": ["London", "Manchester", "Birmingham"],
    "Occupation": ["Engineer", "Doctor", "Artist"]
}

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City,Occupation
0,Alice,30,London,Engineer
1,Bob,25,Manchester,Doctor
2,Charlie,35,Birmingham,Artist


In [43]:
# Custom indexes
custom_indexes = ["ID_101", "ID_102", "ID_103"]

In [44]:
df = pd.DataFrame(data, index=custom_indexes)
df

Unnamed: 0,Name,Age,City,Occupation
ID_101,Alice,30,London,Engineer
ID_102,Bob,25,Manchester,Doctor
ID_103,Charlie,35,Birmingham,Artist


In [45]:
# List of dictionaries
data = [
    {"Name": "Alice", "Age": 30, "City": "London"},
    {"Name": "Bob", "Age": 25, "City": "Manchester"},
    {"Name": "Charlie", "Age": 35, "City": "Birmingham"}
]

# Creating a DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print(df)

      Name  Age        City
0    Alice   30      London
1      Bob   25  Manchester
2  Charlie   35  Birmingham


In [46]:
# Creating a Series
series = pd.Series([30, 25, 35], index=["Alice", "Bob", "Charlie"], name="Age")

df = series.to_frame()
df

Unnamed: 0,Age
Alice,30
Bob,25
Charlie,35


In [47]:
data = [
    {"Name": "Alice", "Age": 30, "City": "London"},
    {"Name": "Bob", "Age": 25, "City": "Manchester"},
    {"Name": "Charlie", "Age": 35, "City": "Birmingham"}
]

df = pd.DataFrame(data)

# New Series
occupations = pd.Series(["Engineer", "Doctor", "Artist"], name="Occupation")

# Adding the Series as a new column to the DataFrame
df["Occupation"] = occupations
df

Unnamed: 0,Name,Age,City,Occupation
0,Alice,30,London,Engineer
1,Bob,25,Manchester,Doctor
2,Charlie,35,Birmingham,Artist


In [48]:
# Original DataFrame
data = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [30, 25, 35],
    "City": ["London", "Manchester", "Birmingham"]
}
df = pd.DataFrame(data)

df = df.set_index("Name")
df

Unnamed: 0_level_0,Age,City
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,30,London
Bob,25,Manchester
Charlie,35,Birmingham


In [49]:
#How to fill missing values

# Sample DataFrame with missing values
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Age": [30, None, 35, None],
    "City": ["London", None, "Birmingham", "Manchester"]
}

# Create the DataFrame
df = pd.DataFrame(data)

# Display the original DataFrame
print("Original DataFrame:")
print(df)
print("\n")

# 1. **Forward Fill (ffill)**
df_filled_ffill = df.fillna(method='ffill')

print("DataFrame with forward fill (ffill):")
print(df_filled_ffill)
print("\n")

# 2. **Backward Fill (bfill)**
df_filled_bfill = df.fillna(method='bfill')

print("DataFrame with backward fill (bfill):")
print(df_filled_bfill)

Original DataFrame:
      Name   Age        City
0    Alice  30.0      London
1      Bob   NaN        None
2  Charlie  35.0  Birmingham
3    David   NaN  Manchester


DataFrame with forward fill (ffill):
      Name   Age        City
0    Alice  30.0      London
1      Bob  30.0      London
2  Charlie  35.0  Birmingham
3    David  35.0  Manchester


DataFrame with backward fill (bfill):
      Name   Age        City
0    Alice  30.0      London
1      Bob  35.0  Birmingham
2  Charlie  35.0  Birmingham
3    David   NaN  Manchester


  df_filled_ffill = df.fillna(method='ffill')
  df_filled_bfill = df.fillna(method='bfill')


In [50]:
# Sample DataFrame with missing values
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Age": [30, None, 35, None],
    "City": ["London", None, "Birmingham", "Manchester"]
}

df = pd.DataFrame(data)

# Forward Fill (ffill / pad)
df_filled_ffill = df.fillna(method='ffill')

print("Forward Fill (ffill / pad):")
print(df_filled_ffill)

Forward Fill (ffill / pad):
      Name   Age        City
0    Alice  30.0      London
1      Bob  30.0      London
2  Charlie  35.0  Birmingham
3    David  35.0  Manchester


  df_filled_ffill = df.fillna(method='ffill')


In [51]:
# Backward Fill (bfill)
df_filled_bfill = df.fillna(method='bfill')

print("Backward Fill (bfill):")
print(df_filled_bfill)

Backward Fill (bfill):
      Name   Age        City
0    Alice  30.0      London
1      Bob  35.0  Birmingham
2  Charlie  35.0  Birmingham
3    David   NaN  Manchester


  df_filled_bfill = df.fillna(method='bfill')


In [52]:
# Forward Fill using 'pad' (same as 'ffill')
df_filled_pad = df.fillna(method='pad')

print("Forward Fill (pad):")
print(df_filled_pad)

Forward Fill (pad):
      Name   Age        City
0    Alice  30.0      London
1      Bob  30.0      London
2  Charlie  35.0  Birmingham
3    David  35.0  Manchester


  df_filled_pad = df.fillna(method='pad')


In [53]:
#Interpolate
# Sample DataFrame with missing values (NaN)
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Age": [30, None, 35, None]
}

df = pd.DataFrame(data)

# Interpolating to fill missing values in the 'Age' column
df["Age"] = df["Age"].interpolate()

print(df)

      Name   Age
0    Alice  30.0
1      Bob  32.5
2  Charlie  35.0
3    David  35.0


In [54]:
#Drop rows
# Sample DataFrame
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Age": [30, 25, 35, 40],
    "City": ["London", "Manchester", "Birmingham", "London"]
}

df = pd.DataFrame(data)

# Drop row with index 1 (Bob)
df_dropped_row = df.drop(1)

print(df_dropped_row)

      Name  Age        City
0    Alice   30      London
2  Charlie   35  Birmingham
3    David   40      London


In [55]:
#Drop column
# Drop column 'City'
df_dropped_column = df.drop("City", axis=1)

print(df_dropped_column)

      Name  Age
0    Alice   30
1      Bob   25
2  Charlie   35
3    David   40


In [56]:
#Drop rows based on a threshold
# Sample DataFrame with missing values (NaN)
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Age": [30, None, None, 40],
    "City": [None, "Manchester", "Birmingham", "London"]
}

df = pd.DataFrame(data)

# Drop rows where there are fewer than 2 non-null values
df_dropped_rows = df.dropna(thresh=2)

print(df_dropped_rows)

      Name   Age        City
0    Alice  30.0        None
1      Bob   NaN  Manchester
2  Charlie   NaN  Birmingham
3    David  40.0      London


In [57]:
#Drop based on an index
# Sample DataFrame
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Age": [30, 25, 35, 40],
    "City": ["London", "Manchester", "Birmingham", "London"]
}

df = pd.DataFrame(data)

# Drop row with index 1 (Bob) and index 3 (David)
df_dropped_rows = df.drop([1, 3])

print(df_dropped_rows)

      Name  Age        City
0    Alice   30      London
2  Charlie   35  Birmingham


In [58]:
#Drop duplicate rows
# Sample DataFrame with duplicate rows
data = {
    "Name": ["Alice", "Bob", "Charlie", "Alice", "David", "Bob"],
    "Age": [30, 25, 35, 30, 40, 25],
    "City": ["London", "Manchester", "Birmingham", "London", "London", "Manchester"]
}

df = pd.DataFrame(data)

# Remove duplicate rows
df_no_duplicates = df.drop_duplicates()

print(df_no_duplicates)

      Name  Age        City
0    Alice   30      London
1      Bob   25  Manchester
2  Charlie   35  Birmingham
4    David   40      London


In [59]:
#Find duplicates
# Creating a sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'David', 'Charlie'],
    'Age': [25, 30, 35, 25, 40, 35],
    'City': ['London', 'Paris', 'Berlin', 'London', 'Madrid', 'Berlin']
}

df = pd.DataFrame(data)

# Finding duplicate rows
duplicates = df[df.duplicated()]

print("Duplicate Rows:\n", duplicates)

Duplicate Rows:
       Name  Age    City
3    Alice   25  London
5  Charlie   35  Berlin


In [None]:
#Selecting an entire column
# Creating a sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['London', 'Paris', 'Berlin']
}

df = pd.DataFrame(data)

# Selecting the "Age" column as a Series
age_series = df['Age']
print(type(age_series))  # Output: <class 'pandas.core.series.Series'>

# Selecting the "Age" column as a DataFrame
age_dataframe = df[['Age']]
print(type(age_dataframe))  # Output: <class 'pandas.core.frame.DataFrame'>

In [60]:
#iloc, what is it and what can you do with it?
# Sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, 35, 40, 45],
    'City': ['London', 'Paris', 'Berlin', 'Madrid', 'Rome']
}

df = pd.DataFrame(data)

# Select the second row
print(df.iloc[1])  

# Select all rows but only the "Age" column
print(df.iloc[:, 1])  

# Select the first three rows and first two columns
print(df.iloc[0:3, 0:2]) 

Name      Bob
Age        30
City    Paris
Name: 1, dtype: object
0    25
1    30
2    35
3    40
4    45
Name: Age, dtype: int64
      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35


In [61]:
#What about loc?
# Sample DataFrame with custom index
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, 35, 40, 45],
    'City': ['London', 'Paris', 'Berlin', 'Madrid', 'Rome']
}

df = pd.DataFrame(data, index=['a', 'b', 'c', 'd', 'e'])  # Custom index

# Select row 'c'
print(df.loc['c'])

# Select all rows but only 'Age' column
print(df.loc[:, 'Age'])

# Select specific rows and columns
print(df.loc[['a', 'c', 'e'], ['Name', 'City']])

Name    Charlie
Age          35
City     Berlin
Name: c, dtype: object
a    25
b    30
c    35
d    40
e    45
Name: Age, dtype: int64
      Name    City
a    Alice  London
c  Charlie  Berlin
e      Eve    Rome


In [62]:
#How can you select specific info?
# Sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, 35, 40, 45],
    'City': ['London', 'Paris', 'Berlin', 'Madrid', 'Rome']
}

df = pd.DataFrame(data)

# Filter where Age > 30
filtered_df = df[df['Age'] > 30]
print(filtered_df)

      Name  Age    City
2  Charlie   35  Berlin
3    David   40  Madrid
4      Eve   45    Rome


In [63]:
#How to sort
# Sample DataFrame
data = {
    'Name': ['Charlie', 'Alice', 'Bob', 'Eve', 'David'],
    'Age': [35, 25, 30, 45, 40],
    'City': ['Berlin', 'London', 'Paris', 'Rome', 'Madrid']
}

df = pd.DataFrame(data)

# Sort by Age in ascending order
print(df.sort_values(by='Age'))

# Sort by Age in descending order
print(df.sort_values(by='Age', ascending=False))

# Sort by Age and then by Name
print(df.sort_values(by=['Age', 'Name']))

      Name  Age    City
1    Alice   25  London
2      Bob   30   Paris
0  Charlie   35  Berlin
4    David   40  Madrid
3      Eve   45    Rome
      Name  Age    City
3      Eve   45    Rome
4    David   40  Madrid
0  Charlie   35  Berlin
2      Bob   30   Paris
1    Alice   25  London
      Name  Age    City
1    Alice   25  London
2      Bob   30   Paris
0  Charlie   35  Berlin
4    David   40  Madrid
3      Eve   45    Rome


In [64]:
#How to rank
# Sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, 35, 35, 40]
}

df = pd.DataFrame(data)

# Rank by Age (default: ascending, average for ties)
df['Rank'] = df['Age'].rank()

# Rank in descending order
df['Rank_Desc'] = df['Age'].rank(ascending=False)

# Rank using 'min' method (lowest rank for ties)
df['Rank_Min'] = df['Age'].rank(method='min')

print(df)

      Name  Age  Rank  Rank_Desc  Rank_Min
0    Alice   25   1.0        5.0       1.0
1      Bob   30   2.0        4.0       2.0
2  Charlie   35   3.5        2.5       3.0
3    David   35   3.5        2.5       3.0
4      Eve   40   5.0        1.0       5.0


In [65]:
#What is describe and how can it be used?
# Sample DataFrame
data = {
    'Age': [25, 30, 35, 40, 45],
    'Salary': [30000, 40000, 50000, 60000, 70000]
}

df = pd.DataFrame(data)

# Get summary statistics
print(df.describe())

             Age        Salary
count   5.000000      5.000000
mean   35.000000  50000.000000
std     7.905694  15811.388301
min    25.000000  30000.000000
25%    30.000000  40000.000000
50%    35.000000  50000.000000
75%    40.000000  60000.000000
max    45.000000  70000.000000


In [66]:
#Other summary statistics
# .count()	Number of non-null values
# .sum()	Sum of values
# .mean()	Mean (average)
# .median()	Median (50th percentile)
# .std()	Standard deviation
# .var()	Variance
# .min()	Minimum value
# .max()	Maximum value
# .idxmin()	Index of minimum value
# .idxmax()	Index of maximum value

In [None]:
#Index hierarchy
# Creating MultiIndex from tuples
index = [('A', 1), ('A', 2), ('B', 1), ('B', 2)]
columns = ['Value']

df = pd.DataFrame([10, 20, 30, 40], index=pd.MultiIndex.from_tuples(index, names=['Letter', 'Number']), columns=columns)

print(df)