# Intro to Pandas

In [9]:
# Importing libraries and packages
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

#import sys
#!{sys.executable} -m pip install pandas
#!{sys.executable} -m pip install pandas

In [10]:
data = [450, 650, 870]
Sales = Series(data, index = ["Don", "Mike", "Edwin"])
Sales

Don      450
Mike     650
Edwin    870
dtype: int64

In [6]:
type(Sales)

pandas.core.series.Series

In [7]:
# If we check the index of Sales, we will get the values, rather than the range. Because it's a string.
Sales.index

Index(['Don', 'Mike', 'Edwin'], dtype='object')

## Accessing values

In [8]:
# You can access values using the index name
Sales["Don"]

np.int64(450)

In [9]:
# You can still use traditional indexing
Sales[0]

  Sales[0]


np.int64(450)

## Checking for conditions

In [11]:
# You can filter based on conditions
Sales > 500
# Returns booleans

Don      False
Mike      True
Edwin     True
dtype: bool

In [12]:
# We can use these booleans
Sales[[False,True, True]]

Mike     650
Edwin    870
dtype: int64

In [13]:
Sales[Sales>500]

Mike     650
Edwin    870
dtype: int64

In [14]:
"Don" in Sales

True

In [15]:
# False example
"Sally" in Sales

False

In [16]:
450 in Sales

False

In [18]:
"450" in Sales
# 450 is not an index, it's a value. Thus it will return False.

False

# Working with Dictionaries

In [11]:
# Converting a Series to a dictionary
sales_dict = Sales.to_dict()
sales_dict

{'Don': 450, 'Mike': 650, 'Edwin': 870}

In [21]:
# Converting a dict to a Series
sales_ser = Series(sales_dict)
sales_ser

Don      450
Mike     650
Edwin    870
dtype: int64

### Adding entries and working with NaN/null values

In [22]:
new_sales = Series (Sales, index=["Don", "Mike", "Sally", "Edwin", "Lucy"])
new_sales

Don      450.0
Mike     650.0
Sally      NaN
Edwin    870.0
Lucy       NaN
dtype: float64

In [23]:
# We can check if there are any NaN values in a Series
# We use numpy
np.isnan(new_sales)

Don      False
Mike     False
Sally     True
Edwin    False
Lucy      True
dtype: bool

In [24]:
# To check for null values, use Pandas
pd.isnull(new_sales)

Don      False
Mike     False
Sally     True
Edwin    False
Lucy      True
dtype: bool

### Naming components in a Series

In [29]:
# Name an index
Sales.index.name = "Sales person"
Sales

Sales person
Don      450
Mike     650
Edwin    870
Name: Total TV sales, dtype: int64

In [28]:
# Naming a Series
Sales.name = "Total TV sales"
Sales

Sales person
Don      450
Mike     650
Edwin    870
Name: Total TV sales, dtype: int64

## DataFrames

DataFrames are two-dimensional, size-mutable, potentially heterogeneous tabular data structures. This data structure contains TWO labeled axes (rows and columns.)

### Creating a DataFrame

In [33]:
# Creating a DataFrame from a list
data = [["Adrian", 20],["Bethany", 23], ["Chloe", 41]]

#When we create a DataFrame, we can specify what the column names are and the data type is
df = pd.DataFrame(data, columns=["Name", "Age"])
df

Unnamed: 0,Name,Age
0,Adrian,20
1,Bethany,23
2,Chloe,41


# Python Pandas Research/Practice

In [27]:
# Creating DF from a dictionary

# Create a dictionary
dict = {
  'FirstName': ['John', 'Andrew', 'Maria', 'Helen'],
  'LastName': ['Brown', 'Purple', 'White', 'Blue'],
  'Age': [25, 48, 76, 19]
}

# Make it DF with DataFrame command
df = pd.DataFrame(dict)
df

Unnamed: 0,FirstName,LastName,Age
0,John,Brown,25
1,Andrew,Purple,48
2,Maria,White,76
3,Helen,Blue,19


In [24]:
# Adding a custom index

# First we can add a custom column to make index later (we could also use existing column)
CustomC = ['First', 'Second', 'Third', 'Fourth'] # Create a new column
df ['CustomColumn'] = CustomC # Append to existing DF as a column 'CustomColumn'

#Make the new column an index
df.set_index('CustomColumn', inplace=True)
df

Unnamed: 0_level_0,FirstName,LastName,Age
CustomColumn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,John,Brown,25
Second,Andrew,Purple,48
Third,Maria,White,76
Fourth,Helen,Blue,19


In [36]:
# Creating a DF from a list of dictionaries

# Using the same dictionary as above, only now it's a list of dictionaries
dict = [
    {'FirstName': 'John', 'LastName': 'Brown', 'Age': 25},
    {'FirstName': 'Andrew', 'LastName': 'Purple', 'Age': 48},
    {'FirstName': 'Maria', 'LastName': 'White', 'Age': 76},
    {'FirstName': 'Helen', 'LastName': 'Blue', 'Age': 19}
]

# Make it DF with DataFrame command
df = pd.DataFrame(dict)

df

Unnamed: 0,FirstName,LastName,Age
0,John,Brown,25
1,Andrew,Purple,48
2,Maria,White,76
3,Helen,Blue,19


In [34]:
# Creating a DF from a Series

# Creating two lists first
name = ['Don', 'Mike', 'Sally', 'Edwin', 'Lucy']
age = [25, 48, 55, 34, 19]
# Creating two Series by passing lists
name_series = pd.Series(name)
age_series = pd.Series(age)
 
# Creating a dictionary by passing Series objects as values
dict = {'Author': name_series,
         'Article': age_series}
 
# Creating DataFrame by passing Dictionary
df = pd.DataFrame(dict)
df


Unnamed: 0,Author,Article
0,Don,25
1,Mike,48
2,Sally,55
3,Edwin,34
4,Lucy,19


In [53]:
# Adding a Series to an existing DF

# Create a dictionary
dict = [
    {'FirstName': 'John', 'LastName': 'Brown', 'Age': 25},
    {'FirstName': 'Andrew', 'LastName': 'Purple', 'Age': 48},
    {'FirstName': 'Maria', 'LastName': 'White', 'Age': 76},
    {'FirstName': 'Helen', 'LastName': 'Blue', 'Age': 19}
]
# Make it DF with DataFrame command
df = pd.DataFrame(dict)

# Define a new Series, first as list, then Series
city = ['London', 'Manchester', 'Liverpool', 'Glasgow']
city_series = pd.Series(city)

# Append to existing DF as a column 'City'
df ['City'] = city_series

df


Unnamed: 0,FirstName,LastName,Age,City
0,John,Brown,25,London
1,Andrew,Purple,48,Manchester
2,Maria,White,76,Liverpool
3,Helen,Blue,19,Glasgow


In [60]:
# Shifting/Changing a DF's index

# df.shift shifts the index, axis defines is it a column or a row
print(df.shift(1, axis=0))


  FirstName LastName   Age        City
0      None     None   NaN        None
1      John    Brown  25.0      London
2    Andrew   Purple  48.0  Manchester
3     Maria    White  76.0   Liverpool


In [67]:
# Fill in missing data

# Create a dictionary
dict = [
    {'FirstName': 'John', 'LastName': np.nan, 'Age': 25},
    {'FirstName': 'Andrew', 'LastName': 'Purple', 'Age': 48},
    {'FirstName': 'Maria', 'LastName': np.nan, 'Age': 76},
    {'FirstName': 'Helen', 'LastName': np.nan, 'Age': 19}
]
# Make it DF with DataFrame command
df = pd.DataFrame(dict)

# Filling missing values with 0 with fillna command
df.fillna(0)

Unnamed: 0,FirstName,LastName,Age
0,John,0,25
1,Andrew,Purple,48
2,Maria,0,76
3,Helen,0,19


In [77]:
# Fill in missing data - backfill + bfill

# Create a dictionary
dict = [
    {'FirstName': 'John', 'LastName': np.nan, 'Age': 25},
    {'FirstName': 'Andrew', 'LastName': 'Purple', 'Age': 48},
    {'FirstName': 'Maria', 'LastName': np.nan, 'Age': 76},
    {'FirstName': 'Helen', 'LastName': np.nan, 'Age': 19}
]
# Make it DF with DataFrame command
df = pd.DataFrame(dict)

# Filling missing values with 0 with backfill command
#df.fillna(method='backfill')

# Future-proof

# Filling missing values with 0 with bfill
df.bfill()

Unnamed: 0,FirstName,LastName,Age
0,John,Purple,25
1,Andrew,Purple,48
2,Maria,,76
3,Helen,,19


In [76]:
# Fill in missing data - pad + ffill

# Create a dictionary
dict = [
    {'FirstName': 'John', 'LastName': np.nan, 'Age': 25},
    {'FirstName': 'Andrew', 'LastName': 'Purple', 'Age': 48},
    {'FirstName': 'Maria', 'LastName': np.nan, 'Age': 76},
    {'FirstName': 'Helen', 'LastName': np.nan, 'Age': 19}
]
# Make it DF with DataFrame command
df = pd.DataFrame(dict)

# Filling missing values with 0 with backfill command
#df.fillna(method='pad')

# Filling missing values with 0 with ffill
df.ffill()

Unnamed: 0,FirstName,LastName,Age
0,John,,25
1,Andrew,Purple,48
2,Maria,Purple,76
3,Helen,Purple,19


In [94]:
# What does .interpolate() do?
# Python Pandas interpolate() method is used to fill NaN values in the DataFrame or Series 
# using various interpolation techniques to fill the missing values rather than hard-coding the value.

# Create a dictionary
dict = dict = [
    {'FirstName': 'John', 'LastName': 'Brown', 'Age': np.nan},
    {'FirstName': 'Andrew', 'LastName': 'Purple', 'Age': 48},
    {'FirstName': 'Maria', 'LastName': 'White', 'Age': np.nan},
    {'FirstName': 'Helen', 'LastName': 'Blue', 'Age': 19}
]
# Make it DF with DataFrame command
df = pd.DataFrame(dict)

df['Age'] = df.col.replace(0, np.nan)

# to interpolate the missing values 
#d = df.infer_objects(copy=False)
df.interpolate(method ='linear', limit_direction ='forward')
print(df)

  FirstName LastName  Age
0      John    Brown    0
1    Andrew   Purple   48
2     Maria    White    0
3     Helen     Blue   19


  df.interpolate(method ='linear', limit_direction ='forward')


In [None]:
# How to drop values

In [100]:
# Drop rows or columns

# Create a dictionary
dict = [
    {'FirstName': 'John', 'LastName': 'Brown', 'Age': 25},
    {'FirstName': 'Andrew', 'LastName': 'Purple', 'Age': 48},
    {'FirstName': 'Maria', 'LastName': 'White', 'Age': 76},
    {'FirstName': 'Helen', 'LastName': 'Blue', 'Age': 19}
]
# Make it DF with DataFrame command
df = pd.DataFrame(dict)

# To drop column Age
newdf = df.drop('Age', axis='columns')
newdf

# To drop row 0
newdf = df.drop(0, axis='index')
newdf

Unnamed: 0,FirstName,LastName,Age
1,Andrew,Purple,48
2,Maria,White,76
3,Helen,Blue,19


In [101]:
# Drop based on a threshold

# Create a dictionary
dict = [
    {'FirstName': 'John', 'LastName': 'Brown', 'Age': 25},
    {'FirstName': 'Andrew', 'LastName': 'Purple', 'Age': 48},
    {'FirstName': 'Maria', 'LastName': 'White', 'Age': 76},
    {'FirstName': 'Helen', 'LastName': 'Blue', 'Age': 19}
]
# Make it DF with DataFrame command
df = pd.DataFrame(dict)

# define a condition
df = df[df.Age > 19]

df

Unnamed: 0,FirstName,LastName,Age
0,John,Brown,25
1,Andrew,Purple,48
2,Maria,White,76


In [103]:
# Drop based on an index

# Create a dictionary
dict = [
    {'FirstName': 'John', 'LastName': 'Brown', 'Age': 25},
    {'FirstName': 'Andrew', 'LastName': 'Purple', 'Age': 48},
    {'FirstName': 'Maria', 'LastName': 'White', 'Age': 76},
    {'FirstName': 'Helen', 'LastName': 'Blue', 'Age': 19}
]
# Make it DF with DataFrame command
df = pd.DataFrame(dict)

# To drop row 2
newdf = df.drop(2, axis='index')
newdf

Unnamed: 0,FirstName,LastName,Age
0,John,Brown,25
1,Andrew,Purple,48
3,Helen,Blue,19


In [107]:
# Remove duplicate rows

# Create a dictionary
dict = [
    {'FirstName': 'John', 'LastName': 'Brown', 'Age': 25},
    {'FirstName': 'Andrew', 'LastName': 'Purple', 'Age': 48},
    {'FirstName': 'Andrew', 'LastName': 'Purple', 'Age': 48},
    {'FirstName': 'Maria', 'LastName': 'White', 'Age': 76},
    {'FirstName': 'Helen', 'LastName': 'Blue', 'Age': 19},
    {'FirstName': 'Helen', 'LastName': 'Blue', 'Age': 19}
]
# Make it DF with DataFrame command
df = pd.DataFrame(dict)

# Removing duplicates with drop_duplicates() function
newdf = df.drop_duplicates()
newdf

Unnamed: 0,FirstName,LastName,Age
0,John,Brown,25
1,Andrew,Purple,48
3,Maria,White,76
4,Helen,Blue,19


In [113]:
# Find duplicate rows

# Create a dictionary
dict = [
    {'FirstName': 'John', 'LastName': 'Brown', 'Age': 25},
    {'FirstName': 'Andrew', 'LastName': 'Purple', 'Age': 48},
    {'FirstName': 'Andrew', 'LastName': 'Purple', 'Age': 48},
    {'FirstName': 'Maria', 'LastName': 'White', 'Age': 76},
    {'FirstName': 'Helen', 'LastName': 'Blue', 'Age': 19},
    {'FirstName': 'Helen', 'LastName': 'Blue', 'Age': 19}
]
# Make it DF with DataFrame command
df = pd.DataFrame(dict)

# To show which rows/columns have duplicates (Boolan)
newdf = df.duplicated()
newdf

# If duplicate rows/columns are needed (data)
duplicates = df[df.duplicated()]
duplicates

Unnamed: 0,FirstName,LastName,Age
2,Andrew,Purple,48
5,Helen,Blue,19


In [121]:
# How to select an entire column

# Create a dictionary
dict = [
    {'FirstName': 'John', 'LastName': 'Brown', 'Age': 25},
    {'FirstName': 'Andrew', 'LastName': 'Purple', 'Age': 48},
    {'FirstName': 'Maria', 'LastName': 'White', 'Age': 76},
    {'FirstName': 'Helen', 'LastName': 'Blue', 'Age': 19}
]
# Make it DF with DataFrame command
df = pd.DataFrame(dict)

# select only Age column
newdf = df['Age']
newdf

0    25
1    48
2    76
3    19
Name: Age, dtype: int64

In [128]:
# iloc, what is it and what can you do with it?
#The iloc property gets, or sets, the value(s) of the specified indexes.

# Create a dictionary
dict = [
    {'FirstName': 'John', 'LastName': 'Brown', 'Age': 25},
    {'FirstName': 'Andrew', 'LastName': 'Purple', 'Age': 48},
    {'FirstName': 'Maria', 'LastName': 'White', 'Age': 76},
    {'FirstName': 'Helen', 'LastName': 'Blue', 'Age': 19}
]
# Make it DF with DataFrame command
df = pd.DataFrame(dict)

# Can be used to select a single row
newdf = df.iloc[[2]]
newdf

# Or multiple rows
newdf = df.iloc[[1,2]]
newdf

# A range or rows (note single brackets)
newdf = df.iloc[0:2]
newdf

# Can specify both rows and columns to select - first bracket defines rows, second columns
newdf = df.iloc[[0, 2], [0, 1]]
newdf

Unnamed: 0,FirstName,LastName
0,John,Brown
2,Maria,White


In [139]:
# What about loc?
# The loc property gets, or sets, the value(s) of the specified labels/indices.

# Create a dictionary
dict = [
    {'FirstName': 'John', 'LastName': 'Brown', 'Age': 25},
    {'FirstName': 'Andrew', 'LastName': 'Purple', 'Age': 48},
    {'FirstName': 'Maria', 'LastName': 'White', 'Age': 76},
    {'FirstName': 'Helen', 'LastName': 'Blue', 'Age': 19}
]
# Make it DF with DataFrame command
df = pd.DataFrame(dict)

# To get row with an index 0
newdf = df.loc[[0]]
newdf

# To get rows with an indiced 1 and 2
newdf = df.loc[[1,2]]
newdf

# To get Age in row with an index 0
newdf = df.loc[[0],['Age']]
newdf

Unnamed: 0,FirstName,LastName,Age
1,Andrew,Purple,48
2,Maria,White,76


In [143]:
# How can you filter to select specific info?

# Create a dictionary
dict = [
    {'FirstName': 'John', 'LastName': 'Brown', 'Age': 25},
    {'FirstName': 'Andrew', 'LastName': 'Purple', 'Age': 48},
    {'FirstName': 'Maria', 'LastName': 'White', 'Age': 76},
    {'FirstName': 'Helen', 'LastName': 'Blue', 'Age': 19}
]
# Make it DF with DataFrame command
df = pd.DataFrame(dict)

# define a condition
newdf = df[df.Age > 19]
newdf

# Using loc select only certain columns
newdf = df.loc[df.Age > 19, ['FirstName', 'LastName']]
newdf

Unnamed: 0,FirstName,LastName
0,John,Brown
1,Andrew,Purple
2,Maria,White


In [145]:
# How to sort?

# Create a dictionary
dict = [
    {'FirstName': 'John', 'LastName': 'Brown', 'Age': 25},
    {'FirstName': 'Andrew', 'LastName': 'Purple', 'Age': 48},
    {'FirstName': 'Maria', 'LastName': 'White', 'Age': 76},
    {'FirstName': 'Helen', 'LastName': 'Blue', 'Age': 19}
]
# Make it DF with DataFrame command
df = pd.DataFrame(dict)

# Sorting by 'Age' in ascending order with sort_values()
sorted_df = df.sort_values(by='Age')
sorted_df

# Sorting by 'Age' in descending order with sort_values()
sorted_df = df.sort_values(by='Age', ascending = False)
sorted_df

Unnamed: 0,FirstName,LastName,Age
2,Maria,White,76
1,Andrew,Purple,48
0,John,Brown,25
3,Helen,Blue,19


In [147]:
# How to rank?

# Create a dictionary
dict = [
    {'FirstName': 'John', 'LastName': 'Brown', 'Age': 25},
    {'FirstName': 'Andrew', 'LastName': 'Purple', 'Age': 48},
    {'FirstName': 'Maria', 'LastName': 'White', 'Age': 76},
    {'FirstName': 'Helen', 'LastName': 'Blue', 'Age': 19}
]
# Make it DF with DataFrame command
df = pd.DataFrame(dict)

# Ranking by 'FirstName' in ascending order with rank()
ranked_df = df['FirstName'].rank()
ranked_df

0    3.0
1    1.0
2    4.0
3    2.0
Name: FirstName, dtype: float64