In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Build a small demo dataframe from a mapping of column name -> values
passenger_columns = {
    "Name": [
        "Braund, Mr. Owen Harris",
        "Allen, Mr. William Henry",
        "Bonnell, Miss. Elizabeth",
    ],
    "Age": [22, 35, 58],
    "Points": [10, 11, 15],
    "Sex": ["male", "male", "female"],
}
df = pd.DataFrame(data=passenger_columns)
df

In [None]:
# Pull one column out of the frame by its label;
# a single DataFrame column comes back as a Series.
df.loc[:, "Age"]


In [None]:
# Build a standalone Series from scratch and label it "Age"
ages = pd.Series(data=[22, 35, 58], name="Age")
ages

In [None]:
# Largest value found in the Age column
df.loc[:, "Age"].max()

In [None]:
# Descriptive statistics giving a quick overview of the demo frame
df_overview = df.describe()
df_overview

In [None]:
# Load the Titanic passenger data from its CSV file
titanic = pd.read_csv(filepath_or_buffer="data/titanic.csv")
titanic

In [None]:
# Peek at the first 8 rows
titanic.head(n=8)

In [None]:
# List the dtype of every column
column_dtypes = titanic.dtypes
column_dtypes

In [None]:
# Round-trip the passengers through an Excel workbook:
# write them to the "passengers" sheet without the index, then load that sheet back.
titanic.to_excel(excel_writer="data/titanic.xlsx", sheet_name="passengers", index=False)
titanic = pd.read_excel(io="data/titanic.xlsx", sheet_name="passengers")
# Inspect the last 10 rows of the reloaded frame
titanic.tail(n=10)

In [None]:
# Print a concise technical summary of the dataframe.
# info() writes its report to the cell output itself instead of returning a value to display.
titanic.info()

In [None]:
# Select the Age column on its own (the selection is a Series) and preview it
ages = titanic.loc[:, "Age"]
ages.head(n=5)

In [None]:
# Select several columns at once by passing a list of labels
wanted_columns = ["Age", "Sex"]
age_sex = titanic.loc[:, wanted_columns]
age_sex

In [None]:
# Keep only the passengers whose Age is strictly greater than 35
above_35 = titanic.loc[titanic["Age"].gt(35)]
above_35

In [None]:
# The filter condition by itself: one True/False value per row
titanic["Age"].gt(35)

In [None]:
# Dimensions of the filtered frame as (rows, columns)
n_rows, n_cols = above_35.shape
(n_rows, n_cols)

In [None]:
# Membership filter: isin() keeps rows whose Pclass is one of the listed values
wanted_classes = [2, 3]
class_23 = titanic.loc[titanic["Pclass"].isin(wanted_classes)]
class_23

In [None]:
# Same selection expressed as two equality tests combined with | (element-wise OR)
pclass = titanic["Pclass"]
class_23 = titanic.loc[pclass.eq(2) | pclass.eq(3)]
class_23

In [None]:
# Drop the rows where Age is missing by keeping those where notna() is True
has_age = titanic["Age"].notna()
age_no_na = titanic.loc[has_age]
age_no_na.head(n=5)

In [None]:
# .loc takes a row selector and a column selector together:
# rows where Age > 35, restricted to the Name column
older_than_35 = titanic["Age"] > 35
adult_names = titanic.loc[older_than_35, "Name"]
adult_names

In [None]:
# Label-based selection: index labels 0 and 1, columns Name and Age
row_labels = [0, 1]
column_labels = ["Name", "Age"]
titanic.loc[row_labels, column_labels]

In [None]:
# Position-based selection: row positions 9..24 and column positions 2..4
# (slice end points are exclusive)
titanic.iloc[slice(9, 25), slice(2, 5)]


In [None]:
# Overwrite values in place through a positional selection:
# the first three rows of the column at position 3 become "anonymous".
# NOTE(review): position 3 is presumably the Name column -- confirm against the CSV header.
titanic.iloc[:3, 3] = "anonymous"
titanic.head(n=5)

In [None]:
# Load the NO2 air-quality measurements, using the first CSV column as the index
# and asking pandas to parse that index as dates
air_quality = pd.read_csv(
    "data/air_quality_no2.csv",
    parse_dates=True,
    index_col=0,
)
air_quality.head(n=5)

In [None]:
# Quick visual check: line plot of every column against the index
air_quality.plot.line()

In [None]:
# Line plot of the station_paris column only
paris_series = air_quality["station_paris"]
paris_series.plot.line()

In [None]:
# Scatter plot comparing station_london (x axis) with station_paris (y axis);
# alpha=0.5 makes the markers semi-transparent
air_quality.plot(
    kind="scatter",
    x="station_london",
    y="station_paris",
    alpha=0.5,
)

In [None]:
# List the public attributes of the .plot accessor (names not starting with an underscore)
list(filter(lambda attr: not attr.startswith("_"), dir(air_quality.plot)))

In [None]:
# Box plot of every column
air_quality.plot(kind="box")

In [None]:
# Area plots with subplots=True, on a 12x4 figure;
# the returned axes are kept in axs
axs = air_quality.plot.area(
    subplots=True,
    figsize=(12, 4),
)

In [None]:
# Draw all series as one area plot on a single, explicitly created Axes
# 1. create an empty matplotlib Figure and Axes
fig, axs = plt.subplots(figsize=(12, 4))
# 2. let pandas draw the area plot onto the prepared Axes
air_quality.plot.area(ax=axs)
# 3. customise through the regular matplotlib API
axs.set_ylabel("NO$_2$ concentration")
# 4. save the result with matplotlib's own savefig
fig.savefig("data/no2_concentrations.png")

In [None]:
# Add derived columns, computed element-wise from existing ones
# London values scaled by the constant factor 1.882
air_quality["london_mg_per_cubic"] = air_quality["station_london"].mul(1.882)
# Row-by-row ratio of the Paris values to the Antwerp values
air_quality["ratio_paris_antwerp"] = air_quality["station_paris"].div(air_quality["station_antwerp"])
air_quality.head(n=5)

In [None]:
# Rename the station columns through an old-name -> new-name mapping;
# rename() returns a new frame, so air_quality itself keeps its column names
station_labels = {
    "station_antwerp": "BETR801",
    "station_paris": "FR04014",
    "station_london": "London Westminster",
}
air_quality_renamed = air_quality.rename(columns=station_labels)
air_quality_renamed.head(n=5)

In [None]:
# rename() also accepts a function: apply str.lower to every column label
air_quality_renamed = air_quality_renamed.rename(str.lower, axis="columns")
air_quality_renamed.head(n=5)

In [None]:
# Load the Titanic data afresh from the CSV file, then average the Age column
titanic = pd.read_csv(filepath_or_buffer="data/titanic.csv")
titanic.loc[:, "Age"].mean()

In [None]:
# Median of several columns at once (one value per selected column)
median_columns = ["Age", "Fare"]
titanic.loc[:, median_columns].median()

In [None]:
# Column-specific aggregations: each column gets its own list of statistics
stats_per_column = {
    "Age": ["min", "max", "median", "skew"],
    "Fare": ["min", "max", "median", "mean"],
}
titanic.agg(stats_per_column)

In [None]:
# Mean of every numeric column, computed separately for each value of Sex.
# numeric_only=True is required: the frame also holds text columns (e.g. Name),
# and since pandas 2.0 a group-wise mean over those raises TypeError instead of
# silently skipping them. Older pandas versions give the same result with it.
titanic.groupby("Sex").mean(numeric_only=True)


In [None]:
# Average age per sex: narrow the frame to the two columns first, then group
sex_and_age = titanic.loc[:, ["Sex", "Age"]]
sex_and_age.groupby(by="Sex").mean()

In [None]:
# Group first, then pick the Age column from the grouped object before averaging
grouped_by_sex = titanic.groupby(by="Sex")
grouped_by_sex["Age"].mean()

In [None]:
# Average Fare for every (Sex, Pclass) combination, selecting Fare from the grouped object
group_keys = ["Sex", "Pclass"]
titanic.groupby(by=group_keys)["Fare"].mean()

In [None]:
# Average Fare for every (Sex, Pclass) combination, this time by narrowing the
# frame to the three relevant columns before grouping
fare_columns = titanic.loc[:, ["Sex", "Pclass", "Fare"]]
fare_columns.groupby(by=["Sex", "Pclass"]).mean()

In [None]:
# Number of rows for each distinct Pclass value
pclass_counts = titanic["Pclass"].value_counts()
pclass_counts

In [None]:
# The same tally done with groupby: count the Pclass entries inside each Pclass group
grouped_by_class = titanic.groupby(by="Pclass")
grouped_by_class["Pclass"].count()

In [None]:
# Order the rows by Age (ascending) and show the first five
titanic_by_age = titanic.sort_values(by="Age", ascending=True)
titanic_by_age.head(n=5)


In [None]:
# Order by Pclass, then Age, both descending, and show the first five rows
sort_keys = ["Pclass", "Age"]
titanic.sort_values(by=sort_keys, ascending=False).head(n=5)


In [None]:
# Load the long-format air-quality file, indexed by its "date.utc" column (parsed as dates).
# Note: this rebinds air_quality to a different dataset than the one used earlier.
air_quality = pd.read_csv(
    "data/air_quality_long.csv",
    parse_dates=True,
    index_col="date.utc",
)
# Keep only the rows whose parameter column equals "no2"
no2 = air_quality.loc[air_quality["parameter"].eq("no2")]
no2

In [None]:
# Sort by the index, then keep the first two rows of each location group
no2_sorted = no2.sort_index()
no2_subset = no2_sorted.groupby(by="location").head(n=2)
no2_subset