# Get started with pandas



In [None]:
import pandas as pd
import matplotlib.pyplot as plt

### Import data

In [None]:
# Load the Netflix titles dataset into a DataFrame.
# The path is relative, so the CSV must sit in the current working directory.
df = pd.read_csv(filepath_or_buffer="netflix_titles.csv")

### Basic data exploration

In [None]:
# Preview the top of the table (first 10 rows).
df.head(n=10)

# Other quick looks at the data - uncomment one at a time to try it:

# the last 10 rows
# df.tail(10)

# the column names
# df.columns

# the data type stored in each column
# df.dtypes

# summary statistics (numeric columns only)
# df.describe()

### Cleaning

In [None]:
# Count the missing values in each column.
# isnull() is an alias of isna(), so both calls detect the same missing values.

# single True/False answer: is anything missing anywhere?
# df.isnull().values.any()

# boolean table marking every missing cell
# df.isna()
df.isna().sum(axis=0)

In [None]:
# Build a new DataFrame without the rows whose "director" is missing.
# dropna returns a new object, so df itself is left unchanged.
required_columns = ["director"]
df_cleaned = df.dropna(axis=0, how="any", subset=required_columns)

# Re-count missing values per column - "director" should now report 0.
df_cleaned.isna().sum()

### Sorting

In [None]:
# Build a sorted copy of the data:
#   - director:     ascending (A-Z)
#   - release_year: descending (newest first) within each director
# NaN values are placed at the end (na_position="last").
sort_columns = ["director", "release_year"]
sort_ascending = [True, False]
df_sorted = df.sort_values(
    by=sort_columns,
    ascending=sort_ascending,
    na_position="last",
)
df_sorted

### Grouping

In [None]:
# Count how many titles each director has, largest count first.
# NOTE: despite the df_ prefix, the result is a Series indexed by director,
# not a DataFrame. groupby skips rows whose director is missing by default.
titles_per_director = df.groupby("director").size()
df_director_count = titles_per_director.sort_values(ascending=False)
df_director_count

In [None]:
# Keep only the titles whose "listed_in" text contains "TV Dramas".
# na=False treats rows with a missing listed_in value as non-matches.
is_tv_drama = df["listed_in"].str.contains("TV Dramas", na=False)
df_tv_dramas = df[is_tv_drama]
df_tv_dramas

### Basic plots with matplotlib

In [None]:
# Pie chart of how the titles split across the values of the "type" column.

# Number of titles per type (value_counts orders them most common first).
type_counts = df["type"].value_counts()

# One wedge per type, labelled with the type name; autopct prints each
# wedge's share as a percentage with one decimal place.
figure_size = (4, 4)
pie_options = {"labels": type_counts.index, "autopct": "%1.1f%%"}

plt.figure(figsize=figure_size)
plt.pie(type_counts, **pie_options)
plt.title("Distribution of Netflix Movies & TV Shows")

# Render the finished figure.
plt.show()