# Intermediate Data Visualization with Seaborn by Ninh Nguyen

In [None]:
# import all modules
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the school improvement grants CSV and preview the resulting frame
df = pd.read_csv(filepath_or_buffer='data/schoolimprovement2010grants.csv')
df


In [None]:
# Histogram of the award amounts drawn through the pandas plotting API
df['Award_Amount'].plot.hist()
plt.show()

# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Same distribution, this time drawn with seaborn's figure-level displot
sns.displot(df['Award_Amount'])
plt.show()

# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Histogram of the award amounts split into 20 bins
sns.displot(df['Award_Amount'], bins=20)

# Render the figure
plt.show()


In [None]:
# Filled KDE of the award amounts with a rug plot along the axis
sns.displot(df['Award_Amount'], fill=True, rug=True, kind='kde')

# Render the figure
plt.show()


In [None]:
# Load the insurance premiums CSV and preview the resulting frame
insure = pd.read_csv(filepath_or_buffer='data/insurance_premiums.csv')
insure


In [None]:
# Axes-level regression: premiums (y) against insurance_losses (x)
sns.regplot(x="insurance_losses", y="premiums", data=insure)

# Render the figure
plt.show()


In [None]:
# Figure-level counterpart of the regression: premiums vs. insurance_losses
sns.lmplot(x="insurance_losses", y="premiums", data=insure)

# Render the figure
plt.show()


In [None]:
# One regression per Region, distinguished by color on shared axes
sns.lmplot(x="insurance_losses", y="premiums", hue="Region", data=insure)

# Render the figure
plt.show()


In [None]:
# One regression per Region, each drawn in its own facet row
sns.lmplot(x="insurance_losses", y="premiums", row="Region", data=insure)

# Render the figure
plt.show()


In [None]:
# Load the FY18 fair market rents CSV and preview the resulting frame
fmr = pd.read_csv(filepath_or_buffer='data/FY18_4050_FMRs.csv')
fmr


In [None]:
# pandas histogram of fmr_2 under the current matplotlib styling
fmr['fmr_2'].plot.hist()
plt.show()
# Close rather than plt.clf(): once show() has released the figure,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()

# Switch to seaborn's default theme
sns.set()

# Same pandas histogram again, now drawn with the seaborn theme applied
fmr['fmr_2'].plot.hist()
plt.show()
plt.close()


In [None]:
# Distribution of fmr_2 using the 'dark' seaborn style
sns.set_style('dark')
sns.displot(fmr['fmr_2'])
plt.show()

# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Distribution of fmr_2 using the 'whitegrid' seaborn style
sns.set_style('whitegrid')
sns.displot(fmr['fmr_2'])
plt.show()

# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Use the plain 'white' style for this figure
sns.set_style('white')

# Regress 2-bedroom rent (fmr_2) on the 2010 population column
sns.lmplot(x='pop2010', y='fmr_2', data=fmr)

# Remove spines from the current figure (despine defaults)
sns.despine()

# Show the plot, then close the figure rather than plt.clf(): once show()
# has released it, clf() would implicitly create a new blank figure
plt.show()
plt.close()


In [None]:
# Re-apply the seaborn theme with color codes enabled, then draw the
# fmr_3 distribution using the shorthand color 'm' (magenta)
sns.set(color_codes=True)
sns.displot(fmr['fmr_3'], color='m')

# Render the figure
plt.show()


In [None]:
# Draw the fmr_3 distribution once per palette to compare
# the 'bright' and 'colorblind' color schemes
for p in ('bright', 'colorblind'):
    sns.set_palette(p)
    sns.displot(fmr['fmr_3'])
    plt.show()

    # Close rather than plt.clf(): once show() has released the figure,
    # clf() would implicitly create a new blank figure on every iteration
    plt.close()


In [None]:
# Preview 8 colors from the "Purples" palette
sns.palplot(pal=sns.color_palette(palette="Purples", n_colors=8))
plt.show()


In [None]:
# Preview 10 colors from the "husl" palette
sns.palplot(pal=sns.color_palette(palette="husl", n_colors=10))
plt.show()


In [None]:
# Preview 6 colors from the "coolwarm" palette
sns.palplot(pal=sns.color_palette(palette="coolwarm", n_colors=6))
plt.show()


In [None]:
# Explicit figure/axes pair so the Axes can be customized afterwards
fig, ax = plt.subplots()

# Histogram of fmr_3 drawn onto that Axes
sns.histplot(fmr['fmr_3'], ax=ax)

# Replace the column name on the x axis with a readable label
ax.set_xlabel("3 Bedroom Fair Market Rent")

# Render the figure
plt.show()


In [None]:
# Explicit figure/axes pair so the Axes can be customized afterwards
fig, ax = plt.subplots()

# Histogram of fmr_1 drawn onto that Axes
sns.histplot(fmr['fmr_1'], ax=ax)

# Readable x label, x range limited to 100-1500, and a title
ax.set_xlabel("1 Bedroom Fair Market Rent")
ax.set_xlim((100, 1500))
ax.set_title("US Rent")

# Render the figure
plt.show()


In [None]:
# Jittered strip plot: award amount for each selected model
sns.stripplot(x='Award_Amount', y='Model Selected', jitter=True, data=df)

# Render the figure
plt.show()


In [None]:
# Swarm plot of award amount per selected model, colored by Region
sns.swarmplot(x='Award_Amount', y='Model Selected', hue='Region', data=df)

# Render the figure
plt.show()


In [None]:
# Box plot of award amount for each selected model
sns.boxplot(x='Award_Amount', y='Model Selected', data=df)

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Violin plot of award amount per selected model using the husl palette
# NOTE(review): newer seaborn releases warn when `palette` is passed without
# `hue` -- confirm against the installed version before changing the call
sns.violinplot(x='Award_Amount', y='Model Selected', palette='husl', data=df)

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Boxen plot of award amount per selected model, split by Region and
# colored with the Paired palette
sns.boxenplot(data=df,
              x='Award_Amount',
              y='Model Selected',
              hue='Region',
              palette='Paired')

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Count how often each model was selected, one colored bar per Region
sns.countplot(y="Model Selected", hue="Region", data=df)

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Point plot of award amount per selected model; capsize adds caps
# to the ends of the error bars
sns.pointplot(x='Model Selected', y='Award_Amount', capsize=.1, data=df)

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Bar plot of award amount per selected model, one colored bar per Region
sns.barplot(x='Model Selected', y='Award_Amount', hue='Region', data=df)

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Load the college CSV and preview the resulting frame
college = pd.read_csv(filepath_or_buffer='data/college_datav3.csv')
college


In [None]:
# Regress Tuition on SAT_AVG_ALL, drawn with green triangle markers
sns.regplot(x='SAT_AVG_ALL', y='Tuition', marker='^', color='g', data=college)

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Residuals of the Tuition ~ SAT_AVG_ALL regression, drawn in green
sns.residplot(x='SAT_AVG_ALL', y='Tuition', color='g', data=college)

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Regress Tuition on PCTPELL (percentage of Pell Grants)
sns.regplot(x='PCTPELL', y='Tuition', data=college)

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Same regression, but with PCTPELL grouped into 5 bins so the scatter
# shows one tuition estimate per bin (the data is binned by PCTPELL,
# not by region)
sns.regplot(x='PCTPELL', y='Tuition', x_bins=5, data=college)

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Binned regression of Tuition on PCTPELL with a 2nd-order polynomial fit
sns.regplot(x='PCTPELL', y='Tuition', x_bins=5, order=2, data=college)

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Plain scatter of Tuition against UG: the regression line is turned off
sns.regplot(x='UG', y='Tuition', fit_reg=False, data=college)

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Regress Tuition on UG with the x values grouped into 5 bins
sns.regplot(x='UG', y='Tuition', x_bins=5, data=college)

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Regress Tuition on UG with the x values grouped into 8 bins
sns.regplot(x='UG', y='Tuition', x_bins=8, data=college)

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Load the cleaned Daily Show guests CSV and preview the resulting frame
guest = pd.read_csv(filepath_or_buffer='data/daily_show_guests_cleaned.csv')
guest


In [None]:
# Cross-tabulate guest Group (rows) against YEAR (columns) and print it
pd_crosstab = pd.crosstab(index=guest["Group"], columns=guest["YEAR"])
print(pd_crosstab)

# Heatmap of the crosstab with default settings
sns.heatmap(pd_crosstab)

# Keep the row labels horizontal and turn the year labels vertical
plt.yticks(rotation=0)
plt.xticks(rotation=90)

# Render the figure
plt.show()


In [None]:
# Cross-tabulate guest Group (rows) against YEAR (columns)
pd_crosstab = pd.crosstab(guest["Group"], guest["YEAR"])

# Heatmap with the BuGn colormap, thin cell borders and no color bar
sns.heatmap(pd_crosstab, cmap="BuGn", linewidths=.3, cbar=False)

# Keep the row labels horizontal and turn the year labels vertical
plt.yticks(rotation=0)
plt.xticks(rotation=90)

# Show the plot, then close the figure rather than plt.clf(): once show()
# has released it, clf() would implicitly create a new blank figure
plt.show()
plt.close()


In [None]:
# Categorical plot (catplot): box plots of Tuition, one facet row
# per Degree_Type
sns.catplot(data=college,
            x='Tuition',
            kind='box',
            row='Degree_Type')

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Faceted point plot of SAT_AVG_ALL, one row per Degree_Type in the
# explicit order given by row_order
sns.catplot(data=college,
            x='SAT_AVG_ALL',
            kind='point',
            row='Degree_Type',
            row_order=['Graduate', 'Bachelors', 'Associates', 'Certificate'])

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# PairGrid over fatal_collisions and premiums with a scatter plot
# mapped onto every cell of the grid
g = sns.PairGrid(insure, vars=["fatal_collisions", "premiums"])
g2 = g.map(sns.scatterplot)

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Same PairGrid, but with histograms on the diagonal and scatter plots
# on the off-diagonal cells
g = sns.PairGrid(insure, vars=["fatal_collisions", "premiums"])
g2 = g.map_diag(sns.histplot)
g3 = g2.map_offdiag(sns.scatterplot)

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# pairplot shortcut: pairwise scatter plots of fatal_collisions and premiums
sns.pairplot(data=insure,
             vars=["fatal_collisions", "premiums"],
             kind='scatter')

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Same pairwise scatter, colored by Region with the RdBu palette;
# diag_kws makes the diagonal plots semi-transparent
sns.pairplot(data=insure,
             vars=["fatal_collisions", "premiums"],
             kind='scatter',
             hue='Region',
             palette='RdBu',
             diag_kws={'alpha': .5})

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# pairplot with separate column (x) and row (y) variables,
# colored by Region with the husl palette
sns.pairplot(data=insure,
             x_vars=["fatal_collisions_speeding", "fatal_collisions_alc"],
             y_vars=['premiums', 'insurance_losses'],
             kind='scatter',
             hue='Region',
             palette='husl')

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Relationship between insurance_losses and premiums: regression plots
# off the diagonal, KDEs on the diagonal, colored by Region (BrBG palette)
sns.pairplot(data=insure,
             vars=["insurance_losses", "premiums"],
             kind='reg',
             diag_kind='kde',
             hue='Region',
             palette='BrBG')

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Load the bike share CSV and preview the resulting frame
bike = pd.read_csv(filepath_or_buffer='data/bike_share.csv')
bike


In [None]:
# JointGrid of humidity (hum) against total_rentals on a whitegrid style,
# with the x axis limited to 0.1-1.0
sns.set_style("whitegrid")
g = sns.JointGrid(data=bike,
                  x="hum",
                  y="total_rentals",
                  xlim=(0.1, 1.0))

# Regression on the joint axes, histograms on the marginal axes
g.plot(sns.regplot, sns.histplot)

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# jointplot shortcut for the same hum vs. total_rentals regression view
sns.jointplot(data=bike,
              x="hum",
              y="total_rentals",
              kind='reg')

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# temp vs. total_rentals with a 2nd-order polynomial regression,
# x axis limited to 0-1
sns.jointplot(data=bike,
              x="temp",
              y="total_rentals",
              kind='reg',
              order=2,
              xlim=(0, 1))

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Residuals of the 2nd-order temp vs. total_rentals regression
sns.jointplot(data=bike,
              x="temp",
              y="total_rentals",
              kind='resid',
              order=2)

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Scatter jointplot of temp vs. casual riders with 10-bin marginals
g = sns.jointplot(data=bike,
                  x="temp",
                  y="casual",
                  kind='scatter',
                  marginal_kws=dict(bins=10))
# Overlay a KDE on top of the scatter in the joint axes
g.plot_joint(sns.kdeplot)

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()


In [None]:
# Scatter jointplot of temp vs. registered riders with 10-bin marginals
g = sns.jointplot(data=bike,
                  x="temp",
                  y="registered",
                  kind='scatter',
                  marginal_kws=dict(bins=10))
# Overlay a KDE on top of the scatter in the joint axes
g.plot_joint(sns.kdeplot)

plt.show()
# Close the figure rather than plt.clf(): once show() has released it,
# clf() would implicitly create a new blank figure and leave it behind
plt.close()
