In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from math import ceil


Bad key "text.kerning_factor" on line 4 in
c:\users\lauri_almadeartista\miniconda3\envs\cld-tp2\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


In [2]:
def plotRegRes(df, var, target, ordr=1):
    '''
    Draw a regression plot and a residual plot of var vs. target side by side.

    df     : dataframe handed to seaborn as `data`
    var    : (str) name of the feature placed on the x axis
    target : (str) name of the target placed on the y axis
    ordr   : order of the regression, forwarded to seaborn as `order` (default 1)

    Returns the figure holding both panels: regression on the first axes,
    residuals on the second.
    '''
    suffix = ' of ' + var + ' vs. ' + target
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    # One (plotting function, title) pair per panel, in left-to-right order.
    panels = (
        (sns.regplot, 'Regression Plot' + suffix),
        (sns.residplot, 'Residual Plot' + suffix),
    )
    for (draw, panel_title), panel_ax in zip(panels, axes):
        draw(data=df, x=var, y=target, ax=panel_ax, order=ordr)
        panel_ax.set_title(panel_title)
    return fig


def plotCatBox(df, var, target):
    '''
    Boxplots of target for every category of var, ordered by ascending median.

    df     : dataframe containing both columns
    var    : (str) name of the categorical feature (x axis)
    target : (str) name of the target (y axis)

    A new figure is opened whose width is 2 units per category and whose
    height is 5. Returns the axes produced by seaborn's boxplot.
    '''
    medians = df.groupby([var])[target].median()
    category_order = list(medians.sort_values().index)
    plt.figure(figsize=(2 * len(category_order), 5))
    ax = sns.boxplot(data=df, x=var, y=target, order=category_order)
    ax.set_title(' '.join(('Distribution of', target, 'by category of', var)))
    ax.tick_params('x', labelrotation=90)
    return ax

In [3]:
def plotNumBoxList(df, numFt, ncols=5):
    '''
    Given a dataframe df and a list of numerical features,
    returns a figure composed of one boxplot per feature laid out on a grid.

    df    : dataframe containing every column named in numFt
    numFt : iterable of feature names; must not be empty
    ncols : number of grid columns (default 5); the number of rows is
            derived from it

    Grid cells that are left over when len(numFt) is not a multiple of
    ncols are hidden instead of being rendered as empty frames.

    Raises ValueError if numFt is empty or ncols is smaller than 1.
    '''
    features = list(numFt)
    if not features:
        raise ValueError('numFt must contain at least one feature name')
    if ncols < 1:
        raise ValueError('ncols must be at least 1')

    rows = ceil(len(features) / ncols)
    # squeeze=False always yields a 2-D array of axes, even for a 1x1 grid,
    # so flatten() below is valid for every grid shape.
    fig, axs = plt.subplots(rows, ncols, figsize=(18, 5 * rows), squeeze=False)
    flat_axes = axs.flatten()

    for ft, ax in zip(features, flat_axes):
        df[ft].plot(kind='box', ax=ax)
        ax.set_title(ft)

    # Hide the grid cells that did not receive a plot.
    for ax in flat_axes[len(features):]:
        ax.set_visible(False)
    return fig

In [4]:
def plotBarCatList(df, catFt, ncols=4, inside_min=200):
    '''
    Given a dataframe df and a list of categorical features,
    returns a figure composed of one horizontal count bar chart per feature,
    each bar annotated with its share of the rows of df.

    df         : dataframe containing every column named in catFt
    catFt      : iterable of categorical feature names; must not be empty
    ncols      : number of grid columns (default 4)
    inside_min : bars whose count is greater than this value get their label
                 drawn inside the bar in white; other bars get it outside in
                 black (default 200)

    The percentage shown is count / df.shape[0], i.e. relative to all rows
    of df. Grid cells left over when len(catFt) is not a multiple of ncols
    are hidden.

    Raises ValueError if catFt is empty or ncols is smaller than 1.
    '''
    features = list(catFt)
    if not features:
        raise ValueError('catFt must contain at least one feature name')
    if ncols < 1:
        raise ValueError('ncols must be at least 1')

    rows = ceil(len(features) / ncols)
    # squeeze=False keeps the axes in a 2-D array for every grid shape.
    fig_cat, axs_cat = plt.subplots(rows, ncols, figsize=(20, 5 * rows), squeeze=False)
    flat_axes = axs_cat.flatten()
    total_rows = df.shape[0]

    for ft, ax in zip(features, flat_axes):
        # Count once so bars and labels are guaranteed to share one ordering.
        counts = df[ft].value_counts(ascending=True)
        pct_labels = (100 * counts / total_rows).round(2).astype(str) + '%'
        bars = counts.plot(kind='barh', ax=ax)
        for rect, lab in zip(bars.patches, pct_labels):
            width = int(rect.get_width())
            if width > inside_min:
                # Bar is long enough: right-align the label inside it.
                xloc, clr, align = -5, 'white', 'right'
            else:
                # Short bar: put the label just past its end.
                xloc, clr, align = 5, 'black', 'left'
            yloc = rect.get_y() + rect.get_height() / 2
            ax.annotate(lab, xy=(width, yloc), xytext=(xloc, 0),
                        textcoords="offset points",
                        ha=align, va='center',
                        color=clr, weight='bold', clip_on=True)
        ax.set_title(ft)
        ax.tick_params('x', labelrotation=90)

    # Hide the grid cells that did not receive a plot.
    for ax in flat_axes[len(features):]:
        ax.set_visible(False)
    return fig_cat

In [5]:
def pltHist(df, var, bins, ylabel, xlabel):
    '''
    Given a dataframe df, a numerical feature var, the bins specification and
    the desired x and y labels, plots a histogram of that var and returns
    the axes it was drawn on.

    df     : dataframe containing the column var
    var    : name of the numerical column to plot
    bins   : forwarded to numpy.histogram_bin_edges as `bins`
    ylabel : label for the y axis
    xlabel : label for the x axis

    Missing values in the column are dropped before the bin edges are
    computed. The same edges are used both for the bars and for the x ticks,
    so every tick sits on a bin boundary.
    '''
    # np.histogram / histogram_bin_edges cannot derive a range from data that
    # contains NaN, so remove missing values up front.
    values = df[var].dropna()
    bin_edges = np.histogram_bin_edges(values, bins=bins)
    ax = values.plot(kind='hist', bins=bin_edges, xticks=bin_edges,
                     rot=90, figsize=(15, 5))
    ax.set_title('Histogram of %s' % (var,))
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    return ax