In [1]:
import pandas as pd
from sqlalchemy import create_engine

import plotly
import plotly.graph_objs as go

import json

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
database_filepath = 'DisasterResponse.db'

In [3]:
# Create the engine
engine = create_engine(f"sqlite:///{database_filepath}")
# Load the data in a pandas Dataframe
df = pd.read_sql_table("disaster_messages", engine)

In [4]:
def get_graph_json(graph):
    return json.dumps(graph, cls=plotly.utils.PlotlyJSONEncoder)

# Example already in script

In [5]:
genre_counts = df.groupby('genre').count()['message']
genre_names = list(genre_counts.index)

genre_counts_graph = {
    'data': [
        go.Bar(
            x=genre_names,
            y=genre_counts
        )
    ],

    'layout': {
        'title': {
            'text': 'Distribution of Message Genres'
        },
        'yaxis': {
            'title': {
                'text': "Count"
            }
        },
        'xaxis': {
            'title': {
                'text': "Genre"
            }
        }
    }
}

plotly.io.from_json( get_graph_json(genre_counts_graph) )

# Counts per Message Category

In [6]:
msg_categories = df.columns[4:]

msg_category_cols = df.columns[4:]
cat_count = df[msg_category_cols].sum().sort_values(ascending=False).head(10)
msg_category_display = [ cn.replace("_", " ").capitalize() for cn in cat_count.index ]

cat_counts_graph = {
    'data': [
        go.Bar(
            x=msg_category_display,
            y=cat_count
        )
    ],

    'layout': {
        'title': {
            'text': 'Count of Messages by Category'
        },
        'yaxis': {
            'title': {
                'text': "Count"
            }
        },
        'xaxis': {
            'title': {
                'text': "Message Category"
            }
        }
    }
}

plotly.io.from_json( get_graph_json(cat_counts_graph) )

# Counts of translated message - Bar

In [7]:
# df['diff_lang'] = df['message'] != df['original']
df['diff_lang'] = df.apply( lambda row: 'Translated' if row['message'] != row['original'] else "English", axis=1 )
diff_lang_count = df.groupby('diff_lang').count()['message'].reset_index().rename(columns={'message': 'count'})

display(diff_lang_count)

diff_lang_graph = {
    'data': [
        go.Bar(
            x=diff_lang_count['diff_lang'],
            y=diff_lang_count['count']
        )
    ],

    'layout': {
        'title': {
            'text': 'Count of Messages Not in English'
        },
        'yaxis': {
            'title': {
                'text': "Count"
            }
        },
        'xaxis': {
            'title': {
                'text': "Translation"
            }
        }
    }
}

plotly.io.from_json( get_graph_json(diff_lang_graph) )

Unnamed: 0,diff_lang,count
0,English,27
1,Translated,26188


# Count of translated messages - Pie

In [8]:
df['diff_lang'] = df.apply( lambda row: 'Translated' if row['message'] != row['original'] else "English", axis=1 )
diff_lang_count = df.groupby('diff_lang').count()['message'].reset_index().rename(columns={'message': 'count'})

diff_lang_graph = {
    'data': [
        go.Pie(
            labels=diff_lang_count['diff_lang'], values=diff_lang_count['count']
        )
    ],

    'layout': {
        'title': {
            'text': 'Count of Messages Not in English'
        }
    }
}

plotly.io.from_json( get_graph_json(diff_lang_graph) )

# Counts of words per message - Pie chart

In [9]:
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

In [10]:
word_count = df['message'].apply( lambda msg: len(tokenize(msg)) ).rename('word_count').reset_index(drop=True)
word_count_bins = pd.cut(word_count, bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, float('inf')], labels=['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100', '100+'])
count_per_bin = word_count_bins.value_counts().sort_index().reset_index().rename(columns={'word_count': 'bins'})
display( count_per_bin )

word_count_graph = {
    'data': [
        go.Pie(
            labels=count_per_bin['bins'], values=count_per_bin['count'], sort=False
        )
    ],

    'layout': {
        'title': {
            'text': 'Counts of Message Lengths'
        }
    }
}

plotly.io.from_json( get_graph_json(word_count_graph) )

Unnamed: 0,bins,count
0,0-10,2918
1,10-20,7916
2,20-30,7328
3,30-40,4689
4,40-50,2078
5,50-60,700
6,60-70,255
7,70-80,115
8,80-90,56
9,90-100,23


# Counts of words per message - Histogram

In [19]:
import plotly.figure_factory as ff

word_count = df['message'].apply( lambda msg: len(tokenize(msg)) ).rename('word_count').reset_index(drop=True)
word_count_u100 = word_count[word_count < 100]
word_count_o100 = word_count[word_count >= 100]

fig1 = ff.create_distplot([word_count_u100], ['Word Count (under 100 words)'], bin_size=1)
fig2 = ff.create_distplot([word_count_o100], ['Word Count (over 100 words)'], bin_size=1)
fig1.show()
fig2.show()

# Message Categories by Count by Word Count

In [None]:
# fig = px.scatter(df, x="word_count", y="count",
# 	         size="Category", color="continent",
#                  hover_name="country", log_x=True, size_max=60)

# fig = px.scatter(df, x="category", y="count",
# 	         size="word_count", color="continent",
#                  hover_name="country", log_x=True, size_max=60)