In [None]:
# import libraries
import sys
import pandas as pd
from sqlalchemy import create_engine
import pickle
import cleaner
import augmenter
import spellbuilder
from spellhelper import Spellhelper

In [None]:
def load_data(messages_filepath, categories_filepath):
    '''
    Read the message and category CSV files and join them.

    input: path to the messages CSV, path to the categories CSV
    only the id and message columns of the messages file are kept
    rows are matched on id (default inner join of pandas merge)
    return merged dataframe
    '''
    messages = pd.read_csv(messages_filepath)
    categories = pd.read_csv(categories_filepath)

    # join the trimmed message table with the category table on id
    id_and_text = messages[['id', 'message']]
    return pd.merge(id_and_text, categories, left_on='id', right_on='id')

In [None]:
# load the raw message and category files into one merged dataframe
print('loading...')
messages_filepath = 'messages.csv'
categories_filepath = 'categories.csv'
df = load_data(
    messages_filepath=messages_filepath,
    categories_filepath=categories_filepath,
)

In [None]:
def format_data(df):
    '''
    Expand the packed categories column into one integer column per
    category and clean the result.

    input: merged dataframe with id, message and categories columns, where
        categories holds ';'-separated entries such as 'related-1;request-0'
    steps:
        determine category names from the first row (entry minus its
            last two characters)
        build one integer column per category (last character of each entry)
        when an id occurs more than once, the values of its last occurrence
            are used for every row with that id
        drop the categories column
        remove duplicate rows
        drop rows containing NaN
        drop rows whose related value is 2
    return formatted dataframe (the input dataframe is not modified)
    raises ValueError if df has no rows
    '''
    if len(df) == 0:
        raise ValueError(
            'format_data needs at least one row to determine the category names'
        )

    # category column names; positional access so a non-default index on df
    # does not matter
    first_row = df['categories'].iloc[0]
    category_colnames = [s[:-2] for s in first_row.split(';')]

    # split the whole column at once instead of inserting one column per row
    flags = df['categories'].str.split(';', expand=True)
    flags = flags.apply(lambda col: col.str[-1].astype(int))
    flags.columns = category_colnames
    flags.insert(0, 'id', df['id'].to_numpy())

    # one flag row per id, last occurrence wins
    flags = flags.drop_duplicates(subset='id', keep='last')

    df = df.merge(flags, on='id').drop(columns=['categories'])

    # remove duplicates, then rows with NaN's for emergent cases
    df = df.drop_duplicates().dropna()

    # drop related category 2's
    df = df.drop(index=df.index[df['related'] == 2])

    return df

In [None]:
# format
# format_data drops the 'categories' column, so calling it on its own output
# raises; the guard keeps this cell safe to re-run without reloading the data
if 'categories' in df.columns:
    print('formatting...')
    df = format_data(df)
else:
    print('already formatted, skipping...')

In [None]:
# spell utility build
# Passes the formatted dataframe to spellbuilder.BuildFiles and binds the two
# values it returns to fd_file and lu_file. Further down fd_file is handed to
# Spellhelper and lu_file is opened and unpickled.
# NOTE(review): BuildFiles is defined outside this notebook — presumably it
# writes a frequency dictionary and a lookup dictionary to disk; confirm.
# NOTE(review): the following cell reassigns both names to fixed file names,
# so these return values are only used if that cell is skipped.
print('spell_corrections...')
fd_file, lu_file = spellbuilder.BuildFiles(df)

In [None]:
# fixed names of the spell-correction files
# NOTE(review): this rebinds the fd_file / lu_file values returned by
# spellbuilder.BuildFiles in the previous cell — presumably so the build cell
# can be skipped when the files already exist; confirm the names match what
# BuildFiles writes.
fd_file, lu_file = 'freq_dict.txt', 'lookup_dict.pkl'

In [None]:
# clean and simulate
print('cleaning and simulating...')
# configure the cleaner module reachable through augmenter: its speller is a
# Spellhelper built from fd_file, its corr_dict is the object unpickled from
# lu_file
augmenter.cleaner.speller = Spellhelper(fd_file)
# SECURITY: pickle.load can execute arbitrary code — only load a lookup file
# produced by this pipeline, never one from an untrusted source
with open(lu_file, 'rb') as handle:
    augmenter.cleaner.corr_dict = pickle.load(handle)
# simulate returns four objects; dfc, dfv and dfa are used in the final cell,
# dft is not referenced again in the cells shown here
# NOTE(review): presumably cleaned / validation / test / augmented frames —
# confirm against augmenter.simulate
dfc, dfv, dft, dfa = augmenter.simulate(df)

In [None]:
def Truncate(text, length=501):
    '''
    Normalise whitespace in a string and cut it back to whole words.

    input: string, trim length (default 501)
    trailing whitespace is removed first
    shorter than length: whitespace runs are collapsed to single spaces
    otherwise: the first length + 1 characters are split into words, the
        last word (possibly cut mid-word) is always discarded and the rest
        is re-joined with single spaces
    note: an over-long string with no whitespace in that window yields ''
    return truncated cleaned string
    '''
    body = text.rstrip()

    if len(body) >= length:
        # look one character past the limit, then drop the final word
        head = body[:length + 1]
        return ' '.join(head.split()[:-1])

    return ' '.join(body.split())

In [None]:
def save_data(df, database_filename):
    '''
    Persist the cleaned dataframe to a SQLite database.

    input: cleaned dataframe, SQLite database file name
    writes df to table MessCatRaw, replacing an existing table of that name
    the dataframe index is not written
    return none
    '''
    engine = create_engine('sqlite:///' + database_filename)
    df.to_sql(name='MessCatRaw', con=engine, if_exists='replace', index=False)

In [None]:
print('finishing...')
# join dfc and dfa row-wise; concat keeps each frame's own index labels
df_all = pd.concat([dfc, dfa], axis=0)
# add validation and simulation flags: a row is flagged when its index label
# appears in dfv / dfa respectively
df_all['val'] = df_all.index.isin(dfv.index)
df_all['sim'] = df_all.index.isin(dfa.index)
# trim
df_all['message'] = df_all['message'].apply(Truncate)
# discard messages of 27 characters or fewer; a boolean mask is used instead
# of dropping by index label, so a row that merely shares a label with a
# short message (possible after concat) is not removed along with it
too_short = df_all['message'].str.len() <= 27
df_all = df_all[~too_short]
# save
database_filepath = 'DisasterResponse.db'
save_data(df_all, database_filepath)
print('cleaned data saved to database!')

In [None]:
import process_data

In [None]:
# Runs process_data with the same input CSV names and output database name
# used in the cells above.
# NOTE(review): process_data is not shown here — presumably the script
# version of this notebook; confirm.
%run process_data messages.csv categories.csv DisasterResponse.db