In [None]:
import sys
import pandas as pd
from sqlalchemy import create_engine

def load_data(messages_filepath, categories_filepath):
    """
    Load messages and categories data from filepaths and merge them into a single dataframe.
    
    Args:
        messages_filepath (str): Filepath of the messages data.
        categories_filepath (str): Filepath of the categories data.
    
    Returns:
        df (pandas.DataFrame): Merged dataframe of messages and categories.
    """
    messages = pd.read_csv(messages_filepath)
    categories = pd.read_csv(categories_filepath)
    df = pd.merge(messages, categories, on='id')
    return df

def clean_data(df):
    """
    Clean the merged dataframe by splitting the categories column into separate, clearly named columns,
    converting values to binary, and dropping duplicates.

    Each entry of ``df['categories']`` is a ';'-separated list of ``<name>-<number>``
    labels. Column names are taken from the first row's labels. Every value is
    reduced to 0 or 1: any positive number (for example a stray 2) becomes 1.

    Args:
        df (pandas.DataFrame): Merged dataframe of messages and categories.
            Must contain a 'categories' column.

    Returns:
        df (pandas.DataFrame): Cleaned dataframe with 'categories' replaced by
        one 0/1 integer column per category and duplicate rows removed.
    """
    # One column per category label.
    categories = df['categories'].str.split(';', expand=True)

    # Name the columns after the first row's labels, dropping the "-<number>" suffix.
    # Splitting on the last '-' avoids assuming the suffix is exactly two characters.
    categories.columns = [label.rsplit('-', 1)[0] for label in categories.iloc[0]]

    for column in categories:
        # Take the number after the last '-' and collapse it to a 0/1 flag so
        # that non-binary values such as 2 do not leak into the result.
        values = categories[column].str.rsplit('-', n=1).str[-1].astype(int)
        categories[column] = (values > 0).astype(int)

    df = df.drop('categories', axis=1)
    df = pd.concat([df, categories], axis=1)
    df = df.drop_duplicates()

    return df



In [2]:
import numpy as np
import pandas as pd


In [11]:
# Load both raw CSV files and join them on the shared 'id' column
messages = pd.read_csv("disaster_messages.csv")
categories = pd.read_csv("disaster_categories.csv")
df = messages.merge(categories, on='id')
print(df.columns)

Index(['id', 'message', 'original', 'genre', 'categories'], dtype='object')


In [8]:
# One column per category: split the ';'-delimited label string
categories = df['categories'].str.split(';', expand=True)
row = categories.iloc[0]
# Column names are the first row's labels minus their last two characters
category_colnames = row.map(lambda label: label[:-2])
categories.columns = category_colnames

# Keep only the trailing character of each entry, converted to an integer
for column in categories:
    categories[column] = categories[column].str[-1].astype(int)

# Change the value of 2 to 1 in the "related" column
categories['related'] = categories['related'].replace({2: 1})

# Swap the raw 'categories' column for the expanded ones, then de-duplicate
df = pd.concat([df.drop(columns='categories'), categories], axis=1).drop_duplicates()

print(df.columns)

Index(['id', 'message', 'original', 'genre', 'related', 'request', 'offer',
       'aid_related', 'medical_help', 'medical_products', 'search_and_rescue',
       'security', 'military', 'child_alone', 'water', 'food', 'shelter',
       'clothing', 'money', 'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')


In [9]:
# Sanity check: list the distinct values present in 'related' after cleaning
df['related'].unique()

array([1, 0])

In [12]:
# method 2
# Reload and merge the raw CSVs first. The earlier cleaning cell has already
# dropped df['categories'] and rebound `categories`, so without this step the
# cell raises KeyError on Restart Kernel -> Run All.
messages = pd.read_csv("disaster_messages.csv")
categories = pd.read_csv("disaster_categories.csv")
df = pd.merge(messages, categories, on='id')

# Expand the ';'-delimited labels into one column per category, named after
# the first row's labels with their last two characters removed.
categories = df['categories'].str.split(';', expand=True)
row = categories.iloc[0]
category_colnames = row.apply(lambda x: x[:-2])
categories.columns = category_colnames

for column in categories:
    # Convert non-binary values to binary (1): any positive trailing digit
    # becomes 1, so no per-column patch (e.g. for 'related') is needed.
    categories[column] = categories[column].apply(lambda x: 1 if int(x[-1]) > 0 else 0)

df = df.drop('categories', axis=1)
df = pd.concat([df, categories], axis=1)
df = df.drop_duplicates()

print(df.columns)



Index(['id', 'message', 'original', 'genre', 'related', 'request', 'offer',
       'aid_related', 'medical_help', 'medical_products', 'search_and_rescue',
       'security', 'military', 'child_alone', 'water', 'food', 'shelter',
       'clothing', 'money', 'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')


In [13]:
# Sanity check: list the distinct values present in 'related' after method 2
df['related'].unique()

array([1, 0], dtype=int64)