# Disaster Tweets Classifier

In [None]:
# Install every dependency listed in requirements.txt.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -r requirements.txt

# Data Exploration & Pre-processing

## Data exploration

In [None]:
import pandas as pd
import numpy as np

# Load the training tweets; the CSV's 'id' column becomes the row index.
df = pd.read_csv(
    'data/train.csv',
    index_col='id',
)

# Preview the first rows of the frame.
df.head()

In [None]:
# Dimensions of the training frame as (rows, columns).
# Recorded by the original author: 7613 rows, with 4 columns.
df.shape

In [None]:
# Class balance of the labels: target 1 marks a disaster tweet,
# target 0 marks a tweet that is not about a disaster.
target_counts = df['target'].value_counts()
target_counts

In [None]:
# Completeness check: report how many rows are missing a value in each column.
# Each entry pairs the column to inspect with the wording used in the message,
# so every message is guaranteed to describe the column that was actually counted.
columns_to_check = [
    ('keyword', 'keywords'),
    ('location', 'location'),
    ('text', 'text'),
    ('target', 'target'),
]
for column, label in columns_to_check:
    missing = df[column].isna().sum()
    print(f"{missing} rows have no {label}")

In [None]:
# Frequency of each keyword.
# Some keywords are multi-word phrases in which '%20' stands for a space.
keyword_counts = df['keyword'].value_counts()
keyword_counts

In [None]:
# Frequency of each location string.
# The field is free text, so it contains entries that are not real places,
# such as 'World Wide!!' and 'a feminist, modernist hag.'
location_counts = df['location'].value_counts()
location_counts

## Preprocessing

In [None]:
# Download spaCy's English pipeline package `en_core_web_sm`, which the next
# cell imports and loads.
# NOTE(review): `python3` is resolved by the shell and may not be the interpreter
# backing this kernel — confirm the model is installed into the kernel's environment.
!python3 -m spacy download en_core_web_sm

In [None]:
import spacy 
import en_core_web_sm
# Load the downloaded pipeline; `nlp` is called on tweet text below to produce
# a spaCy document whose tokens expose `lemma_` and `is_stop`.
nlp = en_core_web_sm.load()

## Exploration: preprocessing a single tweet

In [None]:
# Walk through the preprocessing steps on a single tweet (the row whose id is 1).
s = df.loc[1, 'text']
print(f"Original tweet: {s}")

# Normalise case before tokenising.
s = s.lower()

# Tokenise the tweet with spaCy.
doc = nlp(s)

# Show each token together with its lemma and whether it is a stop word.
print("Token\t\tLemma\t\tStopword")
print("=" * 40)
for token in doc:
    print(f"{token}\t\t{token.lemma_}\t\t{token.is_stop}")

# Bag of words: token text -> number of occurrences, built in a single pass so
# the keys appear in order of first occurrence (deterministic output).
freq = dict()
for token in doc:
    freq[token.text] = freq.get(token.text, 0) + 1

# Vocabulary of this tweet as plain strings. Token *objects* must not be put in
# the set: two occurrences of the same word are distinct tokens, so a set of
# tokens never merges repeated words.
features = set(freq)

print(f"Bag of words for the tweet: {freq}")

## Preprocessing all data

In [None]:
# Work on an independent copy so that preprocessing never alters the raw `df`
# (plain assignment would only create a second name for the same DataFrame).
preprocess_df = df.copy()

# Vocabulary accumulator: will hold every distinct word seen across the tweets.
features = set()

In [None]:
# Create the (initially column-less) DataFrame that will hold the bag-of-words
# representation, with one row per tweet and 'id' as its index.
# `set_index` returns a new frame, so its result has to be assigned back.
# NOTE(review): these ids are positional (0 .. n-1), whereas `df` is indexed by
# the CSV's 'id' column — confirm which one the bag of words should align with.
bow = pd.DataFrame({'id': range(len(preprocess_df))}).set_index('id')
bow

In [None]:
# Lower-case every tweet. The vectorised `.str` accessor leaves missing values
# untouched instead of raising on them.
preprocess_df['text'] = preprocess_df['text'].str.lower()

# Tokenise the tweets and accumulate the vocabulary.
# NOTE: only the first tweet is processed for now; widen the range once the
# bag-of-words construction below is implemented.
for i in range(0, 1):
    doc = nlp(preprocess_df.iloc[i]['text'])
    # TO DO: build bag of words
    # Add the token *strings* to the vocabulary. Token objects are distinct per
    # position and per document, so a set of them would never deduplicate words.
    features |= {token.text for token in doc}

## Saving pre-processed data for collaborators

## Splitting into training and validation data

## Saving .csv files for training and validation sets