## Plan
### Turning tweets into features

- Start with trigrams, can tune later
- Can consider bigrams, bag of words, or other n-grams
- Ignore location information, at least for now
- Almost all tweets have keywords, use as another feature
- Make sure to process "keyword" values, removing special characters

### Criteria for disaster
- Meant to track if tweets are referring to ongoing disasters
- Also includes historical events


### Training
- Train and validate our model on `train.csv` 
- Test by sending results to Kaggle

### Random forest
- Use Gini criterion for efficiency

## Importing and vectorizing data

In [143]:
from sklearn.feature_extraction.text import CountVectorizer
import string
import numpy as np
import pandas as pd
import re

In [144]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

In [145]:
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer()

In [146]:
def standardize_string(s):
    s = s.lower()
    s = re.sub("http://t\.co/\S+", "", s)
    return s

In [147]:
print(standardize_string("http://t.co/"))

http://t.co/


In [148]:
all_characters = set()

for tweet in train_df['text']:
    all_characters = all_characters.union(set(standardize_string(tweet)))

char_list = list(all_characters)
char_list.sort()
print(char_list)

['\n', ' ', '!', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x89', '\x9d', '¡', '¢', '£', '¤', '¨', '©', 'ª', '«', '¬', '´', '¼', 'â', 'ã', 'å', 'ç', 'è', 'ê', 'ì', 'ï', 'ñ', 'ò', 'ó', '÷', 'û', 'ü']


In [151]:
included_chars = list(string.ascii_lowercase + string.digits) + [':', '\'', '#', '@', 'â', 'ã', 'å', 'ç', 'è', 'ê', 'ì', 'ï', 'ñ', 'ò', 'ó', 'û', 'ü', ' ', '/']
print(included_chars)

excluded_chars = list(set(char_list) - set(included_chars))
print(excluded_chars)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', "'", '#', '@', 'â', 'ã', 'å', 'ç', 'è', 'ê', 'ì', 'ï', 'ñ', 'ò', 'ó', 'û', 'ü', ' ', '/']
['´', '+', '?', '}', '¨', '|', '%', '÷', ')', '¢', '$', '©', '>', '(', ';', '«', '{', '£', '!', '¬', ',', '~', '.', '=', '¡', '\\', '\n', '&', '\x89', '¤', '[', ']', '`', '¼', 'ª', '-', '_', '^', '*', '\x9d']


### Stripping characters
- Try both with and without removing special characters
- Consider skipping data points with bad characters

In [134]:
def remove_special_characters(s):
    for c in char_list:
        if c not in included_chars:
            s = s.replace(c, "")
    return s

In [141]:
def tweet_to_array(t):
    t = standardize_string(t)
    tweet_array = t.split()
    return tweet_array

In [169]:
def tokenize_data():
    all_tweets = []
    for tweet in train_df['text']:
        tweet_array = tweet_to_array(tweet)
        invalid_fields = []
        for field in tweet_array:
            for c in excluded_chars:
                if c in field:
                    invalid_fields.append(field)
                    continue
        
        for f in invalid_fields:
            del f
        all_tweets.append(tweet_array)
        
    return all_tweets   

In [170]:
tokenize_data()

[['our',
  'deeds',
  'are',
  'the',
  'reason',
  'of',
  'this',
  '#earthquake',
  'may',
  'allah',
  'forgive',
  'us',
  'all'],
 ['forest', 'fire', 'near', 'la', 'ronge', 'sask.', 'canada'],
 ['all',
  'residents',
  'asked',
  'to',
  "'shelter",
  'in',
  "place'",
  'are',
  'being',
  'notified',
  'by',
  'officers.',
  'no',
  'other',
  'evacuation',
  'or',
  'shelter',
  'in',
  'place',
  'orders',
  'are',
  'expected'],
 ['13,000',
  'people',
  'receive',
  '#wildfires',
  'evacuation',
  'orders',
  'in',
  'california'],
 ['just',
  'got',
  'sent',
  'this',
  'photo',
  'from',
  'ruby',
  '#alaska',
  'as',
  'smoke',
  'from',
  '#wildfires',
  'pours',
  'into',
  'a',
  'school'],
 ['#rockyfire',
  'update',
  '=>',
  'california',
  'hwy.',
  '20',
  'closed',
  'in',
  'both',
  'directions',
  'due',
  'to',
  'lake',
  'county',
  'fire',
  '-',
  '#cafire',
  '#wildfires'],
 ['#flood',
  '#disaster',
  'heavy',
  'rain',
  'causes',
  'flash',
  'flood

In [166]:
print(train_df)

         id keyword location  \
0         1     NaN      NaN   
1         4     NaN      NaN   
2         5     NaN      NaN   
3         6     NaN      NaN   
4         7     NaN      NaN   
...     ...     ...      ...   
7608  10869     NaN      NaN   
7609  10870     NaN      NaN   
7610  10871     NaN      NaN   
7611  10872     NaN      NaN   
7612  10873     NaN      NaN   

                                                   text  target  
0     Our Deeds are the Reason of this #earthquake M...       1  
1                Forest fire near La Ronge Sask. Canada       1  
2     All residents asked to 'shelter in place' are ...       1  
3     13,000 people receive #wildfires evacuation or...       1  
4     Just got sent this photo from Ruby #Alaska as ...       1  
...                                                 ...     ...  
7608  Two giant cranes holding a bridge collapse int...       1  
7609  @aria_ahrary @TheTawniest The out of control w...       1  
7610  M1.94 [01:04 UT