In [14]:
import pandas as pd
import sklearn
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# Load the unlabeled test tweets, append an empty `emergency` column that is
# filled in with the model's predictions later, and discard the `id` column.
data = (
    pd.read_csv('twitter_test.csv')
    .assign(emergency=np.nan)
    .drop(columns=['id'])
)
data.head()

Unnamed: 0,keyword,location,text,emergency
0,,,Just happened a terrible car crash,
1,,,"Heard about #earthquake is different cities, s...",
2,,,"there is a forest fire at spot pond, geese are...",
3,,,Apocalypse lighting. #Spokane #wildfires,
4,,,Typhoon Soudelor kills 28 in China and Taiwan,


## Clean test set

In [16]:
import string
import re
import nltk
from nltk.corpus import stopwords
# English stop words used by remove_stopwords() to filter tokens.
# NOTE(review): this rebinds `stopwords`, shadowing the `nltk.corpus.stopwords`
# module imported on the previous line; from here on the name refers to the
# word collection returned by `.words('english')`, not to the corpus reader.
stopwords = nltk.corpus.stopwords.words('english')

In [17]:
## functions for cleaning tasks

def remove_punct(text):
    """Return `text` with every character listed in `string.punctuation` removed.

    All other characters (letters, digits, whitespace, non-ASCII symbols) are
    kept in their original order.
    """
    # A translation table whose third argument lists the characters to delete.
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

# creates a list of words
def tokenize(text):
    """Split `text` into a list of word tokens.

    The text is split on runs of non-word characters (anything other than
    letters, digits and underscore). Empty strings, which `re.split` emits
    when the text starts or ends with a separator, are dropped so that the
    result contains real tokens only.

    Returns an empty list for empty or separator-only input.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    tokens = [token for token in re.split(r'\W+', text) if token]
    return tokens

# remove common words with no meaning e.g. connectors
def remove_stopwords(token_list):
    """Return the tokens of `token_list` that are not in the module-level `stopwords`.

    The order of the remaining tokens is preserved and the input is not modified.
    """
    kept = []
    for token in token_list:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

# Single WordNet lemmatizer instance shared by every lemmatize() call.
wn = nltk.WordNetLemmatizer()

# convert words into their root forms
def lemmatize(text):
    """Lemmatize each token in `text` (an iterable of tokens) and return a new list.

    Every token is passed to `wn.lemmatize` with its default arguments; the
    output has one entry per input token, in the same order.
    """
    # NOTE(review): with default arguments NLTK presumably treats each word as
    # a noun, so verb forms such as "happened" are left unchanged — confirm.
    return list(map(wn.lemmatize, text))

In [18]:
# Cleaning pipeline; each stage reads the column written by the previous one:
#   text -> clean_text (punctuation stripped)
#        -> tokenized (lower-cased, split into tokens)
#        -> no_stopwords (stop words filtered out)
#        -> lemmatized (tokens reduced to their lemma)
data['clean_text'] = data['text'].apply(remove_punct)
data['tokenized'] = data['clean_text'].str.lower().apply(tokenize)
data['no_stopwords'] = data['tokenized'].apply(remove_stopwords)
data['lemmatized'] = data['no_stopwords'].apply(lemmatize)

data.head()

Unnamed: 0,keyword,location,text,emergency,clean_text,tokenized,no_stopwords,lemmatized
0,,,Just happened a terrible car crash,,Just happened a terrible car crash,"[just, happened, a, terrible, car, crash]","[happened, terrible, car, crash]","[happened, terrible, car, crash]"
1,,,"Heard about #earthquake is different cities, s...",,Heard about earthquake is different cities sta...,"[heard, about, earthquake, is, different, citi...","[heard, earthquake, different, cities, stay, s...","[heard, earthquake, different, city, stay, saf..."
2,,,"there is a forest fire at spot pond, geese are...",,there is a forest fire at spot pond geese are ...,"[there, is, a, forest, fire, at, spot, pond, g...","[forest, fire, spot, pond, geese, fleeing, acr...","[forest, fire, spot, pond, goose, fleeing, acr..."
3,,,Apocalypse lighting. #Spokane #wildfires,,Apocalypse lighting Spokane wildfires,"[apocalypse, lighting, spokane, wildfires]","[apocalypse, lighting, spokane, wildfires]","[apocalypse, lighting, spokane, wildfire]"
4,,,Typhoon Soudelor kills 28 in China and Taiwan,,Typhoon Soudelor kills 28 in China and Taiwan,"[typhoon, soudelor, kills, 28, in, china, and,...","[typhoon, soudelor, kills, 28, china, taiwan]","[typhoon, soudelor, kill, 28, china, taiwan]"


## Vectorization

In [19]:
# Load the previously cleaned, labeled training set; its 'lemmatized' column
# is what the TF-IDF vectorizer is fitted on further down.
# NOTE(review): after the CSV round trip the list-valued columns appear to come
# back as strings (e.g. "['deed', 'reason', ...]") rather than real lists — confirm.
train_data = pd.read_csv('clean.csv')
train_data.head()

Unnamed: 0.1,Unnamed: 0,keyword,location,text,emergency,clean_text,tokenized,no_stopwords,lemmatized,body_len,sentiment
0,0,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,"['our', 'deeds', 'are', 'the', 'reason', 'of',...","['deeds', 'reason', 'earthquake', 'may', 'alla...","['deed', 'reason', 'earthquake', 'may', 'allah...",57,0.2732
1,1,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,"['forest', 'fire', 'near', 'la', 'ronge', 'sas...","['forest', 'fire', 'near', 'la', 'ronge', 'sas...","['forest', 'fire', 'near', 'la', 'ronge', 'sas...",32,-0.34
2,2,,,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,"['all', 'residents', 'asked', 'to', 'shelter',...","['residents', 'asked', 'shelter', 'place', 'no...","['resident', 'asked', 'shelter', 'place', 'not...",112,-0.296
3,3,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,"['13000', 'people', 'receive', 'wildfires', 'e...","['13000', 'people', 'receive', 'wildfires', 'e...","['13000', 'people', 'receive', 'wildfire', 'ev...",57,0.0
4,4,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,"['just', 'got', 'sent', 'this', 'photo', 'from...","['got', 'sent', 'photo', 'ruby', 'alaska', 'sm...","['got', 'sent', 'photo', 'ruby', 'alaska', 'sm...",72,0.0


In [20]:
# TF-IDF: learn the vocabulary and IDF weights from the training corpus.
tfidf = TfidfVectorizer()
tfidf.fit(train_data['lemmatized'])
# fit() returns the vectorizer it was called on, so X_tfidf is the fitted
# vectorizer itself at this point, not a feature matrix.
X_tfidf = tfidf

In [22]:
# Build the TF-IDF matrix for the test tweets with the vectorizer fitted on the
# training corpus.
#
# * Call transform() on `tfidf` (the fitted vectorizer) rather than on
#   `X_tfidf`: rebinding X_tfidf from a method on itself made this cell fail
#   when run a second time, because by then X_tfidf was already the matrix.
# * Feed the cleaned, lemmatized tokens instead of the raw tweet text, so the
#   test features go through the same preprocessing as the training corpus the
#   vocabulary was learned from. The vectorizer expects one string per
#   document, so each token list is joined with spaces first.
X_tfidf = tfidf.transform(data['lemmatized'].apply(' '.join))

## Load best model

In [None]:
import pickle

# Restore the trained classifier from disk.
# NOTE(security): unpickling executes arbitrary code embedded in the file;
# only load RF_Model.pkl when it comes from a trusted source.
with open('RF_Model.pkl', mode='rb') as model_file:
    model = pickle.load(model_file)

model

In [None]:
# `X_tfidf_feat` is not assigned in any visible cell of this notebook, so on a
# fresh kernel this cell raised NameError. Bind it to the TF-IDF matrix of the
# test tweets.
# NOTE(review): the name suggests the model may have been trained on TF-IDF
# plus extra columns (e.g. body_len / sentiment) — confirm against the
# training notebook and extend the matrix here if so.
X_tfidf_feat = X_tfidf

# Fail early with a readable message if the matrix width does not match what
# the fitted model reports it was trained on (attribute may be absent).
expected_n_features = getattr(model, 'n_features_in_', None)
if expected_n_features is not None and X_tfidf_feat.shape[1] != expected_n_features:
    raise ValueError(
        f"model expects {expected_n_features} features but the test matrix has "
        f"{X_tfidf_feat.shape[1]} columns; rebuild it the same way as in training"
    )

y_pred = model.predict(X_tfidf_feat)
print(y_pred)

In [None]:
# Replace the placeholder `emergency` column with the predicted labels
# (the column keeps its position in the frame).
data = data.assign(emergency=y_pred)
data.head()

In [None]:
# Persist the predictions. index=False: the frame already has an 'Unnamed: 0'
# column left over from an earlier index export; writing the index again would
# add one more unnamed column every time the file is reloaded.
data.to_csv('emergency_predictions.csv', index=False)