# Twitter Disaster Tweets

#### Includes an F1 model evaluation

#### 1. Import libraries

In [1]:
# getting started

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import time

In [3]:
from sklearn.preprocessing import OneHotEncoder

#### 2. Import data into Pandas data frames

In [4]:
# Load the competition CSV files into pandas DataFrames.
train = pd.read_csv(filepath_or_buffer='train.csv')  # labelled tweets (includes the `target` column)
test = pd.read_csv(filepath_or_buffer='test.csv')  # tweets without a `target` column

#### 3. Look at the Data

In [5]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
# Replace every NaN in the training frame with an empty string
# (the values are substituted, the rows are kept).

train = train.fillna(value='')

In [7]:
# Preview the training frame again, now that NaNs have been filled.
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
# Summary statistics for the numeric columns of the training frame.
train.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


From the mean we can see that 43% of the tweets describe real disasters.

In [9]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [10]:
# Summary statistics for the numeric columns of the test frame.
test.describe()

Unnamed: 0,id
count,3263.0
mean,5427.152927
std,3146.427221
min,0.0
25%,2683.0
50%,5500.0
75%,8176.0
max,10875.0


##### Try a model using just text...

#### 3. Set up X and Y arrays for Test and Train

In [11]:
# Use only the tweet text as the model input.
X_train = train.loc[:, 'text']

In [12]:
# Preview the selected text column.
X_train.head()

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object

In [13]:
# Number of tweets in the text column.
X_train.shape

(7613,)

In [14]:
#Vectorize the train comments sample and convert to document term matrix
#vect = TfidfVectorizer()
#X_train_dtm = vect.fit_transform(X_train)

In [15]:
# Token-count vectorizer created with the library's default settings.
vectorizer = CountVectorizer()

In [16]:
# Learn the vocabulary from the tweet text. This runs on the full training
# column, before the train/validation split made later in the notebook, so
# tweets that end up in the validation split also contribute to the vocabulary.
vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [17]:
# Encode every tweet as a row of token counts over the fitted vocabulary.
X_train_vec = vectorizer.transform(X_train)

In [18]:
# (number of tweets, vocabulary size) of the document-term matrix.
X_train_vec.shape

(7613, 21637)

In [19]:
# Show the matrix summary (storage format and number of stored entries).
X_train_vec

<7613x21637 sparse matrix of type '<class 'numpy.int64'>'
	with 111497 stored elements in Compressed Sparse Row format>

#### 4. Import, Initialise and Fit Model

In [20]:
# Labels: the `target` column of the training frame.
target_y = train.loc[:, 'target']

In [21]:
# Number of labels; this should match the number of tweets.
target_y.shape

(7613,)

In [22]:
# Logistic-regression classifier for the binary `target` label.
# The solver is pinned explicitly: the recorded run relied on the library
# default (shown as solver='warn' in the stored repr, i.e. liblinear plus a
# FutureWarning), and that default differs in later scikit-learn releases.
# Pinning it keeps the results comparable across versions.
model = LogisticRegression(solver='liblinear')

##### Split into train and validation set to fit model and evaluate performance

In [23]:
# Start the wall-clock timer that is read when the total time is printed
# at the end of the notebook.
start_time = time.time()

# Hold out 33% of the vectorized tweets for validation (fixed seed).
# NOTE: from here on X_train is the vectorized training split, no longer the
# raw text column it referred to earlier in the notebook.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_vec,
    target_y,
    test_size=0.33,
    random_state=2018,
)

# Accumulators for the F1 scores on each split.
train_f1, valid_f1 = [], []

In [24]:
#preds_train = np.zeros((X_train.shape[0], 1))
#preds_valid = np.zeros((X_valid.shape[0], 1))

In [25]:
# Fit on the training split only; the validation split is kept for scoring.
model.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
# model.predict returns hard class labels (not per-class probabilities),
# so the results can be passed straight to f1_score.
preds_train = model.predict(X=X_train)
preds_valid = model.predict(X=X_valid)

In [27]:
#no rounding needed: model.predict already returns class labels (unlike model.predict_proba)

#preds_train_int = np.rint(preds_train)
#preds_valid_int = np.rint(preds_valid)

In [28]:
# F1 score of the predicted labels against the true labels, per split.
train_f1_class = f1_score(y_true=y_train, y_pred=preds_train)
valid_f1_class = f1_score(y_true=y_valid, y_pred=preds_valid)

In [29]:
# Inspect the true labels of the training split.
y_train

3891    1
3989    1
5025    0
3122    0
810     0
       ..
917     0
2825    0
6370    1
2662    1
1274    1
Name: target, Length: 5100, dtype: int64

In [31]:
# Inspect the predicted labels for the training split.
preds_train

array([0, 1, 0, ..., 1, 1, 1], dtype=int64)

In [34]:
# Record this run's F1 scores and report the mean of each list.
# The printed labels now name the metric actually computed (F1); they
# previously said "log loss", which is never calculated in this notebook.
train_f1.append(train_f1_class)
valid_f1.append(valid_f1_class)
print('mean F1 score: Train dataset', np.mean(train_f1))
print('mean F1 score: Validation dataset', np.mean(valid_f1))

mean column-wise log loss:Train dataset 0.9708920187793427
mean column-wise log loss:Validation dataset 0.740521910388971


In [35]:
# Report the F1 score of each split for the single binary class.
print('Class:= Real or Not')
for label, score in (('Train f1:', train_f1_class), ('Valid f1:', valid_f1_class)):
    print(label, score)

Class:= Real or Not
Train f1: 0.9708920187793427
Valid f1: 0.740521910388971


In [36]:
# Wall-clock time since start_time was set in the split cell. The interval
# spans several cells, so it also includes any time spent between running them.
end_time = time.time()
elapsed = end_time - start_time
print("total time for model", elapsed)

total time for model 197.0577256679535
