In [54]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import string 

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline


%matplotlib inline
pd.options.display.max_columns = None

In [25]:
test_raw = pd.read_csv("../data/disaster-tweets/test.csv")
train_raw = pd.read_csv("../data/disaster-tweets/train.csv")

## Data Exploration

In [26]:
train_raw.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [27]:
# We have almost 1000 more 
pd.value_counts(train_raw['target'])

0    4342
1    3271
Name: target, dtype: int64

In [28]:
# From the comments I found that tweets are not unique
# And that sometimes they are mislabeled
print("Total Train Tweets:", len(train_raw['text']))
print("Total Unique Tweets:", len(train_raw['text'].unique()))

duped_tweets = train_raw[train_raw.duplicated(subset=['text'],keep=False)].sort_values(by=['text'])
duped_tweets.head()

train_raw = train_raw[~train_raw.duplicated(subset=['text','target'],keep=False)]
print("Total sans dupes: ", len(train_raw))

Total Train Tweets: 7613
Total Unique Tweets: 7503
Total sans dupes:  7456


## Feature Engineering

In [29]:
test_df = test_raw.copy()
train_df = train_raw.copy()

In [30]:
# url_count number of urls in text
# mean_word_length average character count in words
# char_count number of characters in text
# punctuation_count number of punctuations in text

In [31]:
# word count
def count_words(text):
    return len(text.split())

test_df['word_count'] = test_df.apply(lambda row: count_words(row['text']), axis=1)
train_df['word_count'] = train_raw.apply(lambda row: count_words(row['text']), axis=1)

In [32]:
# unique word count
def count_unique_words(text):
    return len(set(text.split()))

test_df['unique_count'] = test_df.apply(lambda row: count_unique_words(row['text']), axis=1)
train_df['unique_count'] = train_raw.apply(lambda row: count_unique_words(row['text']), axis=1)

In [33]:
# stop word count
def count_stop_words(text):
    return len([word for word in text.lower().split() if word in STOP_WORDS])

test_df['stop_count'] = test_df.apply(lambda row: count_stop_words(row['text']), axis=1)
train_df['stop_count'] = train_raw.apply(lambda row: count_stop_words(row['text']), axis=1)

In [34]:
# hashtag_count number of hashtags (#) in text
def count_hashtags(text):
    count = 0
    for c in text:
        count = count + 1 if c == "#" else count
    return count

test_df['hash_count'] = test_df.apply(lambda row: count_hashtags(row['text']), axis=1)
train_df['hash_count'] = train_raw.apply(lambda row: count_hashtags(row['text']), axis=1)

In [35]:
# mention_count number of mentions (@) in text
def count_mentions(text):
    count = 0
    for c in text:
        count = count + 1 if c == "@" else count
    return count

test_df['mention_count'] = test_df.apply(lambda row: count_mentions(row['text']), axis=1)
train_df['mention_count'] = train_raw.apply(lambda row: count_mentions(row['text']), axis=1)

In [36]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target,word_count,unique_count,stop_count,hash_count,mention_count
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,13,13,8,1,0
1,4,,,Forest fire near La Ronge Sask. Canada,1,7,7,0,0,0
2,5,,,All residents asked to 'shelter in place' are ...,1,22,20,11,0,0
3,6,,,"13,000 people receive #wildfires evacuation or...",1,8,8,1,1,0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,16,15,7,2,0


In [37]:
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS
punctuations = string.punctuation

In [38]:
# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = nlp(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    
    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens


In [39]:
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

In [40]:
# We will use this function to get the best model 
def get_tuned_model(estimator, param_grid, scoring, X, Y):
    from sklearn.model_selection import GridSearchCV

    grid = GridSearchCV(estimator = estimator, 
                       param_grid = param_grid,
                       scoring = scoring,
                       cv=3,
                       n_jobs= -1
                      )

    tuned = grid.fit(X, Y)

    print ("Best score: ", tuned.best_score_) 
    print ("Best params: ", tuned.best_params_)
    print ("IS Score: ", tuned.score(X, Y))
    
    return tuned


In [41]:
def save_results(model, ids, data):
    pred_test = model.predict(data)

    test_res = ids.copy()
    test_res["target"] = pred_test
    test_res.to_csv("my_predictions.csv", index=False)
    return test_res

In [42]:
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))

In [43]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [44]:
X_train, X_valid, y_train, y_valid = train_test_split(train_df['text'], train_df['target'], test_size=0.3)

X_train = tfidf_vector.fit_transform(X_train)
X_valid = tfidf_vector.transform(X_valid)

In [45]:
print(X_train.shape, X_valid.shape)

(5219, 15483) (2237, 15483)


In [22]:
# tfidf_train = tfidf_vector.fit_transform(X_train['text'])
# tfidf_valid = tfidf_vector.transform(X_valid['text'])

# tmp = pd.DataFrame(tfidf_train.toarray())
# tmp['word_count'] = X_train['word_count'].values
# X_train = tmp

# tmp = pd.DataFrame(tfidf_valid.toarray())
# tmp['word_count'] = X_valid['word_count'].values
# X_valid = tmp 

In [46]:
ids = test_df[['id']]
X_test = tfidf_vector.transform(test_df['text'])

# tfidf_test = tfidf_vector.transform(test_df['text'])
# X_test = pd.concat([test_df['word_count'], pd.DataFrame(tfidf_test.toarray())], axis=1)

In [47]:
print(X_test.shape)

(3263, 15483)


## Modeling

### Logistic Regression

In [49]:
from sklearn import metrics

model = LogisticRegression()
model.fit(X_train, y_train)

predicted = model.predict(X_valid)

# Model Accuracy
print("Logistic Regression Accuracy:", metrics.accuracy_score(y_valid, predicted)) #0.797723292469352
print("Logistic Regression Precision:", metrics.precision_score(y_valid, predicted))
print("Logistic Regression Recall:", metrics.recall_score(y_valid, predicted))

Logistic Regression Accuracy: 0.7921323200715243
Logistic Regression Precision: 0.803921568627451
Logistic Regression Recall: 0.6612903225806451


### GridSearchCV with LogisticRegression

In [50]:
classifier = LogisticRegression()

param_grid = {
    "C":  np.logspace(0, 4, 10),
}

# grd = get_tuned_model(pipe, param_grid, "accuracy", train_df['text'], train_df['target'])
grd = get_tuned_model(classifier, param_grid, "accuracy", X_train, y_train)

# Best score:  0.8168168168168167
# Best params:  {'C': 1291.5496650148827}
# IS Score:  0.963302752293578

Best score:  0.7924902892621161
Best params:  {'C': 2.7825594022071245}
IS Score:  0.9474995209810309


In [120]:
results = save_results(grd, ids, X_test)
results.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


### GridSearchCV with Lasso Regression

In [55]:
classifier = RidgeClassifier()

param_grid = {
    "alpha": np.logspace(-4, -0.5, 30)
}

grd = get_tuned_model(classifier, param_grid, "accuracy", X_train, y_train)



Best score:  0.7811854040394026
Best params:  {'alpha': 0.31622776601683794}
IS Score:  0.9957846330714696
