**Given:** 
       * Data contains keyword from tweet, 
       * location from where it was tweeted, 
       * text of the tweet, 
       * Tweet id. 
       * Target which is a binary classifier to determine if a tweet is about real disaster or not.

**Problem:** To predict, based on training data, if a new tweet is related to real disaster or not. 

**Expected output:**
         id, target(0,1)


In [None]:
# add all the packages needed for this project

In [106]:
import numpy as np
import pandas as pd
import re 

from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

from gensim.test.utils import datapath
from gensim.models.keyedvectors import KeyedVectors
from gensim.models.keyedvectors import FastTextKeyedVectors
from gensim.models.fasttext import FastText
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, make_scorer
from sklearn.model_selection import KFold, cross_val_score

import re
import string


import fasttext


In [2]:
# To display all rows in the df.

pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None )

**Read the training data csv file** 

In [3]:
train_data = pd.read_csv("train.csv", header = 0)

In [4]:
train_data.shape 

# 7613, 5

(7613, 5)

In [5]:
train = train_data.loc[:,['text', 'target']]

In [None]:
train

In [6]:
train['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

**We can observe from the text snippet above, the data has**

- punctuations, 
- numbers, 
- letters are both capital and small, 
- stopwords,
- foreign characters, 
- urls and so on. 

We would like to remove all non english characters including the numbers from the text. 


In [7]:
test_data = pd.read_csv("test.csv", header = 0)

In [8]:
test_data.shape

# 3263, 4

(3263, 4)

In [9]:
test = test_data.loc[:,['id','text']]

In [None]:
test

**Data cleaning process**

- Remove numbers
- Remove punctuations
- Remove urls
- Remove whitespace
- Remove stopwords
- Remove non- printable words like \n, \r, \t, and so on.
- Convert text to lower

In [14]:
def cleaned_data(text):
    
   
    # Remove numbers:
    text = re.sub(r'[-+]?[.\d]*[\d]+[:,.\d]*','number',text)
    
    # Remove punctuations:
    punct = str.maketrans('', '', string.punctuation)
    text = text.translate(punct)
    
    # Remove urls
    text = re.sub(r'http\S+','url',text)
    
    # Remove whitespace
    text = ' '.join(text.split())
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    
    words_without_sw = [w for w in word_tokens if not w.lower() in stop_words]
    text = ' '.join([str(elem) for elem in words_without_sw])
    
    # Remove non printable words
    text = ''.join([word for word in text if word in string.printable])
    
    # convert text to lower:
    text = text.lower()
    
    return text
    

In [15]:
train['cleaned_txt'] = train['text'].apply(cleaned_data)

In [17]:
test['cleaned_txt'] = test['text'].apply(cleaned_data)

In [129]:
train.head()

Unnamed: 0,text,target,cleaned_txt
0,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,deeds reason earthquake may allah forgive us
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,residents asked shelter place notified officers evacuation shelter place orders expected
3,"13,000 people receive #wildfires evacuation orders in California",1,number people receive wildfires evacuation orders california
4,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,got sent photo ruby alaska smoke wildfires pours school


In [18]:
new_train = train.loc[:, ['cleaned_txt', 'target']]
new_test = test.loc[:, ['id', 'cleaned_txt']]

In [40]:
corpus_list = [i.split() for i in new_train.cleaned_txt]


In [43]:
fasttext_model = FastText(corpus_list, 
                          size = 300,
                          min_count = 5, 
                          window = 5, 
                          #min_alpha= 0.01, 
                          sg = 1,
                          workers =1, 
                          sample = 1e-2)
                                        
# Refer : https://radimrehurek.com/gensim/models/fasttext.html

# corpus list has the list of all words from the text
# size = embedding size 
# min_count = The model ignores all words with total frequency lower than this.
# window = The maximum distance between the current and predicted word within a sentence.
# min_alpha = Learning rate will linearly drop to min_alpha as training progresses, default= 0.05
# sg = loss function (ns - nskip gram, or cbow ) if sg=1, then sg otherwise CBOW
# hs = loss function (hs - hierarchical softmax, negative sampling) if hs=1, hs is used or negative sampling.
# sample = The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5).


In [121]:
print('vocab size: ', fasttext_model.corpus_total_words)


vocab size:  76308


In [53]:
fasttext_model.wv.most_similar('disaster', topn= 20)

[('obama', 0.9957274198532104),
 ('declares', 0.9790096282958984),
 ('quarantine', 0.9708105325698853),
 ('saipan', 0.9642913341522217),
 ('quarantined', 0.9629697799682617),
 ('typhoondevastated', 0.9628607630729675),
 ('water', 0.960436224937439),
 ('migrant', 0.9564712643623352),
 ('signs', 0.9563822150230408),
 ('liked', 0.9539257287979126),
 ('reddit', 0.9512729048728943),
 ('migrants', 0.9454452395439148),
 ('center', 0.9450538158416748),
 ('view', 0.9450084567070007),
 ('devastated', 0.9426709413528442),
 ('register', 0.9423379898071289),
 ('declaration', 0.9423233866691589),
 ('sister', 0.9412342309951782),
 ('videos', 0.9407011866569519),
 ('minister', 0.9361927509307861)]

In [69]:
fasttext_model.similar_by_word('natural', topn = 20)

  fasttext_model.similar_by_word('natural', topn = 20)


[('ran', 0.9995251893997192),
 ('bbc', 0.9990274310112),
 ('manchester', 0.9989615082740784),
 ('israeli', 0.9989506006240845),
 ('research', 0.9989364147186279),
 ('disney', 0.9989258050918579),
 ('search', 0.9988222122192383),
 ('visit', 0.9986239671707153),
 ('disrupts', 0.9986175894737244),
 ('subreddits', 0.9986174702644348),
 ('signed', 0.99859619140625),
 ('mma', 0.9985911250114441),
 ('hundreds', 0.9984678030014038),
 ('trains', 0.9983665347099304),
 ('horse', 0.9983525276184082),
 ('hunters', 0.9981818199157715),
 ('plane', 0.9981813430786133),
 ('mens', 0.9981039762496948),
 ('israel', 0.9980775117874146),
 ('terror', 0.9980230331420898)]

In [67]:
# calculates cosine similarity between two words

fasttext_model.similarity('disaster', 'tragedy') # 0.8664625


  fasttext_model.similarity('disaster', 'tragedy')


0.8664625

In [68]:
fasttext_model.similarity('natural', 'disaster') # 0.91937894

  fasttext_model.similarity('natural', 'disaster') # 0.91937894


0.91937894

In [72]:
embeddings_df =[]
for i in range(len(new_train)):
    doc_fasttext = fasttext_model.wv.get_vector(new_train.iloc[i,0])
    embeddings_df.append(doc_fasttext)
embeddings_df = pd.DataFrame(embeddings_df)

In [130]:
embeddings_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.033558,-0.024403,0.012677,0.015525,0.038067,-0.043989,-0.043146,-0.023853,0.010177,0.003421,...,-0.013144,0.004529,-0.009938,-0.060446,-0.0115,-0.04127,-0.00342,0.031375,-0.008691,-0.083189
1,0.02745,-0.020487,0.01247,0.013416,0.034127,-0.040573,-0.0391,-0.024721,0.010067,0.001231,...,-0.015528,0.005523,-0.008618,-0.053386,-0.006213,-0.040029,-0.006108,0.026671,-0.007091,-0.075635
2,0.036895,-0.024925,0.014663,0.015435,0.044683,-0.053628,-0.0525,-0.032439,0.011942,5.7e-05,...,-0.022521,0.008812,-0.013241,-0.067375,-0.005516,-0.054165,-0.008657,0.038309,-0.008397,-0.096951
3,0.061587,-0.039502,0.027822,0.045688,0.057635,-0.03843,-0.045401,-0.020154,0.00715,0.030538,...,0.017505,-0.01351,-0.001311,-0.112096,-0.047613,-0.060971,-0.004208,0.006519,-0.011585,-0.158597
4,0.034165,-0.023508,0.010573,0.011611,0.040965,-0.048991,-0.04841,-0.027816,0.011852,-0.000834,...,-0.021213,0.008798,-0.012595,-0.058015,-0.00337,-0.044388,-0.007006,0.036265,-0.00846,-0.083251


In [131]:
embeddings_df.shape # 7613, 300

(7613, 300)

In [89]:
mean_embeddings = np.mean(embeddings_df, axis=0)
sd_embeddings = embeddings_df.std(axis=0)

In [90]:
# Scale the embeddings using the normalization formula.
scaled_emb = []
for i in range(300):
    scaled_emb.append((embeddings_df[i] - mean_embeddings[i])/sd_embeddings[i])

scaled_emb_transposed = pd.DataFrame(scaled_emb).transpose()

In [92]:
scaled_emb_transposed.shape #7613, 300

(7613, 300)

In [93]:
X = scaled_emb_transposed
Y = new_train['target']

In [None]:
estimates = []
estimates.append(('LogisticRegression', Pipeline([('LR', LogisticRegression())])))


In [None]:
# training model scores: 
model_scores = {}

p_score = make_scorer(precision_score)
r_score = make_scorer(recall_score)
f1_score = make_scorer(f1_score)
a_score = make_scorer(accuracy_score)


In [None]:
for i in estimates: 
    kfold = KFold(n_splits = 7, shuffle = True, random_state = 4)
    p_scores = cross_val_score(i[1], X, new_train.target, cv = kfold, scoring = p_score)
    r_scores = cross_val_score(i[1], X, new_train.target, cv = kfold, scoring = r_score)
    f1_scores = cross_val_score(i[1], X, new_train.target, cv = kfold, scoring = f1_score)
    a_scores = cross_val_score(i[1], X, new_train.target, cv = kfold, scoring = a_score)
    
    model_scores.update({ i[0]:{'accuracy': a_scores.mean(), 'f1_score':f1_scores.mean(), 
                                'precision': p_scores.mean(), 'recall':r_scores.mean()} })


In [None]:
for i in model_scores:
    print('\n', i)
    print('\n', model_scores[i])

In [None]:
# training and test data

from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(
new_train['cleaned_txt'],
new_train['target'],
test_size = 0.25, # percentage of observations in test data 
random_state = 0)

In [None]:
X_train.reset_index(inplace = True,drop = True)
y_train.reset_index(inplace = True,drop = True)

X_valid.reset_index(inplace = True,drop = True)
y_valid.reset_index(inplace = True,drop = True)

In [51]:
model1 = fasttext.train_unsupervised('cleaned_col.txt', dim = 300, lr =0.1, epoch= 1,wordNgrams =2, loss = 'hs')

In [None]:
model1.save_model('result1.bin')