In [55]:
import pandas as pd
import numpy as np
import nltk
import sklearn
import matplotlib.pyplot as plt
import re
import tqdm
from sklearn import utils

%matplotlib inline

In [98]:
twitter_df = pd.read_csv('twitter_train.csv')
twitter_df = twitter_df.fillna('0')

In [99]:
twitter_df = twitter_df.drop('location', axis=1)
target = twitter_df.target
# twitter_df = twitter_df.drop('target', axis=1)

In [5]:
# CBOW is a word embedding technique to predict middle word in the context.
# hence capturing some semantic information. 

In [10]:
from nltk import TweetTokenizer
wt = TweetTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_corpus(df_1 = twitter_df, text_col = 'text'):
    
    # refining the text by removing the special characters,
    # lower casing all the words, removing white spaces
    
    df = df_1.copy()
    df.dropna(inplace=True)
    
    url_re = r'(https?:\/\/\S+)$'
    tags = r'@([a-zA-Z0-9:]+)\s' # mentions
    english_re = r'([a-zA-Z]\w+)'
    extended_stop_words_re = stop_words + ['&amp;','rt','th','co', 're','ve','kim','daca','p.m.']
    single_letters_re = r'.'
    
    df['preprocessed_'+ text_col] = df[text_col].str.lower() # lower casing the text.
    
    df['preprocessed_'+ text_col] = df['preprocessed_'+ text_col].apply(lambda row: ' '.join([word for word in row.split()
                                                                                             if (re.match(english_re, word))
                                                                                             and (not word in extended_stop_words_re)
                                                                                             and (not word in single_letters_re)]))
    
    # df['preprocessed_'+text] = re.sub(english, '', df['preprocessed_'+text])
    df['preprocessed_'+ text_col] = df['preprocessed_'+text_col].apply(lambda row: re.sub(url_re, '', row)) # removing urls.
    
    # df['preprocessed_'+ text_col] = df['preprocessed_'+text_col].apply(lambda row: re.sub(tags, '', row)) # removing hash-tags.    
    # doc = doc.strip() # removes both the leading and trailing characters.
    
    # tokenize document
    df['tokenised_' + text_col] = df['preprocessed_'+ text_col].apply(lambda row: wt.tokenize(row))
    # df['tokenised_' + text_col].apply(re.sub(single_letters_re, '', row))
    return df


In [100]:
norm_df = normalize_corpus(twitter_df)

In [101]:
norm_df

Unnamed: 0,id,keyword,text,target,preprocessed_text,tokenised_text
0,1,0,Our Deeds are the Reason of this #earthquake M...,1,deeds reason may allah forgive us,"[deeds, reason, may, allah, forgive, us]"
1,4,0,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask. canada,"[forest, fire, near, la, ronge, sask, ., canada]"
2,5,0,All residents asked to 'shelter in place' are ...,1,residents asked place' notified officers. evac...,"[residents, asked, place, ', notified, officer..."
3,6,0,"13,000 people receive #wildfires evacuation or...",1,people receive evacuation orders california,"[people, receive, evacuation, orders, california]"
4,7,0,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby smoke pours school,"[got, sent, photo, ruby, smoke, pours, school]"
...,...,...,...,...,...,...
7608,10869,0,Two giant cranes holding a bridge collapse int...,1,two giant cranes holding bridge collapse nearb...,"[two, giant, cranes, holding, bridge, collapse..."
7609,10870,0,@aria_ahrary @TheTawniest The out of control w...,1,control wild fires california even northern pa...,"[control, wild, fires, california, even, north..."
7610,10871,0,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,m1.94 utc]?5km volcano hawaii.,"[m1, ., 94, utc, ], ?, 5km, volcano, hawaii, .]"
7611,10872,0,Police investigating after an e-bike collided ...,1,police investigating collided car little portu...,"[police, investigating, collided, car, little,..."


In [102]:
# importing the requires libraries to generate word embeddings.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

tokenised_doc = norm_df.tokenised_text
target = norm_df.target

# convert tokenized document into gensim formatted tagged data
tagged_data = [TaggedDocument(d , [i]) for i, d in zip(target, tokenised_doc)]

# tagged_data = [TaggedDocument(d , [i]) for (i, d) in norm_df[["keyword", "tokenised_text"]]]

In [37]:
# tagged_dada_1 = norm_df.apply(lambda r: TaggedDocument(words=r['tokenised_text'], tags = r['keyword']))

KeyError: 'tokenised_text'

In [103]:
tagged_data

[TaggedDocument(words=['deeds', 'reason', 'may', 'allah', 'forgive', 'us'], tags=[1]),
 TaggedDocument(words=['forest', 'fire', 'near', 'la', 'ronge', 'sask', '.', 'canada'], tags=[1]),
 TaggedDocument(words=['residents', 'asked', 'place', "'", 'notified', 'officers', '.', 'evacuation', 'shelter', 'place', 'orders', 'expected'], tags=[1]),
 TaggedDocument(words=['people', 'receive', 'evacuation', 'orders', 'california'], tags=[1]),
 TaggedDocument(words=['got', 'sent', 'photo', 'ruby', 'smoke', 'pours', 'school'], tags=[1]),
 TaggedDocument(words=['update', 'california', 'hwy', '.', 'closed', 'directions', 'due', 'lake', 'county', 'fire'], tags=[1]),
 TaggedDocument(words=['heavy', 'rain', 'causes', 'flash', 'flooding', 'streets', 'manitou', ',', 'colorado', 'springs', 'areas'], tags=[1]),
 TaggedDocument(words=['top', 'hill', 'see', 'fire', 'woods', '...'], tags=[1]),
 TaggedDocument(words=["there's", 'emergency', 'evacuation', 'happening', 'building', 'across', 'street'], tags=[1]),


In [105]:
# initialising doc2vec model weights
model = Doc2Vec(dm = 1, documents = tagged_data, vector_size = 200, min_count = 5, epochs = 50)

# save trained model
model.save('train_doc2vec.model')
# load saved model
model = Doc2Vec.load("train_doc2vec.model")

In [107]:
model.wv.vocab

{'reason': <gensim.models.keyedvectors.Vocab at 0x7fd64b7edf10>,
 'may': <gensim.models.keyedvectors.Vocab at 0x7fd64b7fa090>,
 'us': <gensim.models.keyedvectors.Vocab at 0x7fd64b7fa110>,
 'forest': <gensim.models.keyedvectors.Vocab at 0x7fd64b7fa190>,
 'fire': <gensim.models.keyedvectors.Vocab at 0x7fd64b7fa210>,
 'near': <gensim.models.keyedvectors.Vocab at 0x7fd64b7fa290>,
 'la': <gensim.models.keyedvectors.Vocab at 0x7fd64b7fa310>,
 '.': <gensim.models.keyedvectors.Vocab at 0x7fd64b7fa350>,
 'canada': <gensim.models.keyedvectors.Vocab at 0x7fd64b7fa3d0>,
 'residents': <gensim.models.keyedvectors.Vocab at 0x7fd64b7fa450>,
 'asked': <gensim.models.keyedvectors.Vocab at 0x7fd64b7fa4d0>,
 'place': <gensim.models.keyedvectors.Vocab at 0x7fd64b7fa550>,
 "'": <gensim.models.keyedvectors.Vocab at 0x7fd64b7fa590>,
 'officers': <gensim.models.keyedvectors.Vocab at 0x7fd64b7fa610>,
 'evacuation': <gensim.models.keyedvectors.Vocab at 0x7fd64b7fa690>,
 'shelter': <gensim.models.keyedvectors.Voc

In [106]:
model.corpus_total_words

72070

In [108]:
# the model weights are already initialised.
# Paragraph Vector Distributed memory acts as a memory that remembers what is missing from the 
# current context.
%timeit
for epoch in range(30):
    model.train(tagged_data, total_examples = len(tagged_data), epochs=1)
    model.alpha -=0.002
    model.min_alpha = model.alpha

In [109]:
tagged_data[1000].tags[0]

0

In [110]:
vector = model.infer_vector(['leagues', 'ball', 'olympic', 'level', 'body', 'bagging', 'like', 'career', 'nothing'])

In [111]:
vector.shape

(200,)

In [112]:
def vectors_Doc2vec(model, tagged_docs):
    sents = tagged_docs
    tags, vectors = zip(*[(doc.tags[0], model.infer_vector(doc.words)) for doc in sents])
    return tags, vectors

In [113]:
targets, vectors = vectors_Doc2vec(model, tagged_data)

In [134]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(C = 0.1, n_jobs=1)

log_reg.fit(vectors, targets)
y_pred = log_reg.predict(vectors)

In [135]:
from sklearn.metrics import accuracy_score

accuracy_score(target, y_pred)

0.6628136083015894

In [132]:
y_pred[1000]

0

In [133]:
set(y_pred)

{0, 1}