In [35]:
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_punctuation,strip_multiple_whitespaces,remove_stopwords
import pandas as pd
from gensim.corpora import Dictionary
import numpy as np
import xgboost
from sklearn.model_selection import train_test_split

In [64]:
# read data into pandas data frame
train_df = pd.read_csv("nlp-getting-started/train.csv")
df = pd.read_csv("nlp-getting-started/train.csv")

In [4]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [58]:
# clean tweets of white spaces, punctuations, stopwords, and make all letters lowercase
custom_filters = [lambda x: x.lower(),
                  strip_multiple_whitespaces,
                  strip_punctuation,
                  remove_stopwords]

def clean_string(row):
    return preprocess_string(row['text'], custom_filters)

In [6]:
# append cleaned tweets to dataframe
train_df['cleaned_tweets'] = train_df.apply(clean_string, axis=1)

train_df.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_tweets
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[deeds, reason, earthquake, allah, forgive]"
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,"[residents, asked, shelter, place, notified, o..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13, 000, people, receive, wildfires, evacuati..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, ruby, alaska, smoke, wildfi..."


In [7]:
# convert cleaned_tweets column to dictionary
dct = Dictionary(train_df.cleaned_tweets)

In [8]:
print(dct)

Dictionary(21657 unique tokens: ['allah', 'deeds', 'earthquake', 'forgive', 'reason']...)


In [9]:
# convert dictionary to matrix
corpus = [dct.doc2bow(line) for line in train_df.cleaned_tweets]

In [26]:
print(corpus[:])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)], [(11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 2), (18, 1), (19, 2)], [(12, 1), (16, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)], [(25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1)], [(22, 1), (25, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1)], [(43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1)], [(55, 1), (56, 1), (57, 1)], [(12, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1)], [(56, 1), (63, 1), (64, 1), (65, 1), (66, 1)], [(23, 1), (67, 1), (68, 1), (69, 1), (70, 1)], [(49, 1), (71, 1), (72, 1), (73, 1), (74, 2), (75, 1), (76, 1), (77, 1), (78, 1), (79, 2), (80, 2), (81, 1)], [(49, 1), (80, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1)], [(48, 1), (91, 1), (92, 2), (93, 1)], [(31, 1), (94, 1), (95, 1),

In [11]:
# create model
from gensim.models import TfidfModel
model = TfidfModel(corpus)

In [33]:
model

<gensim.models.tfidfmodel.TfidfModel at 0x26f1fddea48>

In [12]:
# fit model
vector = model[corpus]

In [13]:
print(vector)

<gensim.interfaces.TransformedCorpus object at 0x0000026F1FDE3888>


In [14]:
vector_0 = model[corpus[0]]

In [15]:
print(vector_0)

[(0, 0.4321836454783851), (1, 0.5286228491374555), (2, 0.3304298084620385), (3, 0.5286228491374555), (4, 0.3809845206833893)]


In [16]:
vector_1 = model[corpus[1]]
print(vector_1)

[(5, 0.37946571708677057), (6, 0.28549131098033503), (7, 0.34813905089348585), (8, 0.2946703458851457), (9, 0.5321978341789189), (10, 0.5321978341789189)]


In [30]:
len(corpus)

7613

In [17]:
# The above code does not seem to be correct because it is treating redundent words as unique. Create dictionary of
# frequency counts of unique words
from collections import defaultdict
frequency = defaultdict(int)
for text in train_df.cleaned_tweets:
    for token in text:
        frequency[token] += 1

In [18]:
# convert default dictionary to dictionary
frequency = dict(frequency)


In [19]:
frequency

{'deeds': 2,
 'reason': 20,
 'earthquake': 50,
 'allah': 9,
 'forgive': 2,
 'forest': 66,
 'near': 55,
 'la': 22,
 'ronge': 1,
 'sask': 1,
 'canada': 13,
 'residents': 8,
 'asked': 9,
 'shelter': 7,
 'place': 26,
 'notified': 1,
 'officers': 8,
 'evacuation': 50,
 'orders': 11,
 'expected': 15,
 '13': 22,
 '000': 4,
 'people': 200,
 'receive': 2,
 'wildfires': 10,
 'california': 121,
 'got': 113,
 'sent': 13,
 'photo': 43,
 'ruby': 1,
 'alaska': 7,
 'smoke': 48,
 'pours': 1,
 'school': 68,
 'rockyfire': 4,
 'update': 39,
 'hwy': 10,
 '20': 24,
 'closed': 20,
 'directions': 1,
 'lake': 14,
 'county': 38,
 'cafire': 2,
 'flood': 58,
 'disaster': 157,
 'heavy': 20,
 'rain': 46,
 'causes': 13,
 'flash': 21,
 'flooding': 50,
 'streets': 8,
 'manitou': 1,
 'colorado': 16,
 'springs': 5,
 'areas': 10,
 'm': 298,
 'hill': 7,
 'woods': 2,
 's': 905,
 'emergency': 158,
 'happening': 12,
 'building': 30,
 'street': 24,
 'afraid': 5,
 'tornado': 35,
 'coming': 51,
 'area': 47,
 'died': 28,
 'heat'

# The data above has been trained on the entire data set. We need to split the data into train and test

In [46]:
# split data into train and test sets
train_words, test_words = train_test_split(train_df['cleaned_tweets'], test_size=0.3)

In [41]:
train_words_dct = Dictionary(train_words)

In [43]:
print(train_words_dct)

Dictionary(18530 unique tokens: ['air', 'ambulance', 'crash', 'feared', 'helicopter']...)


In [44]:
train_corpus = [train_words_dct.doc2bow(line) for line in train_words]

In [45]:
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)], [(11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 2), (18, 1), (19, 2)], [(12, 1), (16, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)], [(25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1)], [(22, 1), (25, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1)], [(43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1)], [(55, 1), (56, 1), (57, 1)], [(12, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1)], [(56, 1), (63, 1), (64, 1), (65, 1), (66, 1)], [(23, 1), (67, 1), (68, 1), (69, 1), (70, 1)], [(49, 1), (71, 1), (72, 1), (73, 1), (74, 2), (75, 1), (76, 1), (77, 1), (78, 1), (79, 2), (80, 2), (81, 1)], [(49, 1), (80, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1)], [(48, 1), (91, 1), (92, 2), (93, 1)], [(31, 1), (94, 1), (95, 1),

In [47]:
train_model = TfidfModel(train_corpus)

In [48]:
model[corpus[0]]

[(0, 0.4321836454783851),
 (1, 0.5286228491374555),
 (2, 0.3304298084620385),
 (3, 0.5286228491374555),
 (4, 0.3809845206833893)]

# All of these analyses are on dense vectors. We need to analyze sparse vectors

In [65]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = df.text

In [66]:
vectorizer = CountVectorizer()

In [67]:
X = vectorizer.fit_transform(corpus)

In [68]:
X_shaped = X.shape

In [69]:
X_shaped

(7613, 21637)

In [70]:
vectorizer.vocabulary_

{'our': 14003,
 'deeds': 5490,
 'are': 2192,
 'the': 18669,
 'reason': 15678,
 'of': 13681,
 'this': 18777,
 'earthquake': 6379,
 'may': 12141,
 'allah': 1852,
 'forgive': 7661,
 'us': 19774,
 'all': 1851,
 'forest': 7652,
 'fire': 7439,
 'near': 13122,
 'la': 11091,
 'ronge': 16266,
 'sask': 16611,
 'canada': 3843,
 'residents': 15940,
 'asked': 2312,
 'to': 18971,
 'shelter': 17022,
 'in': 9718,
 'place': 14612,
 'being': 2900,
 'notified': 13423,
 'by': 3698,
 'officers': 13701,
 'no': 13335,
 'other': 13987,
 'evacuation': 6909,
 'or': 13919,
 'orders': 13936,
 'expected': 7014,
 '13': 176,
 '000': 1,
 'people': 14389,
 'receive': 15699,
 'wildfires': 20607,
 'california': 3797,
 'just': 10550,
 'got': 8364,
 'sent': 16870,
 'photo': 14500,
 'from': 7823,
 'ruby': 16379,
 'alaska': 1800,
 'as': 2280,
 'smoke': 17394,
 'pours': 14821,
 'into': 9947,
 'school': 16700,
 'rockyfire': 16219,
 'update': 19722,
 'hwy': 9399,
 '20': 343,
 'closed': 4462,
 'both': 3346,
 'directions': 5819,

In [72]:
print(vectorizer.get_feature_names())



In [73]:
print(X.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# Clean Data From Beginning using scikit learn