In [142]:
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_punctuation,strip_multiple_whitespaces,remove_stopwords
import pandas as pd
from gensim.corpora import Dictionary
import numpy as np

In [143]:
# read data into pandas data frame
train_df = pd.read_csv("nlp-getting-started/train.csv")

In [144]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [145]:
# clean tweets of white spaces, punctuations, stopwords, and make all letters lowercase
custom_filters = [lambda x: x.lower(),
                  strip_multiple_whitespaces,
                  strip_punctuation,
                  remove_stopwords]

def clean_string(row):
    return preprocess_string(row['text'], custom_filters)

In [146]:
# append cleaned tweets to dataframe
train_df['cleaned_tweets'] = train_df.apply(clean_string, axis=1)

train_df.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_tweets
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[deeds, reason, earthquake, allah, forgive]"
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,"[residents, asked, shelter, place, notified, o..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13, 000, people, receive, wildfires, evacuati..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, ruby, alaska, smoke, wildfi..."


In [147]:
# convert cleaned_tweets column to dictionary
dct = Dictionary(train_df.cleaned_tweets)

In [148]:
print(dct)

Dictionary(21657 unique tokens: ['allah', 'deeds', 'earthquake', 'forgive', 'reason']...)


In [149]:
# convert dictionary to matrix
corpus = [dct.doc2bow(line) for line in train_df.cleaned_tweets]

In [150]:
print(corpus[:5])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)], [(11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 2), (18, 1), (19, 2)], [(12, 1), (16, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)], [(25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1)]]


In [151]:
# create model
from gensim.models import TfidfModel
model = TfidfModel(corpus)

In [152]:
# fit model
vector = model[corpus]

In [153]:
print(vector)

<gensim.interfaces.TransformedCorpus object at 0x00000189807AD1C8>


In [154]:
vector_0 = model[corpus[0]]

In [155]:
print(vector_0)

[(0, 0.4321836454783851), (1, 0.5286228491374555), (2, 0.3304298084620385), (3, 0.5286228491374555), (4, 0.3809845206833893)]


In [163]:
vector_1 = model[corpus[1]]
print(vector_1)

[(5, 0.37946571708677057), (6, 0.28549131098033503), (7, 0.34813905089348585), (8, 0.2946703458851457), (9, 0.5321978341789189), (10, 0.5321978341789189)]


In [156]:
# The above code does not seem to be correct because it is treating redundent words as unique. Create dictionary of
# frequency counts of unique words
from collections import defaultdict
frequency = defaultdict(int)
for text in train_df.cleaned_tweets:
    for token in text:
        frequency[token] += 1

In [157]:
# convert default dictionary to dictionary
frequency = dict(frequency)


In [162]:
frequency

{'deeds': 2,
 'reason': 20,
 'earthquake': 50,
 'allah': 9,
 'forgive': 2,
 'forest': 66,
 'near': 55,
 'la': 22,
 'ronge': 1,
 'sask': 1,
 'canada': 13,
 'residents': 8,
 'asked': 9,
 'shelter': 7,
 'place': 26,
 'notified': 1,
 'officers': 8,
 'evacuation': 50,
 'orders': 11,
 'expected': 15,
 '13': 22,
 '000': 4,
 'people': 200,
 'receive': 2,
 'wildfires': 10,
 'california': 121,
 'got': 113,
 'sent': 13,
 'photo': 43,
 'ruby': 1,
 'alaska': 7,
 'smoke': 48,
 'pours': 1,
 'school': 68,
 'rockyfire': 4,
 'update': 39,
 'hwy': 10,
 '20': 24,
 'closed': 20,
 'directions': 1,
 'lake': 14,
 'county': 38,
 'cafire': 2,
 'flood': 58,
 'disaster': 157,
 'heavy': 20,
 'rain': 46,
 'causes': 13,
 'flash': 21,
 'flooding': 50,
 'streets': 8,
 'manitou': 1,
 'colorado': 16,
 'springs': 5,
 'areas': 10,
 'm': 298,
 'hill': 7,
 'woods': 2,
 's': 905,
 'emergency': 158,
 'happening': 12,
 'building': 30,
 'street': 24,
 'afraid': 5,
 'tornado': 35,
 'coming': 51,
 'area': 47,
 'died': 28,
 'heat'

In [158]:
# create an array of keys from dictionary
keys = list(frequency.keys())

In [159]:
# Create matrix
frequency_matrix = np.array([[frequency[i]] for i in keys])

In [160]:
frequency_matrix

array([[ 2],
       [20],
       [50],
       ...,
       [ 1],
       [ 1],
       [ 1]])

In [161]:
model_1 = TfidfModel(frequency_matrix)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [127]:
corpus_1 = [frequency.doc2bow(line) for line in train_df.cleaned_tweets]

AttributeError: 'dict' object has no attribute 'doc2bow'