In [1]:
import os
import re
from collections import Counter

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.manifold import TSNE


In [2]:
# Widen pandas' display limits so wide/long frames are shown without truncation.
# set_option accepts several (option, value) pairs in a single call.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)

In [3]:
#Global variables
# Relative path of the directory holding the cleaned train/test CSV files read below.
DATA_PATH = '../data/input/clean_data'

# Seed intended for stochastic steps.
# NOTE(review): not referenced in any visible cell - confirm it is actually passed on (e.g. to TSNE).
RANDOM_SEED = 42

In [4]:
# Load the cleaned train and test splits from DATA_PATH.
train_csv = os.path.join(DATA_PATH, 'train_cleaned.csv')
test_csv = os.path.join(DATA_PATH, 'test_cleaned.csv')
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

# Stack both splits into one frame; the label column exists only in train,
# so it is removed first to keep the columns aligned.
frames = [df_train.drop(columns='target'), df_test.copy()]
df = pd.concat(frames, axis=0, ignore_index=True)

df.head()

Unnamed: 0,id,keyword,location,text,word_count,unique_word_count,stop_word_count,url_count,mean_word_length,char_count,punctuation_count,hashtag_count,mention_count,prep_text,no_stopwords_text
0,1,no_keyword,no_location,Our Deeds are the Reason of this #earthquake M...,13,13,6,0,4.384615,69,1,1,0,our deeds are the reason of this earthquake ma...,deeds reason earthquake may allah forgive us
1,4,no_keyword,no_location,Forest fire near La Ronge Sask. Canada,7,7,0,0,4.571429,38,1,0,0,forest fire near la ronge sask canada,forest fire near la ronge sask canada
2,5,no_keyword,no_location,All residents asked to 'shelter in place' are ...,22,20,11,0,5.090909,133,3,0,0,all residents asked to shelter in place are be...,residents asked shelter place notified officer...
3,6,no_keyword,no_location,"13,000 people receive #wildfires evacuation or...",8,8,1,0,7.125,65,2,1,0,people receive wildfires evacuation orders in ...,people receive wildfires evacuation orders cal...
4,7,no_keyword,no_location,Just got sent this photo from Ruby #Alaska as ...,16,15,7,0,4.5,88,2,2,0,just got sent this photo from ruby alaska as s...,got sent photo ruby alaska smoke wildfires pou...


In [5]:
# Summarise column dtypes and non-null counts of the combined frame (before any rows are dropped).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10661 entries, 0 to 10660
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10661 non-null  int64  
 1   keyword            10661 non-null  object 
 2   location           10661 non-null  object 
 3   text               10661 non-null  object 
 4   word_count         10661 non-null  int64  
 5   unique_word_count  10661 non-null  int64  
 6   stop_word_count    10661 non-null  int64  
 7   url_count          10661 non-null  int64  
 8   mean_word_length   10661 non-null  float64
 9   char_count         10661 non-null  int64  
 10  punctuation_count  10661 non-null  int64  
 11  hashtag_count      10661 non-null  int64  
 12  mention_count      10661 non-null  int64  
 13  prep_text          10661 non-null  object 
 14  no_stopwords_text  10660 non-null  object 
dtypes: float64(1), int64(9), object(5)
memory usage: 1.2+ MB


In [6]:
#for this approach we will use no_stopwords_text, so rows with missing values can be dropped.
#dropna() without a subset removes every row that has a NaN in ANY column and mutates df in place;
#ignore_index=True renumbers the remaining rows 0..n-1.
df.dropna(inplace=True, ignore_index=True)

# Generate variables 
***

    word_to_index : A dictionary mapping each word to an integer value {'modern': 0, 'humans': 1}

    index_to_word : A dictionary mapping each integer index back to its word {0: 'modern', 1: 'humans'}

    corpus  : The entire dataset as a list of tokenized tweets, where each tweet is a list of its words

    vocab_size : Number of unique words in the corpus

In [7]:
# corpus: one token list per tweet, e.g. [['forest', 'fire', 'near'], ['got', 'sent', 'photo']]
# Built with comprehensions rather than Series.apply(...).sum(): summing lists
# concatenates them one by one, re-copying the growing result for every row.
corpus = [text.split() for text in df['no_stopwords_text']]

# corpus_flat: every token of every tweet in a single flat list
corpus_flat = [token for tweet in corpus for token in tweet]

#vocab_size
# np.unique returns the distinct tokens in sorted order, so indices are deterministic.
unique_words = np.unique(corpus_flat)
vocab_size = unique_words.shape[0]

#word_to_index & index_to_word
# index_to_word: position in unique_words -> word; word_to_index is its exact inverse.
index_to_word = dict(enumerate(unique_words))
word_to_index = {word: ind for ind, word in index_to_word.items()}

In [9]:
# Number of distinct tokens found in the corpus.
vocab_size

16919

In [1]:
def get_one_hot_vectors(target_word,context_words,vocab_size,word_to_index):
    """Encode a target word and its context words as vocabulary-sized vectors.

    Parameters
    ----------
    target_word : hashable
        Word to encode; must be a key of ``word_to_index``.
    context_words : iterable
        Words surrounding the target; each must be a key of ``word_to_index``.
    vocab_size : int
        Length of the vectors to create.
    word_to_index : dict
        Maps each word to its integer position in the vectors.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``trgt_word_vector`` has a single 1 at the target word's index;
        ``ctxt_word_vector`` has a 1 at the index of every context word.
        Both are float arrays of length ``vocab_size``, zero elsewhere.

    Raises
    ------
    KeyError
        If the target word or any context word is missing from ``word_to_index``.
    IndexError
        If a looked-up index does not fit in a vector of length ``vocab_size``.
    """
    # Example: with word_to_index = {'best': 0, ...} and vocab_size = 9,
    # target_word = 'best' yields [1, 0, 0, 0, 0, 0, 0, 0, 0].
    trgt_word_vector = np.zeros(vocab_size)

    # Look words up with [] instead of .get(): for an unknown word .get() returns
    # None, and `vector[None] = 1` would silently set EVERY entry to 1.
    trgt_word_vector[word_to_index[target_word]] = 1

    # Same encoding for the context, accumulating all context words in one vector.
    ctxt_word_vector = np.zeros(vocab_size)
    for word in context_words:
        ctxt_word_vector[word_to_index[word]] = 1

    return trgt_word_vector,ctxt_word_vector

In [2]:
def generate_training_data(corpus,window_size,vocab_size,word_to_index):
    """Build one (target, context) training pair for every word of every tweet.

    Each word of a tweet is used as the target in turn. Its context is made of
    up to ``window_size`` words immediately before it (nearest first) followed
    by up to ``window_size`` words immediately after it; the window is clipped
    at the start and end of the tweet.

    Example with window_size = 2 and a 10-word tweet (indices 0..9):
        target index 0 -> context indices (1, 2)
        target index 4 -> context indices (3, 2, 5, 6)
        target index 9 -> context indices (8, 7)

    Parameters
    ----------
    corpus : iterable of sequences
        Tokenized tweets, e.g. ``[['forest', 'fire', 'near'], ...]``. Each
        tweet must support ``len()`` and slicing.
    window_size : int
        Maximum number of context words taken on each side of the target.
    vocab_size : int
        Forwarded to ``get_one_hot_vectors``.
    word_to_index : dict
        Forwarded to ``get_one_hot_vectors``.

    Returns
    -------
    list of tuple
        One ``(trgt_word_vector, ctxt_word_vector)`` pair per word, exactly as
        returned by ``get_one_hot_vectors``, in corpus order.
    """
    training_data =  []

    for tweet in corpus:
        len_of_tweet = len(tweet)

        # Tweets with fewer than three words are skipped entirely.
        if len_of_tweet<3:
            continue

        for i,target_word in enumerate(tweet):
            # Words before the target, nearest first. max() clips the window at
            # the start of the tweet so the slice never wraps to negative indices.
            first_before = max(i - window_size, 0)
            context_words = list(reversed(tweet[first_before:i]))

            # Words after the target; slicing clips at the end of the tweet, so a
            # window larger than the tweet cannot raise IndexError.
            context_words.extend(tweet[i + 1:i + 1 + window_size])

            trgt_word_vector,ctxt_word_vector = get_one_hot_vectors(target_word,context_words,vocab_size,word_to_index)

            # list.append takes exactly one argument, so the pair is stored as a tuple.
            training_data.append((trgt_word_vector,ctxt_word_vector))

    return training_data

In [3]:
# Number of context words taken on each side of the target word.
window_size = 2
# Pass window_size itself rather than repeating the literal, so editing the
# value above actually changes the generated data.
training_data = generate_training_data(corpus,window_size,vocab_size,word_to_index)

NameError: name 'corpus' is not defined

In [4]:
# Convert the collected samples to an ndarray; the result is not assigned, so it is only displayed.
np.array(training_data)

NameError: name 'np' is not defined

In [71]:
# NOTE(review): `training_sample_words` is not defined in any visible cell and this cell's
# execution count (71) is out of sequence - define it above or remove this cell.
np.concatenate((training_data, training_sample_words))

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])