In [4]:
import pandas as pd
import numpy as np
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
import nltk
import re
import multiprocessing
import tensorflow as tf
from sklearn.model_selection import train_test_split
nltk.download("stopwords")

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# setting the names of the columns into col_names

col_names = ["target", "ids", "date", "flag", "user", "text"]
# reading the data from the path and using ISO-8859-1 encoding to decode it.

df = pd.read_csv('input/tweetsdata.csv',
            encoding = "ISO-8859-1",
            names=col_names)

In [6]:
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [7]:
df = df.sample(160000) # taking 160k rows from the data
df.head()

Unnamed: 0,target,ids,date,flag,user,text
75279,0,1695109631,Mon May 04 05:13:48 PDT 2009,NO_QUERY,tigrous4ever,@pixieoncb haha WTF that is very random of Mrs...
1171237,4,1980467897,Sun May 31 06:49:56 PDT 2009,NO_QUERY,pvesey,Good morning everyone
584283,0,2215224130,Wed Jun 17 17:50:53 PDT 2009,NO_QUERY,BastaYaGuate,and excuses by the writing but I do not speak ...
886335,4,1686707249,Sun May 03 06:29:08 PDT 2009,NO_QUERY,c_cooper88,@alibelle yes thanks why am I very odd then?
1257244,4,1997674986,Mon Jun 01 17:15:44 PDT 2009,NO_QUERY,AlannaLessThan3,My favorite mascara finally dried up..*cry*. B...


In [8]:
df.info() # shows the columns, and their types.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 160000 entries, 75279 to 1505125
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   target  160000 non-null  int64 
 1   ids     160000 non-null  int64 
 2   date    160000 non-null  object
 3   flag    160000 non-null  object
 4   user    160000 non-null  object
 5   text    160000 non-null  object
dtypes: int64(2), object(4)
memory usage: 8.5+ MB


In [9]:
df.shape

(160000, 6)

In [10]:
df['target']=df['target'].replace(4,1) 

In [11]:
data = df['text']
labels = np.array(df['target'])

In [12]:
data.iloc[1] # displaying a row of the data

'Good morning everyone '

In [13]:
stop = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stop.update(punctuation)

#Removing the stopwords from text
def split_into_words(text):
    # split into words by white space
    words = text.split()
    return words

def to_lower_case(words):
    # convert to lower case
    words = [word.lower() for word in words]
    return words

def remove_punctuation(words):
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    stripped = [re_punc.sub('', w) for w in words]
    return stripped

def keep_alphabetic(words):
    # remove remaining tokens that are not alphabetic
    words = [word for word in words if word.isalpha()]
    return words

def remove_stopwords(words):
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    return words

def to_sentence(words):
    # join words to a sentence
    return ' '.join(words)
def tweet(words):
    tweet_tokenizer = nltk.tokenize.TweetTokenizer(strip_handles=True,reduce_len=True)
    tweet = tweet_tokenizer.tokenize(words)
    return tweet
        

#Removing the noisy text
def denoise_text(text):
    words = split_into_words(text)
    words = to_lower_case(words)
    words = remove_punctuation(words)
    words = keep_alphabetic(words)
    words = remove_stopwords(words)
    return to_sentence(words)

In [14]:
data = data.apply(denoise_text)

In [15]:
print('Before: {}'. format(list(df['text'][:2])))
print('---')
print('After: {}'. format(list(data[:2])))

Before: ['@pixieoncb haha WTF that is very random of Mrs Vall but then again you were quite sick ', 'Good morning everyone ']
---
After: ['pixieoncb haha wtf random mrs vall quite sick', 'good morning everyone']


In [16]:
# Split in training validation and test sets
X_train, X_test, y_train, y_test = test = train_test_split(data, labels,test_size=0.20,
                                                           random_state=1,
                                                           stratify = labels)

In [17]:
# lets take 10k words in num_words
tokenizer = Tokenizer(num_words=10000, oov_token = '<OOV>')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
VOCAB_SIZE = len(word_index)+1
VOCAB_SIZE

116369

In [18]:
maxlen = max([len(x) for x in X_train]) # return 175 which is too big for tweets data.
maxlen = 50

In [19]:
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded_sequences = pad_sequences(train_sequences,maxlen=maxlen,padding='post',truncating='post')
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded_sequences = pad_sequences(test_sequences,maxlen=maxlen,padding='post',truncating='post')

In [20]:
len(train_padded_sequences[0])

50

In [21]:
embedding_dim = 32

In [22]:
model = tf.keras.Sequential([
        tf.keras.layers.Embedding(VOCAB_SIZE+1, embedding_dim, input_length=maxlen),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.30),
        tf.keras.layers.Dense(embedding_dim,activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.30),
        tf.keras.layers.Dense(8,activation='relu'),
        tf.keras.layers.Dense(1,activation='sigmoid'),
    ])

model.compile(loss = 'binary_crossentropy',
                optimizer = 'adam',
                metrics = ['accuracy'])

In [23]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 32)            3723840   
                                                                 
 bidirectional (Bidirectiona  (None, 32)               6272      
 l)                                                              
                                                                 
 flatten (Flatten)           (None, 32)                0         
                                                                 
 batch_normalization (BatchN  (None, 32)               128       
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense (Dense)               (None, 32)                1

In [24]:
history =    model.fit(train_padded_sequences,
                                               y_train,
                                               validation_data = (test_padded_sequences, y_test),
                                               epochs = 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [30]:
print(X_test.iloc[9],'label: ;',y_test[9])

ohken pity walmart around corner every walmart germany closed label: ; 0


In [65]:

keras_file='models/SavedModel.h5'
keras_path=''

In [66]:
tf.keras.models.save_model(model, keras_file)

In [76]:
#!pip3 install h5py
import tensorflow as tf
from tensorflow import keras
model = keras.models.load_model(keras_file)
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.experimental_new_converter=True
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS,
tf.lite.OpsSet.SELECT_TF_OPS]

tfmodel = converter.convert()
open('sentimental_model.tflite', 'wb').write(tfmodel)
#import tensorflow as tf

# Convert the model
#converter = tf.lite.TFLiteConverter.from_saved_model('models') # path to the SavedModel directory
#tflite_model = converter.convert()
#open('model.tflite', 'wb').write(tflite_model)




INFO:tensorflow:Assets written to: C:\Users\USER\AppData\Local\Temp\tmp8wcjwxj_\assets


INFO:tensorflow:Assets written to: C:\Users\USER\AppData\Local\Temp\tmp8wcjwxj_\assets


3751440