In [None]:
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import seaborn as sb
import plotly.express as px
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve,roc_auc_score,auc
from sklearn.metrics import confusion_matrix, classification_report


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import gensim
import gensim.downloader

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Activation, Dropout, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import plot_model
  


import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the 'glove-twitter-200' vectors through gensim's downloader API.
# They are used further down to fill a 200-column embedding matrix.
glove_vectors = gensim.downloader.load('glove-twitter-200')

In [None]:
# The CSV has no header row: its first line is already a tweet record.
# Reading with header=None and explicit names keeps that record as data
# instead of consuming it as column labels.
data = pd.read_csv(
    r"training.1600000.processed.noemoticon.csv",
    encoding='latin-1',
    header=None,
    names=["target", "id", "date", "query", "user", "text"],
)

In [None]:
# Quick sanity check of the loaded frame: (rows, columns).
data.shape

In [None]:
# Give the columns semantic names. The keys below are the literal values of
# the first record in the CSV; labels that are not present among the frame's
# columns are ignored by rename (pandas default errors='ignore').
data.rename(columns = {"0":"target","1467810369":"id","Mon Apr 06 22:19:45 PDT 2009" :"date","NO_QUERY":"query","_TheSpecialOne_":"user","@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D":"text"},inplace=True)

In [None]:
# Preview the first ten rows after renaming.
data.head(10)

In [None]:
# Numeric class codes used in the raw data -> human-readable sentiment names.
decode_map = dict(zip((0, 2, 4), ("NEGATIVE", "NEUTRAL", "POSITIVE")))


def decode_sentiment(label):
    """Translate a numeric class code into its sentiment name.

    ``label`` is coerced with ``int()`` first, so ints, integral floats and
    numeric strings are all accepted. A code that is not a key of
    ``decode_map`` raises ``KeyError``.
    """
    code = int(label)
    return decode_map[code]

In [None]:
# Replace the numeric class codes with their sentiment names.
# Not idempotent: a second run would try int() on the already-decoded strings.
data["target"] = data["target"].apply(decode_sentiment)

In [None]:
# Class balance after decoding the labels.
data["target"].value_counts()

In [None]:
# English stop words as a set, for fast membership tests inside cleaning().
# NOTE(review): requires the NLTK 'stopwords' corpus to be available locally;
# no download call is visible in this notebook.
stop_words = set(stopwords.words('english'))

In [None]:
def cleaning(x):
    """Tokenize a tweet and keep only lowercase, alphabetic, non-stop-word tokens.

    Parameters
    ----------
    x : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving tokens, lowercased and joined with single spaces
        (an empty string when nothing survives).
    """
    kept = []
    for token in word_tokenize(x):
        # isalpha() already rejects any token containing "@", digits or
        # punctuation, so no separate mention check is needed.
        if not token.isalpha() or token == "RT":
            continue
        lowered = token.lower()
        # Compare the lowercased form: NLTK's English stop-word list is
        # lowercase, so testing the raw token let "The"/"AND" slip through
        # and end up in the output as "the"/"and".
        if lowered in stop_words or lowered == "http":
            continue
        kept.append(lowered)
    return " ".join(kept)

In [None]:
# Clean every tweet in place (tokenize, filter, lowercase) via cleaning().
data["text"] = data["text"].apply(cleaning)

In [None]:
# Work on a copy so the cleaned frame `data` stays untouched by later steps.
data2=data.copy()

In [None]:
# Preview the cleaned text and decoded labels.
data2.head()

In [None]:
# 80/20 train/test split; random_state is fixed so the split is reproducible.
train, test = train_test_split(data2,test_size = 0.2,random_state=42)

In [None]:
# Build the word index from the training tweets only, so the test split
# does not contribute to the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train.text)

# One more than the number of indexed words; presumably the extra slot is for
# index 0, which pad_sequences uses as padding value by default — TODO confirm.
vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

In [None]:
# Convert each split's tweets to integer sequences and pad them to a fixed
# length of 300, the same length the embedding layer is configured with.
x_train, x_test = (
    pad_sequences(tokenizer.texts_to_sequences(split.text), maxlen=300)
    for split in (train, test)
)

In [None]:
# Distinct labels seen in the training split, plus "NEUTRAL" appended by hand.
# NOTE(review): `labels` is only displayed here; it is not referenced again in
# the cells visible in this notebook.
labels = list(train.target.unique())
labels.append("NEUTRAL")
labels

In [None]:
# Fit the label encoder on the training labels only.
encoder = LabelEncoder()
encoder.fit(list(train.target))

# Encode both splits and shape them as column vectors (n_samples, 1).
y_train = encoder.transform(train.target.tolist()).reshape(-1, 1)
y_test = encoder.transform(test.target.tolist()).reshape(-1, 1)

In [None]:
# Sanity check: features and labels of each split must have matching row counts.
print("x_train", x_train.shape)
print("y_train", y_train.shape)
print()
print("x_test", x_test.shape)
print("y_test", y_test.shape)

In [None]:
# Row i holds the pre-trained vector of the word with tokenizer index i.
# Words missing from the pre-trained vocabulary keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, 200))
for word, i in tokenizer.word_index.items():
    # Membership test directly on the mapping (no .keys() view needed).
    if word in glove_vectors.key_to_index:
        # get_vector() replaces the deprecated word_vec() accessor.
        embedding_matrix[i] = glove_vectors.get_vector(word)

print(embedding_matrix.shape)

In [None]:
# Embedding layer initialised from embedding_matrix and frozen
# (trainable=False). input_length=300 matches the maxlen used when padding.
embedding_layer = Embedding(vocab_size, 200, weights=[(embedding_matrix)], input_length=300, trainable=False)

In [None]:
# Frozen embeddings -> flatten -> dropout -> small dense head with a single
# sigmoid output unit.
model = Sequential([
    embedding_layer,
    Flatten(),
    Dropout(0.5),
    Dense(64, activation="relu"),
    Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit of the model.
model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

In [None]:
# Render a diagram of the layer graph.
plot_model(model)

In [None]:
# Train for 10 epochs in batches of 1024, holding out 10% of the training
# data for validation. The returned object is kept in `history`.
history = model.fit(x_train, y_train,
                    batch_size=1024,
                    epochs=10,
                    validation_split=0.1,
                    verbose=1)

In [None]:
# Raw sigmoid outputs of the model for every test row.
y_predict_prob = model.predict(x_test)

In [None]:
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 class predictions.
# Uses y_predict_prob, the name assigned by the prediction cell; the previous
# y_test_predict_prob was never defined and raised NameError.
y_predict = np.where(y_predict_prob > 0.5, 1, 0)

In [None]:
# Confusion matrix of true vs. predicted classes, drawn as an annotated
# heatmap; fmt=".2f" prints every cell value with two decimals.
sb.heatmap(confusion_matrix(y_test, y_predict), annot = True,fmt=".2f")

In [None]:
# Text report of per-class metrics for the thresholded predictions.
print(classification_report(y_test, y_predict))

In [None]:
# ROC points computed from the raw scores (flattened to 1-D), not from the
# thresholded 0/1 predictions.
fpr, tpr, thresholds= roc_curve(y_test,y_predict_prob.flatten())

In [None]:
# Area plot of the ROC curve, with the AUC shown in the title.
fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

# Tie the y-axis scale to the x-axis (ratio 1) so the plot area stays square.
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

In [None]:
# Evaluate on the test split. With the compile settings used above, score[0]
# is the loss and score[1] the accuracy metric.
score = model.evaluate(x_test, y_test, batch_size=1024)
print()
print("ACCURACY:",score[1])
print("LOSS:",score[0])

### Saving

In [None]:
# Persist the trained model under the path "sentiment".
# NOTE(review): the on-disk format chosen for an extension-less path depends
# on the installed Keras version — confirm it matches the load call below.
model.save("sentiment")

In [None]:
import pickle

# Persist the fitted tokenizer next to the model, so inference can map words
# to the same integer ids that were used for training.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Importing

In [None]:
# Reload the model from the "sentiment" path written by the saving section.
# This rebinds the name `model`.
from tensorflow import keras
model = keras.models.load_model('sentiment')

In [None]:
import pickle
# Reload the fitted tokenizer; this rebinds the name `tokenizer`.
# SECURITY: pickle.load can execute arbitrary code contained in the file, so
# only load a tokenizer.pickle that you produced yourself or otherwise trust.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

### Prediction

In [None]:
def decode_sentiment(score, include_neutral=True):
    """Map a model score to a sentiment label.

    With ``include_neutral`` truthy, scores ``<= 0.4`` are "NEGATIVE", scores
    ``>= 0.7`` are "POSITIVE" and anything else is "NEUTRAL". Otherwise the
    score is split at 0.5: below is "NEGATIVE", the rest is "POSITIVE".

    Note: this reuses the name of the label-decoding helper defined earlier in
    the notebook and replaces it from this cell onward.
    """
    if not include_neutral:
        # Two-way decision with a single cut-off.
        if score < 0.5:
            return "NEGATIVE"
        return "POSITIVE"

    # Three-way decision: only confident scores leave the neutral band.
    if score <= 0.4:
        return "NEGATIVE"
    if score >= 0.7:
        return "POSITIVE"
    return "NEUTRAL"

In [None]:
def predict(text, include_neutral=True):
    """Score a single piece of text with the trained model.

    Parameters
    ----------
    text : str
        The text to classify.
    include_neutral : bool, default True
        Forwarded to ``decode_sentiment``; selects the three-way or the
        two-way labelling.

    Returns
    -------
    dict
        ``{"label": <sentiment name>, "score": <float model output>}``.
    """
    # NOTE(review): the training tweets went through cleaning() before being
    # tokenized, while `text` is fed to the tokenizer as-is — confirm whether
    # the same preprocessing should be applied here.
    padded = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=300)

    # The model ends in a single-unit layer and receives one row, so the
    # output holds one value at [0][0]. Pull it out as a plain Python float:
    # calling float() on a 1-element array is deprecated since NumPy 1.25.
    score = float(model.predict([padded])[0][0])

    label = decode_sentiment(score, include_neutral=include_neutral)

    return {"label": label, "score": score}

In [None]:
# Smoke test of the prediction helper on an arbitrary example sentence.
predict("this is hypothethicalject",include_neutral=True)