In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Directory that holds the three labelled-sentence files of the Kaggle dataset.
DATA_DIR = '/kaggle/input/sentiment-labelled-sentences/sentiment labelled sentences'


def load_labelled(filename, sep='\t'):
    """Read one labelled-sentences file into a two-column DataFrame.

    Parameters
    ----------
    filename : str
        Name of the file inside ``DATA_DIR``.
    sep : str, default '\\t'
        Field separator passed to ``pd.read_csv``. pandas interprets a
        separator longer than one character as a regular expression.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``sentance`` and ``sentiment``.
    """
    return pd.read_csv(f"{DATA_DIR}/{filename}",
                       sep=sep,
                       lineterminator='\n',
                       names=['sentance', 'sentiment'])


amazon = load_labelled('amazon_cells_labelled.txt')
# Raw string: in a normal literal '\s' is an invalid escape sequence, which
# recent Python versions warn about. The regex splits on a tab or three
# whitespace characters.
imdb = load_labelled('imdb_labelled.txt', sep=r'\t|\s{3}')
yelp = load_labelled('yelp_labelled.txt')
print(f"amazon = {amazon.shape}, imdb = {imdb.shape}, yelp = {yelp.shape}")

In [None]:
# Stack the three corpora into one frame with a fresh 0..N-1 index.
# pd.concat keeps each column's dtype, whereas routing the frames through
# np.concatenate builds a single object array and loses that information.
data = pd.concat([amazon, imdb, yelp], ignore_index=True)
data.shape

In [None]:
# Show only the first rows; a bounded preview keeps the saved notebook small.
data.head()

In [None]:
# Normalise case so that later word-level steps compare tokens consistently.
lowercased = data['sentance'].str.lower()
data['sentance'] = lowercased
data

In [None]:
!pip install pyspellchecker
from spellchecker import SpellChecker
sc = SpellChecker()

In [None]:
import string

# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call to remove_punc.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(sentance):
    """Return ``sentance`` with all ``string.punctuation`` characters removed.

    Parameters
    ----------
    sentance : str
        Text to clean.

    Returns
    -------
    str
        The text without punctuation; all other characters are kept as-is.
    """
    return sentance.translate(_PUNCTUATION_TABLE)

print(remove_punc('In this sentance, our job, my lord, is to remove all the punctuation!'))

In [None]:
def correct(sentance):
    """Spell-correct each whitespace-separated word of ``sentance``.

    Words reported as unknown by the module-level spell checker ``sc`` are
    replaced by ``sc.correction(word)``; known words are kept unchanged.
    Words for which the checker returns ``None`` are dropped from the result.

    Parameters
    ----------
    sentance : object
        Text to correct; it is converted with ``str()`` before splitting.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Split once so the unknown-word lookup and the rewrite see the same tokens.
    words = str(sentance).split()
    misspelled = sc.unknown(words)
    # By default the checker reports unknown words lower-cased, so compare the
    # lower-cased token as well; otherwise capitalised typos are never fixed.
    corrected = [
        sc.correction(w) if (w in misspelled or w.lower() in misspelled) else w
        for w in words
    ]
    return " ".join(w for w in corrected if w is not None)

print(correct('its spellinf timr'))

In [None]:
from nltk.corpus import stopwords

# English stop words, fetched once and stored in a set. The previous code
# evaluated stopwords.words('english') for every token and searched the
# resulting list linearly.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def remove_stop(sentance):
    """Return ``sentance`` without English stop words.

    Parameters
    ----------
    sentance : str
        Text to filter; it is split on whitespace and compared token by token
        (case-sensitively) against NLTK's English stop-word list.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    kept = [w for w in sentance.split() if w not in _ENGLISH_STOP_WORDS]
    return " ".join(kept)

print(remove_stop('i know there is no longer stop, so why even bother'))

In [None]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def stemming(sentance):
    """Stem every whitespace-separated token of ``sentance``.

    Each token is passed through the module-level ``stemmer`` and the results
    are joined back together with single spaces.
    """
    return " ".join(map(stemmer.stem, sentance.split()))

print(stemming('And now it is time to finally stem this hell of a sentance, am I right, fellas?'))

In [None]:
from nltk import ngrams

def split_into_ngrams(sentance, n):
    """Return the list of all ``n``-grams of the token sequence ``sentance``.

    The n-grams are produced by ``nltk.ngrams`` and materialised into a list.
    """
    return list(ngrams(sentance, n))

np.asarray(split_into_ngrams("Wow, I can't believe there are ngrams here!".split(), 5))

In [None]:
# Features (sentence text) and targets (sentiment label) as separate Series.
x, y = (data[column] for column in ('sentance', 'sentiment'))
x

In [None]:
# Strip punctuation from every sentence.
x = x.map(remove_punc)


In [None]:
# Spell-correct every sentence.
x = x.map(correct)
x

In [None]:
# Drop stop words from every sentence.
x = x.map(remove_stop)
x

In [None]:
# Reduce every word of every sentence to its stem.
x = x.map(stemming)
x

In [None]:
def replace_numbers(sentance):
    """Blank out purely numeric entries of an iterable of tokens.

    Every element whose ``str()`` form consists only of digits is replaced by
    an empty string; all other elements are returned unchanged, in order.
    """
    cleaned = []
    for token in sentance:
        if str(token).isdigit():
            cleaned.append('')
        else:
            cleaned.append(token)
    return cleaned

In [None]:
# Tokenise each sentence on whitespace and find the longest token count,
# which is used as the padding length.
tokens_x = x.map(str.split)
lens = list(map(len, tokens_x))
m = np.asarray(lens).max()
print(m)

In [None]:
def _pad_tokens(tokens):
    """Blank numeric tokens, then right-pad with '' up to ``m`` entries.

    Padding with empty strings directly avoids np.pad, which turns an empty
    token list into float zeros that replace_numbers does not blank out.
    """
    return replace_numbers(tokens) + [''] * (m - len(tokens))


padded = tokens_x.map(_pad_tokens)
padded

In [None]:
# Window size used for the token n-grams.
n = 10


def _ngrams_of_row(tokens):
    """Return the ``n``-grams of one padded token list."""
    return split_into_ngrams(tokens, n)


ngram_x = padded.map(_ngrams_of_row)
ngram_x[0]

In [None]:
# Show only the first rows; each entry is a long list of n-gram tuples.
ngram_x.head()

In [None]:
from keras.layers import TextVectorization
import tensorflow as tf
# The cleaned sentences as a string tensor, used to fit the vectorizer.
data_to_adapt = tf.constant(x)

# n-gram setting handed to the vectorizer. It was previously assigned here
# but the literal 5 was passed instead, so changing n had no effect.
n = 5
vectorizer = TextVectorization(output_mode='tf-idf', ngrams=n)
vectorizer.adapt(data_to_adapt)

In [None]:
# Sanity check: TF-IDF vector of the first sentence as a NumPy array.
first_vector = vectorizer(x[0])
first_vector.numpy()

In [None]:
from keras.layers import Input, Dropout, Dense, Activation
from keras import Model
from keras.utils import plot_model


# Widths of the successive hidden layers; each is a ReLU Dense layer
# followed by Dropout(0.5).
hidden_units = (4096, 1024, 512, 128, 64)

# One input feature per vocabulary entry of the fitted vectorizer.
inputs = Input(shape=(vectorizer.vocabulary_size(), ))
layer = inputs
for units in hidden_units:
    layer = Dense(units, activation='relu')(layer)
    layer = Dropout(0.5)(layer)
# Single sigmoid unit producing the sentiment score.
outputs = Dense(1, activation='sigmoid')(layer)

model = Model(inputs, outputs)
plot_model(model, show_shapes=True)

In [None]:
from keras.optimizers import Adam
# The network ends in a single sigmoid unit, so binary cross-entropy is the
# matching loss (labels are presumably 0/1 -- confirm against the data).
# Mean squared error on a sigmoid output gives weak gradients when the
# prediction saturates.
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
from sklearn.model_selection import train_test_split


def _to_tfidf(sentence):
    """Return the vectorizer output for one sentence as a NumPy array."""
    return vectorizer(sentence).numpy()


x = x.map(_to_tfidf)

In [None]:
# Collect the per-sentence vectors into one float32 array.
x = np.asarray(list(map(lambda vec: vec.astype('float32'), x)))

In [None]:
# Labels as a float32 NumPy array.
y = np.asarray(y, dtype=np.float32)

In [None]:
# Hold out 20% of the data for testing, then 20% of the remainder for
# validation. Both splits are stratified on the labels so the class balance
# is the same in every subset; the validation split was previously not.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=1234)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=1234)

In [None]:
# validation_data is documented as a tuple (x_val, y_val); pass it as one
# rather than relying on a list being accepted.
model.fit(x=x_train, y=y_train, validation_data=(x_val, y_val), epochs=20)

In [None]:
# Loss and accuracy on the held-out test split.
model.evaluate(x=x_test, y=y_test)