In [5]:
import numpy as np
import pandas as pd
import string
import regex
import emoji
from nltk.corpus import stopwords
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])  

In [4]:
# Read airline_sentiment_analysis.csv into a DataFrame. The bare name on the
# last line makes the frame this cell's displayed output.
data = pd.read_csv("airline_sentiment_analysis.csv")
data

Unnamed: 0.1,Unnamed: 0,airline_sentiment,text
0,1,positive,@VirginAmerica plus you've added commercials t...
1,3,negative,@VirginAmerica it's really aggressive to blast...
2,4,negative,@VirginAmerica and it's a really big bad thing...
3,5,negative,@VirginAmerica seriously would pay $30 a fligh...
4,6,positive,"@VirginAmerica yes, nearly every time I fly VX..."
...,...,...,...
11536,14633,negative,@AmericanAir my flight was Cancelled Flightled...
11537,14634,negative,@AmericanAir right on cue with the delays👌
11538,14635,positive,@AmericanAir thank you we got on a different f...
11539,14636,negative,@AmericanAir leaving over 20 minutes Late Flig...


In [8]:
import keras
import numpy as np
import pandas as pd
import string
import regex
import emoji
from nltk.corpus import stopwords
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])  
from keras.models import Sequential
from keras import layers
from keras.preprocessing.text import Tokenizer, tokenizer_from_json
from keras_preprocessing.sequence import pad_sequences

# Read the labelled texts used for training further down. The bare `data`
# expression that used to follow was dropped: it is not the last statement of
# the cell, so it was evaluated and discarded without displaying anything.
data = pd.read_csv("airline_sentiment_analysis.csv")

class SentimentPrediction():
    """Binary (positive / negative) sentiment classifier for short texts.

    Bundles text cleaning, a Keras ``Tokenizer`` and a small bidirectional
    LSTM model behind ``train`` and ``predict``.
    """

    # Encoding applied to the training labels.
    LABEL_TO_ID = {"positive": 1, "negative": 0}
    # Padding length used by predict() when neither the caller nor the model
    # provides one; equals the default ``max_sent_len`` of train().
    DEFAULT_MAX_SENT_LEN = 100

    def __init__ (self, do_training=True, model = None, json_str_tokenizer = None):
        """Create a predictor, optionally around an already trained model.

        Args:
            do_training: When False, ``train`` refuses to run and
                ``preprocess_data`` never encodes labels.
            model: Trained Keras model that ``predict`` falls back to.
            json_str_tokenizer: JSON string of the fitted tokenizer (as
                returned by ``train``) that ``predict`` falls back to.
        """
        # Always define both attributes: predict() reads them, and leaving
        # them unset made it fail with AttributeError on a fresh instance.
        self.model = model
        self.tokenizer = json_str_tokenizer
        self.do_training = do_training

    def preprocess_data(self, text_array, label_array=None):
        """Clean the texts and, when training, encode the labels.

        Each text goes through: emoji -> emoji name, removal of @mentions and
        URLs, lower-casing, punctuation removal, English stop-word removal and
        spaCy lemmatisation.

        Args:
            text_array: Iterable of strings.
            label_array: Optional iterable of "positive" / "negative" labels,
                one per text.

        Returns:
            tuple: ``(data_X, data_Y)``. ``data_X`` is an object-dtype numpy
            array of cleaned strings. ``data_Y`` is an int64 numpy array of
            1 (positive) / 0 (negative) when ``do_training`` is True and labels
            were given, otherwise ``None``.

        Raises:
            ValueError: If the number of labels differs from the number of
                texts, or a label is neither "positive" nor "negative".
        """
        texts = list(text_array)

        # Built once per call: a translation table that deletes punctuation
        # and a set of stop words (set membership instead of a list scan for
        # every token).
        punctuation_table = str.maketrans('', '', string.punctuation)
        stop_words = set(stopwords.words('english'))

        def clean(text):
            # Replace emojis with their textual names, padded by spaces.
            text = emoji.demojize(text, delimiters=(" ", " "))
            # Remove tags & urls.
            text = regex.sub(r'@\w+\S', '', text)
            text = regex.sub(r'http\S+', '', text)
            # Lower-case, then strip punctuation.
            text = text.lower().translate(punctuation_table)
            # Drop stop words.
            return " ".join(word for word in text.split() if word not in stop_words)

        cleaned = [clean(text) for text in texts]

        # Lemmatise; nlp.pipe streams all texts through spaCy in batches
        # instead of one nlp() call per row.
        lemmatized = [
            " ".join(token.lemma_ for token in doc)
            for doc in nlp.pipe(cleaned)
        ]
        data_X = np.array(lemmatized, dtype=object)

        data_Y = None
        if self.do_training and label_array is not None:
            labels = pd.Series(list(label_array), dtype="object")
            if len(labels) != len(texts):
                raise ValueError(
                    "label_array has %d entries but text_array has %d"
                    % (len(labels), len(texts))
                )
            encoded = labels.map(self.LABEL_TO_ID)
            # Labels missing from LABEL_TO_ID map to NaN; fail loudly instead
            # of handing NaN targets to the model.
            unknown = encoded.isna()
            if unknown.any():
                raise ValueError(
                    "Unsupported label(s): %s"
                    % sorted(set(labels[unknown].astype(str)))
                )
            data_Y = encoded.to_numpy(dtype="int64")

        return data_X, data_Y

    def train(
        self, text_array, label_array,
          max_sent_len = 100,
          max_words = 1000,
          batch_size=16,
          epochs=10,
          val_split=0.2
         ):
        """Fit a tokenizer and a bidirectional-LSTM classifier.

        The fitted model and the tokenizer JSON are also stored on the
        instance so that ``predict`` can be called without passing them again.

        Args:
            text_array: Iterable of raw training texts.
            label_array: Iterable of "positive" / "negative" labels.
            max_sent_len: Length every token sequence is padded/truncated to.
            max_words: ``num_words`` of the tokenizer and input size of the
                embedding layer.
            batch_size: Batch size passed to ``model.fit``.
            epochs: Number of epochs passed to ``model.fit``.
            val_split: ``validation_split`` passed to ``model.fit``.

        Returns:
            tuple | None: ``(model, tokenizer_json)``, or ``None`` (after
            printing a message) when the instance was created with
            ``do_training=False``.

        Raises:
            ValueError: If ``label_array`` is ``None`` or contains unsupported
                labels.
        """
        if not self.do_training:
            print("do_training parameter is set to False. Set it to True")
            return

        xtrain, ytrain = self.preprocess_data(text_array, label_array)
        if ytrain is None:
            raise ValueError("label_array is required for training")
        print("Preprocessing Done")

        tokenizer = Tokenizer(num_words=max_words)
        tokenizer.fit_on_texts(list(xtrain))
        sequences = tokenizer.texts_to_sequences(list(xtrain))
        tokenized_data = pad_sequences(sequences, maxlen=max_sent_len)

        model = Sequential([
            layers.Embedding(max_words, 40, input_length=max_sent_len),
            layers.Bidirectional(layers.LSTM(20,dropout=0.5)),
            layers.Dense(1,activation='sigmoid'),
        ])

        METRICS = [
          keras.metrics.Precision(name='precision'),
          keras.metrics.Recall(name='recall'),
          keras.metrics.AUC(name='auc'),
        ]

        model.compile(
          optimizer=keras.optimizers.Adam(learning_rate=1e-3),
          loss=keras.losses.BinaryCrossentropy(),
          metrics=METRICS)
        print("MODEL_SUMMARY")
        # summary() prints by itself and returns None, so wrapping it in
        # print() only added a stray "None" line to the output.
        model.summary()

        model.fit(
            tokenized_data, ytrain,
            batch_size = batch_size,
            epochs = epochs,
            validation_split = val_split,
        )

        tokenizer_json = tokenizer.to_json()
        # Remember the results so predict() works without explicit arguments.
        self.model = model
        self.tokenizer = tokenizer_json
        return model, tokenizer_json

    @staticmethod
    def _infer_max_sent_len(model, default):
        """Return the sequence length found in ``model.input_shape``, else ``default``."""
        try:
            length = model.input_shape[1]
        except (AttributeError, RuntimeError, ValueError, TypeError, IndexError, KeyError):
            return default
        if isinstance(length, int) and length > 0:
            return length
        return default

    def predict(self, text_array, model_= None, tokenizer_json= None, max_sent_len=None):
        """Classify the first text of ``text_array``.

        Args:
            text_array: Non-empty iterable of raw texts. Every text is
                preprocessed and scored, but only the label of the first one
                is returned.
            model_: Trained Keras model; defaults to ``self.model``.
            tokenizer_json: Tokenizer JSON string; defaults to
                ``self.tokenizer``.
            max_sent_len: Padding length. When omitted it is read from the
                model's input shape, falling back to ``DEFAULT_MAX_SENT_LEN``.

        Returns:
            str: "positive" if the predicted probability is above 0.5,
            otherwise "negative".

        Raises:
            ValueError: If no model or tokenizer is available, or
                ``text_array`` is empty.
        """
        # Resolve each argument on its own: passing only one of them used to
        # discard it and fall back to the instance attributes for both.
        if model_ is None:
            model_ = self.model
        if not tokenizer_json:
            tokenizer_json = self.tokenizer
        if model_ is None or not tokenizer_json:
            raise ValueError(
                "No model/tokenizer available: call train() first or pass "
                "model_ and tokenizer_json."
            )

        texts = list(text_array)
        if not texts:
            raise ValueError("text_array must contain at least one text")

        test_str, _ = self.preprocess_data(texts)

        if max_sent_len is None:
            # Pad to the length the model was built with rather than a
            # hard-coded 100, which is only right for the default training run.
            max_sent_len = self._infer_max_sent_len(model_, self.DEFAULT_MAX_SENT_LEN)

        tokenizer_ = tokenizer_from_json(tokenizer_json)
        sequences = tokenizer_.texts_to_sequences(list(test_str))
        tkd = pad_sequences(sequences, maxlen=max_sent_len)

        first_probability = np.ravel(model_.predict(tkd))[0]
        return "positive" if first_probability > 0.5 else "negative"


In [16]:
# Train on the full data set; keep the fitted model and the tokenizer JSON
# as notebook-level names for the cells that follow.
dgh = SentimentPrediction()
model, tokenizer_ = dgh.train(
    text_array=data["text"].tolist(),
    label_array=data["airline_sentiment"].tolist(),
)

Preprocessing Done
MODEL_SUMMARY
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 40)           40000     
                                                                 
 bidirectional_1 (Bidirectio  (None, 40)               9760      
 nal)                                                            
                                                                 
 dense_1 (Dense)             (None, 1)                 41        
                                                                 
Total params: 49,801
Trainable params: 49,801
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:

# Smoke-test the trained model on a single short text.
dgh.predict(text_array=["hi"], model_=model, tokenizer_json=tokenizer_)



'positive'

In [15]:
from fastapi import FastAPI
from pydantic import BaseModel
import pickle
import numpy as np
import pandas as pd
# FastAPI application object; the @app.post route below is registered on it.
app = FastAPI()

@app.post('/predict')
async def predict_species(text: str = "hi"):
    """Return the predicted sentiment of ``text``.

    The endpoint used to ignore the request and always classify the literal
    "hi". The text is now a parameter; its default keeps the old behaviour for
    callers that send nothing.

    Args:
        text: Text to classify. Declared as a plain ``str`` with a default, so
            FastAPI reads it from the ``text`` query parameter.

    Returns:
        dict: ``{'prediction': label}`` where ``label`` is the string returned
        by ``SentimentPrediction.predict`` ("positive" or "negative").
    """
    # `model` and `tokenizer_` are the notebook-level results of dgh.train(...).
    # do_training=False: prediction never needs labels encoded.
    dgh = SentimentPrediction(do_training=False)
    pred = dgh.predict([text], model, tokenizer_)
    return {
        'prediction': pred
    }