In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from tqdm import tqdm
from tqdm import tqdm_notebook
import os
%matplotlib inline
from keras.layers import Input, Embedding, LSTM, Dropout, BatchNormalization, Dense, concatenate, Flatten, Conv1D, MaxPool1D, LeakyReLU, ELU, SpatialDropout1D, MaxPooling1D, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, load_model
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import pickle

#1.Loading **Data**

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
import nltk
from nltk.corpus import stopwords
nltk.download('all') 
stopwords=set(stopwords.words('english'))

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

In [47]:
def fill_missing_values(df):
    
    df['name']=df['name'].fillna('none')
    df['item_description']=df['item_description'].fillna('none')
    df['brand_name']=df['brand_name'].fillna('unknown')
    df['category_name']=df['category_name'].fillna('none')
    return df
def split_text(text):
    if text=='none':
        return ["unknown", "unknown", "unknown"]
    return text.split("/")

def split_categories(df):
    df['main_cat'], df['subcat_1'], df['subcat_2'] = zip(*df['category_name'].apply(lambda x: split_text(x)))
    df = df.drop('category_name', axis=1)
    return df
def counting_stopwords(data):
    count_words=[]
    for i in data['item_description']:
        count=0
        for j in i.split(' '):
            if j in stopwords:
                count+=1
        count_words.append(count)
    data['count_of_stopwords']=count_words
    return data
# ref - www.appliedaicourse.com/
''' This code performs text processing by cleaning text including 
removing stopwords, reemoving special characters, performing, word decontraction etc. '''

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

# https://gist.github.com/sebleier/554280
# we are removing the words from the stop words list: 'no', 'nor', 'not'
stopwords= ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]

def text_preprocess(data):
    preprocessed = []
    # tqdm is for printing the status bar
    for sentance in data:
        sent = decontracted(sentance)
        sent = sent.replace('\\r', ' ')
        sent = sent.replace('\\"', ' ')
        sent = sent.replace('\\n', ' ')
        sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
        # https://gist.github.com/sebleier/554280
        sent = ' '.join(e for e in sent.split() if e not in stopwords)
        preprocessed.append(sent.lower().strip())

    return preprocessed
def get_item_des_len(data):
    data['length']=data['item_description'].str.len()
    return data
    
def brand_name_category(data):
    cat_brandname=[]
    for i in data['brand_name']:
        if i!='unknown':
            cat_brandname.append(1)
        else:
            cat_brandname.append(0)
    data['brand_name_present']=cat_brandname

    return data

In [48]:

def fdata_pipeline(df):
    #this will do all the necessary preprocessing
    print()
    # print("Filling missing values...")
    df = fill_missing_values(df)
    


    # print("Splitting categories...")
    df = split_categories(df)

    # print('counting stop words.....')
    df=counting_stopwords(df)

    # print("pre-processing text data...")
    df['preprocessed_item_des'] = text_preprocess(df['item_description'])


    # print('Getting word lengths')
    df=get_item_des_len(df)
    # print('brand_name_present')
    df=brand_name_category(df)
    

    
    return df

In [56]:
def text_embeddings(text, tokenizer, max_len_doc):
    # Word Tokenizer
    encoded_docs_train = tokenizer.texts_to_sequences(text)
    text_padded = pad_sequences(encoded_docs_train, maxlen=max_len_doc, padding='post')

    return text_padded

    
def categorical_embeddings(cat_data, le):
    cat_data = cat_data.map(lambda s: '<unknown>' if s not in le.classes_ else s)
    le.classes_ = np.append(le.classes_, '<unknown>')
    encoded_cat = le.transform(cat_data.values)
    
    return encoded_cat

In [57]:
def pre_process_convlstm_input(df, desc_tokenizer, name_tokenizer, desc_max_len_doc, name_max_len_doc, brd_nm, sca_subcat1, sca_subcat2, sca_subcat3,scr_length,
                   scr_stopw,itm_conid,scr_bnpresent,scr_ship):
   #textdata embedding
    desc_text_padded = text_embeddings(df['preprocessed_item_des'].apply(str), desc_tokenizer, desc_max_len_doc)
    name_text_padded = text_embeddings(df['name'].apply(str), name_tokenizer, name_max_len_doc)
   


    #categorical data
    bn_cat = categorical_embeddings(df['brand_name'], brd_nm)
    main_cat = categorical_embeddings(df['main_cat'], sca_subcat1)
    sc2_cat = categorical_embeddings(df['subcat_1'], sca_subcat2)
    sc3_cat = categorical_embeddings(df['subcat_2'], sca_subcat3)
    
    #standard scaler
    a=scr_length.transform(np.array(df['length']).reshape(-1,1))
    b=scr_stopw.transform(np.array(df['count_of_stopwords']).reshape(-1,1))
    c=itm_conid.transform(np.array(df['item_condition_id']).reshape(-1,1))
    d=scr_bnpresent.transform(np.array(df['brand_name_present']).reshape(-1,1))
    e=scr_ship.transform(np.array(df['shipping']).reshape(-1,1))



    #converting above to numerical data
    X_numerical_feat=np.concatenate((a,b,c,d,e),axis=1)

    test_full =[name_text_padded,desc_text_padded,main_cat,sc2_cat,sc3_cat,bn_cat,X_numerical_feat]

    
    return test_full


In [33]:
import tensorflow as tf

model_best=tf.keras.models.load_model('/content/drive/MyDrive/pavn')

In [73]:
def take_main_input(X):
    df = pd.DataFrame(X)
    df.columns = ['name', 'item_condition_id', 'category_name', 'brand_name', 'shipping', 'item_description']
    
    #loading all the pickel files from model
    with open('/content/drive/MyDrive/pavn/tokenizer_main.pickle', 'rb') as handle:
        tokenizer_main = pickle.load(handle)
    with open('/content/drive/MyDrive/pavn/tokenizer1.pickle', 'rb') as handle:
        tokenizer_1 = pickle.load(handle)
    from pickle import load
    sca_subcat1 = load(open('/content/drive/MyDrive/pavn/scalersubcat1.pkl', 'rb'))
    sca_subcat2 = load(open('/content/drive/MyDrive/pavn/scalersubcat2.pkl', 'rb'))
    sca_subcat3 = load(open('/content/drive/MyDrive/pavn/scalersubcat3.pkl', 'rb'))
    brd_nm = load(open('/content/drive/MyDrive/pavn/brand_name.pkl', 'rb'))
    #################################################
    scr_stopw = load(open('/content/drive/MyDrive/pavn/scalar_stopwords.pkl', 'rb'))
    itm_conid = load(open('/content/drive/MyDrive/pavn/scalar_item_con_id.pkl', 'rb'))
    scr_ship = load(open('/content/drive/MyDrive/pavn/scalar_shipping.pkl', 'rb'))
    scr_bnpresent = load(open('/content/drive/MyDrive/pavn/scalar_brandname_pres.pkl', 'rb'))
    scr_length=load(open('/content/drive/MyDrive/pavn/scalar_length.pkl','rb'))



    
    df = fdata_pipeline(df)
    #max length of 253 for item desc and 17 for name is got from the model

    test_full=pre_process_convlstm_input(df,tokenizer_1,tokenizer_main,253,17,brd_nm,sca_subcat1,sca_subcat2,sca_subcat3,scr_length,scr_stopw,itm_conid,scr_bnpresent,scr_ship)
    

    pred=model_best.predict(test_full)
    
   

    pred = np.exp(pred)
    return pred

In [75]:
prediction = take_main_input([["MLB Cincinnati Reds T Shirt Size XL", 3, "Men/Tops/T-shirts", np.nan, 1, "No description yet"]])
print("The product price estimation is $ {}".format(prediction.tolist()[0][0]))


The product price estimation is $ 14.763649940490723


In [17]:
!pip install flask-ngrok

Collecting flask-ngrok
  Downloading https://files.pythonhosted.org/packages/af/6c/f54cb686ad1129e27d125d182f90f52b32f284e6c8df58c1bae54fa1adbc/flask_ngrok-0.0.25-py3-none-any.whl
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [18]:
import flask

In [19]:
from flask import Flask
from flask import request
from flask_ngrok import run_with_ngrok

In [78]:
from flask import  Flask,jsonify,request
app = Flask(__name__)
run_with_ngrok(app)


@app.route('/')
def hello_world():
    return 'Hello World!'


@app.route('/index')
def index():
    return flask.render_template('index.html')


@app.route('/predict', methods=['POST'])
def predict():
    
    

    to_predict_list = request.form.to_dict()
    to_predict_list = list(to_predict_list.values())
    print(to_predict_list)
    
    
    lst1=to_predict_list
    lst2=[]
    for i in range(len(lst1)):
        if i==0:
            lst2.append(lst1[0])
        if i==1:
            lst2.append(int(lst1[1]))
            
        if i==2:
            lst2.append(lst1[2])
        if i==3:
            lst2.append(lst1[3])
        if i==4:
            lst2.append(int(lst1[4]))
        if i==5:
            lst2.append(lst1[5])
            
    print(lst2)
    
    prediction11 = take_main_input([lst2])
    # print(prediction11.tolist()[0][0])

    return flask.render_template('index.html', prediction_text='The price of the product estimated to be $ {}'.format(prediction11.tolist()[0][0]))

    
app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://8579ee9bf895.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


127.0.0.1 - - [23/Jan/2021 20:33:35] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [23/Jan/2021 20:33:36] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
127.0.0.1 - - [23/Jan/2021 20:33:39] "[37mGET /index HTTP/1.1[0m" 200 -


['AVA-VIV Blouse', '1', 'Women/Tops &Blouses/Blouse', 'Target', '1', 'Adorable top with a hint of lace and a key hole']
['AVA-VIV Blouse', 1, 'Women/Tops &Blouses/Blouse', 'Target', 1, 'Adorable top with a hint of lace and a key hole']


127.0.0.1 - - [23/Jan/2021 20:33:54] "[37mPOST /predict HTTP/1.1[0m" 200 -



