In [1]:
import pandas as pd
import numpy as np
import os,sys
import re
pwd = os.getcwd()

In [36]:
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers
import bert

In [3]:
# reading dataset
kb_df = pd.read_csv(pwd+"//kabitakitchen.csv")
kb_df

Unnamed: 0,id,commentText,Labels
0,Ugy_CBm-_CKA3YqrzcB4AaABAg,Pudina ptta nhi dalu to,7
1,Ugy9mx9nuTWJu4dRac14AaABAg,Chiken kacha tu ni rhy ga sis,7
2,Ugz8T2MKLYucL3dM9nh4AaABAg,"Hello mam, I love your all recipes.... 😋😋😋\nAl...",4
3,Ugx_1cCjRbCaDgL0FLF4AaABAg,Its awesome recipe plzz make handi chicken in ...,2
4,UgzLhKVAJ6NN3nZXyjN4AaABAg,Yeh jo measurement hai.........kitne logon ke ...,7
...,...,...,...
4895,UgjFXyC0Qhzk5ngCoAEC,i love chole...thank you kabitaji for sharing ...,1
4896,UghP3bitlJuM13gCoAEC,thnakyou mm,1
4897,UghztLZOqvedfXgCoAEC,thanks mam,1
4898,UggX5Fi2Y430zXgCoAEC,u r fabulous,4


In [4]:
kb_df.shape

(4900, 3)

In [5]:
def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [6]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [7]:
comments = []
sentences = list(kb_df['commentText'])
for sen in sentences:
    comments.append(preprocess_text(sen))

In [8]:
comments

['Pudina ptta nhi dalu to',
 'Chiken kacha tu ni rhy ga sis',
 'Hello mam love your all recipes All the ingredients are easily available and your way of explaining is too good ',
 'Its awesome recipe plzz make handi chicken in handi ',
 'Yeh jo measurement hai kitne logon ke liye hai ',
 'Kabita mam tried ur egg biryani everyone in my house just loved it thank so much that was so delicious it was all because of ur recipe',
 'cooker me kar sakte he na',
 'Mujhe bhot ache lagi apki respi mene subscribe kardia bhot ache he',
 'Mam dahi jgh kuch or use kr skte kya',
 'Wooooooo it very yummmmmm love it',
 'This is perfect biryani recipe Apko follow kar banaya acchi bani biryani ',
 'Hi Didi was always curious that How Biryani Made Thank you so much for putting this detailed video This Weekend will try and serve it to family Really Motivatied ',
 'thanx respect from Madam appne tel nahy dala ',
 'I made this it taste awesome thank you kabita ji ',
 'You re amazing ',
 'nice video',
 'Aur kya

In [9]:
print(kb_df.columns.values)

['id' 'commentText' 'Labels']


In [10]:
kb_df['Labels'].unique()

array([7, 4, 2, 5, 1, 3, 6], dtype=int64)

### Creating a BERT Tokenizer

In [37]:
BertTokenizer = bert.bert_tokenization.FullTokenizer
print("1")
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",trainable=False)
print("2")
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
print("3")
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
print("4")
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)
print("5")

1
2
3
4
5


In [43]:
# checking tokenization
tokenizer.tokenize("don-t be so judgmental")

['don', '-', 't', 'be', 'so', 'judgment', '##al']

In [42]:
# checking tokens assigning with ids
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("don-t be so judgmental"))

[2123, 1011, 1056, 2022, 2061, 8689, 2389]

In [44]:
# function to create tokens for comments
def tokenize_reviews(comment_reviews):
    return [tokenizer.tokenize(comment_reviews),tokenizer.convert_tokens_to_ids(tokenizer.tokenize(comment_reviews))]

In [45]:
# applying function of tokenize_reviews on comments
tokenized_comments = [tokenize_reviews(comment) for comment in comments]

### Prerparing Data For Training

In [46]:
comments_with_len = [[comment, kb_df['Labels'][i], len(comment)] for i, comment in enumerate(tokenized_comments)]

In [47]:
comments_with_len

[[[['pu', '##dina', 'pt', '##ta', 'nh', '##i', 'dal', '##u', 'to'],
   [16405, 18979, 13866, 2696, 18699, 2072, 17488, 2226, 2000]],
  7,
  2],
 [[['chi', '##ken', 'ka', '##cha', 'tu', 'ni', 'r', '##hy', 'ga', 'sis'],
   [9610, 7520, 10556, 7507, 10722, 9152, 1054, 10536, 11721, 24761]],
  7,
  2],
 [[['hello',
    'ma',
    '##m',
    'love',
    'your',
    'all',
    'recipes',
    'all',
    'the',
    'ingredients',
    'are',
    'easily',
    'available',
    'and',
    'your',
    'way',
    'of',
    'explaining',
    'is',
    'too',
    'good'],
   [7592,
    5003,
    2213,
    2293,
    2115,
    2035,
    19328,
    2035,
    1996,
    12760,
    2024,
    4089,
    2800,
    1998,
    2115,
    2126,
    1997,
    9990,
    2003,
    2205,
    2204]],
  4,
  2],
 [[['its',
    'awesome',
    'recipe',
    'pl',
    '##zz',
    'make',
    'hand',
    '##i',
    'chicken',
    'in',
    'hand',
    '##i'],
   [2049,
    12476,
    17974,
    20228,
    13213,
    2191,
  

In [48]:
# sorting the dataset with incresing order of tokenized length
comments_with_len.sort(key=lambda x: x[2])

In [49]:
comments_with_len

[[[['pu', '##dina', 'pt', '##ta', 'nh', '##i', 'dal', '##u', 'to'],
   [16405, 18979, 13866, 2696, 18699, 2072, 17488, 2226, 2000]],
  7,
  2],
 [[['chi', '##ken', 'ka', '##cha', 'tu', 'ni', 'r', '##hy', 'ga', 'sis'],
   [9610, 7520, 10556, 7507, 10722, 9152, 1054, 10536, 11721, 24761]],
  7,
  2],
 [[['hello',
    'ma',
    '##m',
    'love',
    'your',
    'all',
    'recipes',
    'all',
    'the',
    'ingredients',
    'are',
    'easily',
    'available',
    'and',
    'your',
    'way',
    'of',
    'explaining',
    'is',
    'too',
    'good'],
   [7592,
    5003,
    2213,
    2293,
    2115,
    2035,
    19328,
    2035,
    1996,
    12760,
    2024,
    4089,
    2800,
    1998,
    2115,
    2126,
    1997,
    9990,
    2003,
    2205,
    2204]],
  4,
  2],
 [[['its',
    'awesome',
    'recipe',
    'pl',
    '##zz',
    'make',
    'hand',
    '##i',
    'chicken',
    'in',
    'hand',
    '##i'],
   [2049,
    12476,
    17974,
    20228,
    13213,
    2191,
  

In [50]:
# taking only tokenized comments and label of comment
sorted_comments_labels = [(comment_lab[0], comment_lab[1]) for comment_lab in comments_with_len]

In [52]:
#sorted_comments_labels

Once the reviews are sorted we will convert thed dataset so that it can be used to train TensorFlow 2.0 models. we are running the following code to convert the sorted dataset into a TensorFlow 2.0-compliant input dataset shape.

In [58]:
bert_df = pd.DataFrame()
n = 0
for row in sorted_comments_labels:
    col = row[0][0]
    val = row[0][1]
    i = len(col)
    #print(col,val,i,n)
    for x in range(i):
        bert_df.loc[n, col[x]] = val[x]
    n+=1

In [62]:
# remove null values and replace with zeroes
bert_df.fillna(0,inplace=True)

In [63]:
bert_df.to_csv(pwd+'//bert_dataset_kabita.csv',index=False)

In [65]:
# extracting labels from sorted list
label_list = []
for row in sorted_comments_labels:
    label_list.append(row[-1])
print(label_list)

[7, 7, 4, 2, 7, 5, 7, 2, 7, 5, 2, 1, 5, 5, 4, 3, 7, 7, 7, 4, 1, 7, 7, 7, 4, 7, 4, 7, 7, 4, 4, 2, 4, 4, 3, 4, 2, 6, 3, 2, 2, 2, 4, 3, 3, 7, 7, 5, 7, 7, 7, 7, 1, 6, 7, 2, 7, 5, 7, 6, 5, 6, 7, 5, 3, 4, 3, 7, 7, 5, 7, 7, 4, 7, 5, 2, 7, 5, 1, 2, 4, 5, 7, 7, 7, 6, 5, 5, 5, 5, 2, 7, 6, 5, 4, 7, 2, 6, 4, 5, 7, 7, 7, 7, 4, 7, 6, 4, 6, 4, 3, 7, 4, 2, 3, 4, 2, 5, 7, 5, 6, 4, 2, 6, 7, 2, 6, 5, 6, 6, 7, 5, 4, 5, 4, 7, 6, 7, 7, 7, 5, 7, 1, 4, 6, 1, 2, 2, 5, 7, 6, 7, 7, 7, 3, 5, 7, 7, 5, 4, 7, 4, 1, 4, 7, 6, 6, 5, 6, 7, 2, 4, 7, 7, 4, 5, 6, 5, 6, 7, 7, 7, 6, 7, 6, 7, 2, 7, 6, 7, 7, 6, 7, 4, 7, 6, 1, 3, 2, 2, 7, 7, 6, 4, 3, 6, 5, 7, 6, 6, 3, 3, 3, 7, 5, 6, 1, 2, 6, 6, 7, 4, 7, 1, 7, 5, 2, 1, 7, 5, 7, 1, 3, 7, 7, 5, 6, 2, 7, 3, 5, 4, 5, 4, 3, 5, 7, 6, 6, 1, 1, 2, 3, 5, 7, 7, 6, 7, 7, 1, 7, 7, 2, 2, 1, 2, 2, 5, 4, 4, 4, 4, 3, 7, 7, 7, 7, 7, 7, 5, 7, 7, 4, 1, 5, 7, 5, 3, 7, 3, 7, 7, 2, 4, 5, 7, 5, 6, 3, 7, 6, 5, 3, 6, 7, 4, 7, 2, 6, 6, 1, 6, 4, 7, 3, 5, 6, 5, 3, 3, 6, 7, 7, 4, 2, 4, 7, 2, 2, 7, 7, 4, 6, 

In [67]:
# converting list to dataframe
label_df = pd.DataFrame(label_list, columns =['kabita_labels'])
label_df

Unnamed: 0,kabita_labels
0,7
1,7
2,4
3,2
4,7
...,...
4895,1
4896,1
4897,1
4898,4


In [69]:
label_df.to_csv(pwd+'//bert_dataset_kabita_labels.csv',index=False)

### Splitting data

In [70]:
# importing algorithms
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [71]:
x_train,x_test,y_train,y_test = train_test_split(bert_df,label_df,test_size=0.30,
                                                 random_state=21,stratify=label_df)

### Logistic regression for bert transformer data

In [72]:
### Logistic regression for BERT Model
# applying logistic regression
bert_lr_model = LogisticRegression()
bert_lr_model.fit(x_train,y_train)
bert_lr_pred_val = bert_lr_model.predict(x_test)
bert_lr_model.score(x_test,y_test)

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7244897959183674

### KNN for Bert transformer data

In [74]:
# applying KNN algorithm
# taking squareroot(n) as k value. as total rows are 4900, k-value = 70
bert_knn_model = KNeighborsClassifier(n_neighbors=70)
bert_knn_model.fit(x_train,y_train)
bert_knn_pred_val = bert_knn_model.predict(x_test)
bert_knn_model.score(x_test,y_test)

  return self._fit(X, y)


0.3952380952380952

In [76]:
# checking the accuracies of neighbors in [3, 4, 5, 6, 7, 8] for KNN
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    bert_knn_model = KNeighborsClassifier(n_neighbors=x)
    bert_knn_model.fit(x_train,y_train)
    print("Accuracy of KNN model for {} neighbors is:".format(str(x)), 
          bert_knn_model.score(x_test,y_test))

  return self._fit(X, y)


Accuracy of KNN model for 3 neighbors is: 0.5585034013605442


  return self._fit(X, y)


Accuracy of KNN model for 4 neighbors is: 0.5503401360544218


  return self._fit(X, y)


Accuracy of KNN model for 5 neighbors is: 0.5598639455782313


  return self._fit(X, y)


Accuracy of KNN model for 6 neighbors is: 0.5476190476190477


  return self._fit(X, y)


Accuracy of KNN model for 7 neighbors is: 0.5503401360544218


  return self._fit(X, y)


Accuracy of KNN model for 8 neighbors is: 0.5428571428571428


### Applying PCA 

In [77]:
# importing library for PCA
from sklearn.decomposition import PCA

In [78]:
# Doing  PCA giving number of Components(dimensions). checking by reducing to 5 components
pca_bert=PCA(n_components=500)
x_pca_comp=pca_bert.fit_transform(bert_df)

In [80]:
# splitting the dataset
pca_xtrain,pca_xtest,pca_ytrain,pca_ytest=train_test_split(x_pca_comp,label_df, test_size=0.30, random_state=24)

In [81]:
### Logistic regression for BERT Model
# applying logistic regression
bert_pca_lr_model = LogisticRegression()
bert_pca_lr_model.fit(pca_xtrain,pca_ytrain)
bert_pca_lr_pred_val = bert_pca_lr_model.predict(pca_xtest)
bert_pca_lr_model.score(pca_xtest,pca_ytest)

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7068027210884353

In [82]:
# checking the accuracies of neighbors in [3, 4, 5, 6, 7, 8] for KNN
neighbors_list = [3, 4, 5, 6, 7, 8]
for x in neighbors_list:
    bert_pca_knn_model = KNeighborsClassifier(n_neighbors=x)
    bert_pca_knn_model.fit(pca_xtrain,pca_ytrain)
    print("Accuracy of PCA KNN model for {} neighbors is:".format(str(x)), 
          bert_pca_knn_model.score(pca_xtest,pca_ytest))

  return self._fit(X, y)


Accuracy of PCA KNN model for 3 neighbors is: 0.5843537414965987


  return self._fit(X, y)


Accuracy of PCA KNN model for 4 neighbors is: 0.5965986394557823


  return self._fit(X, y)


Accuracy of PCA KNN model for 5 neighbors is: 0.5931972789115646


  return self._fit(X, y)


Accuracy of PCA KNN model for 6 neighbors is: 0.5931972789115646


  return self._fit(X, y)


Accuracy of PCA KNN model for 7 neighbors is: 0.5979591836734693


  return self._fit(X, y)


Accuracy of PCA KNN model for 8 neighbors is: 0.5829931972789115
