# **PART I**
# The need is to build a NLP classifier which can use input text parameters to determine the label/s of of the blog.

In [1]:
import os
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
import warnings
warnings.filterwarnings('ignore')

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
os.chdir('drive/MyDrive')

# 1. Import and analyse the data set

In [5]:
import pandas as pd
df = pd.read_csv('Dataset - blogtext.csv')
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [6]:
df.isna().any()

id        False
gender    False
age       False
topic     False
sign      False
date      False
text      False
dtype: bool

In [7]:
df.shape

(681284, 7)

In [8]:
df = df.head(10000) #as there are 6,81,124 records and its huge for computation which makes colab/jupyter crash, hence we are going to take a subset of the dataset

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10000 non-null  int64 
 1   gender  10000 non-null  object
 2   age     10000 non-null  int64 
 3   topic   10000 non-null  object
 4   sign    10000 non-null  object
 5   date    10000 non-null  object
 6   text    10000 non-null  object
dtypes: int64(2), object(5)
memory usage: 547.0+ KB


# 2. Perform data pre-processing on the data:
* Data cleansing by removing unwanted characters, spaces, stop words etc. Convert text to lowercase.
* Target/label merger and transformation
* Train and test split
* Vectorisation, etc.

In [10]:
df.drop(['id','date'], axis=1, inplace=True) #removing unwanted columns
df.head()

Unnamed: 0,gender,age,topic,sign,text
0,male,15,Student,Leo,"Info has been found (+/- 100 pages,..."
1,male,15,Student,Leo,These are the team members: Drewe...
2,male,15,Student,Leo,In het kader van kernfusie op aarde...
3,male,15,Student,Leo,testing!!! testing!!!
4,male,33,InvestmentBanking,Aquarius,Thanks to Yahoo!'s Toolbar I can ...


In [11]:
import re
df['clean_text']=df['text'].apply(lambda x: re.sub(r'[^A-Za-z]+',' ',x)) #removing unwanted characters

In [12]:
df['clean_text']=df['clean_text'].apply(lambda x: x.strip()) #removing spaces
df['clean_text']=df['clean_text'].apply(lambda x: x.lower()) #converting to lower case

In [13]:
df['clean_text'][7]

'if anything korea is a country of extremes everything here seems fad based i think it may come from korea s history it has been invaded a reported times over the years and each time they got independence i imagine they had to move quickly to get to the next level before the next war or occupation lately well not really lately in the japanese occupation ended then the korean war occurred from after that there was turmoil but in park chung hee took over as dictator president he had elections in which everyone was encouraged to vote but he was still a dictator after his assassination in the next few leaders were basically of the same ilk president park did some amazing things in his time however he took an incredibly backward country and set it on the road to industrialization japan had stripped korea of its resources people and even its language and culture many buildings and palaces were razed and japanese was the official language here from but president park was determined to change 

In [14]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [15]:
df['clean_text']=df['clean_text'].apply(lambda x: ' '.join([words for words in x.split() if words not in stopwords])) #removing stop words

In [16]:
df['clean_text'][7]

'anything korea country extremes everything seems fad based think may come korea history invaded reported times years time got independence imagine move quickly get next level next war occupation lately well really lately japanese occupation ended korean war occurred turmoil park chung hee took dictator president elections everyone encouraged vote still dictator assassination next leaders basically ilk president park amazing things time however took incredibly backward country set road industrialization japan stripped korea resources people even language culture many buildings palaces razed japanese official language president park determined change orchestrated han river miracle han river hangang main river seoul korea korea made terrific strides expense civil liberties fastforward present point see korea world wired nation canada finland way beyond u craze pc pc bangs rooms everywhere country well instead playstation like games players go computer one two people korean gamers always 

In [17]:
df['labels']=df.apply(lambda col: [col['gender'],str(col['age']),col['topic'],col['sign']], axis=1) #merging all the other columns

In [18]:
df.head()

Unnamed: 0,gender,age,topic,sign,text,clean_text,labels
0,male,15,Student,Leo,"Info has been found (+/- 100 pages,...",info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,male,15,Student,Leo,These are the team members: Drewe...,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,male,15,Student,Leo,In het kader van kernfusie op aarde...,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,male,15,Student,Leo,testing!!! testing!!!,testing testing,"[male, 15, Student, Leo]"
4,male,33,InvestmentBanking,Aquarius,Thanks to Yahoo!'s Toolbar I can ...,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


In [19]:
df.drop(['gender','age','topic','sign','text'], axis=1, inplace=True)

In [20]:
df.head()

Unnamed: 0,clean_text,labels
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


In [21]:
X = df['clean_text']
y = df['labels']

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(binary=True, ngram_range=(1,2))
X = vectorizer.fit_transform(X)

In [23]:
vectorizer.get_feature_names()[:5]

['aa', 'aa amazing', 'aa anger', 'aa compared', 'aa keeps']

In [24]:
label_counts=dict()

for labels in df.labels.values:
    for label in labels:
        if label in label_counts:
            label_counts[label]+=1
        else:
            label_counts[label]=1

In [25]:
from sklearn.preprocessing import MultiLabelBinarizer
binarizer = MultiLabelBinarizer(classes=sorted(label_counts.keys()))
y = binarizer.fit_transform(df.labels) #pre-processing labels

# 3. Design, train, tune and test the best text classifier.

Here we are using the One-vs-Rest approach, in this approach k classifiers are trained. As a basic classifier we are using LogisticRegression, it is one of the simplest methods but it performs good enough in text classification tasks.

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [27]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

model=LogisticRegression(solver='lbfgs')
model=OneVsRestClassifier(model)

In [28]:
model.fit(X_train,y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [29]:
y_pred=model.predict(X_test)

# 4. Display and explain detail the classification report

In [30]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

def print_scores(y_test, y_pred, type):
    print(type,'average\n')
    print('Accuracy score: ', accuracy_score(y_test, y_pred))
    print('F1 score: ', f1_score(y_test, y_pred, average=type))
    print('Average precision score: ', average_precision_score(y_test, y_pred, average=type))
    print('Average recall score: ', recall_score(y_test, y_pred, average=type))

In [31]:
print_scores(y_test, y_pred, 'micro')

micro average

Accuracy score:  0.3095
F1 score:  0.6345108695652174
Average precision score:  0.4504245665015244
Average recall score:  0.525375


In [32]:
print_scores(y_test, y_pred, 'macro')

macro average

Accuracy score:  0.3095
F1 score:  0.22321097877988885
Average precision score:  nan
Average recall score:  0.1717297932829343


In [33]:
print_scores(y_test, y_pred, 'weighted')

weighted average

Accuracy score:  0.3095
F1 score:  0.5890015468141058
Average precision score:  0.5085305700641577
Average recall score:  0.525375


# 5. Print the true vs predicted labels for any 5 entries from the dataset

In [34]:
pred = y_pred[:10]
actual = y_test[:10]

In [35]:
actual = binarizer.inverse_transform(actual)
actual

[('24', 'Sagittarius', 'female', 'indUnk'),
 ('17', 'Cancer', 'Student', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('34', 'Sagittarius', 'female', 'indUnk'),
 ('25', 'Aries', 'Technology', 'male'),
 ('36', 'Aries', 'Fashion', 'male'),
 ('17', 'Scorpio', 'female', 'indUnk'),
 ('36', 'Aries', 'Fashion', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('37', 'Aquarius', 'female', 'indUnk')]

In [36]:
pred = binarizer.inverse_transform(pred)
pred

[('Aries', 'male'),
 ('Student', 'male'),
 ('female',),
 ('34', 'Sagittarius', 'female', 'indUnk'),
 ('male',),
 ('female', 'indUnk'),
 ('female', 'indUnk'),
 ('36', 'Aries', 'Fashion', 'male'),
 ('male',),
 ('17', 'Scorpio', 'female', 'indUnk')]

# **PART II**
# Design a python based interactive semi - rule based chatbot which can do the following:
1. Start chat session with greetings and ask what the user is looking for.
2. Accept dynamic text based questions from the user. Reply back with relevant answer from the designed corpus.
3. End the chat session only if the user requests to end else ask what the user is looking for. Loop continues till the user asks to end it

In [37]:
pip install tflearn

Collecting tflearn
[?25l  Downloading https://files.pythonhosted.org/packages/e7/3c/0b156d08ef3d4e2a8009ecab2af1ad2e304f6fb99562b6271c68a74a4397/tflearn-0.5.0.tar.gz (107kB)
[K     |███                             | 10kB 18.1MB/s eta 0:00:01[K     |██████                          | 20kB 24.4MB/s eta 0:00:01[K     |█████████▏                      | 30kB 29.6MB/s eta 0:00:01[K     |████████████▏                   | 40kB 33.3MB/s eta 0:00:01[K     |███████████████▎                | 51kB 36.3MB/s eta 0:00:01[K     |██████████████████▎             | 61kB 39.0MB/s eta 0:00:01[K     |█████████████████████▍          | 71kB 37.6MB/s eta 0:00:01[K     |████████████████████████▍       | 81kB 30.3MB/s eta 0:00:01[K     |███████████████████████████▌    | 92kB 28.7MB/s eta 0:00:01[K     |██████████████████████████████▌ | 102kB 30.2MB/s eta 0:00:01[K     |████████████████████████████████| 112kB 30.2MB/s 
Building wheels for collected packages: tflearn
  Building wheel for tflea

In [38]:
import nltk
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

import tflearn
import tensorflow as tf

import numpy
import random

Instructions for updating:
non-resource variables are not supported in the long term


In [53]:
#importing corpus
import json
with open('GL Bot.json') as file:
  corpus = json.load(file)

#display corpus
print(corpus)

{'intents': [{'tag': 'Intro', 'patterns': ['hi', 'how are you', 'is anyone there', 'hello', 'whats up', 'hey', 'yo', 'listen', 'please help me', 'i am learner from', 'i belong to', 'aiml batch', 'aifl batch', 'i am from', 'my pm is', 'blended', 'online', 'i am from', 'hey ya', 'talking to you for first time'], 'responses': ['Hello! how can i help you ?'], 'context_set': ''}, {'tag': 'Exit', 'patterns': ['thank you', 'thanks', 'cya', 'see you', 'later', 'see you later', 'goodbye', 'i am leaving', 'have a Good day', 'you helped me', 'thanks a lot', 'thanks a ton', 'you are the best', 'great help', 'too good', 'you are a good learning buddy'], 'responses': ['I hope I was able to assist you, Good Bye'], 'context_set': ''}, {'tag': 'Olympus', 'patterns': ['olympus', 'explain me how olympus works', 'I am not able to understand olympus', 'olympus window not working', 'no access to olympus', 'unable to see link in olympus', 'no link visible on olympus', 'whom to contact for olympus', 'lot of p

In [40]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [41]:
#extract data

W = []
L = []
doc_x = []
doc_y = []

for intent in corpus['intents']:
  for pattern in intent['patterns']:
    w_temp = nltk.word_tokenize(pattern)
    W.extend(w_temp)
    doc_x.append(w_temp)
    doc_y.append(intent['tag'])

  if intent['tag'] not in L:
    L.append(intent['tag'])

In [42]:
#stemming

W = [stemmer.stem(w.lower()) for w in W if w != "?"]
W = sorted(list(set(W)))
L = sorted(L)

#words
W

['a',
 'abl',
 'access',
 'act',
 'ad',
 'adam',
 'aifl',
 'aiml',
 'am',
 'an',
 'anyon',
 'ar',
 'art',
 'backward',
 'bad',
 'bag',
 'batch',
 'bay',
 'belong',
 'best',
 'blend',
 'bloody',
 'boost',
 'bot',
 'buddy',
 'class',
 'contact',
 'cre',
 'cross',
 'cya',
 'day',
 'deep',
 'did',
 'diffult',
 'do',
 'ensembl',
 'epoch',
 'explain',
 'first',
 'for',
 'forest',
 'forward',
 'from',
 'funct',
 'good',
 'goodby',
 'grady',
 'gre',
 'hat',
 'hav',
 'hel',
 'hello',
 'help',
 'hey',
 'hi',
 'hid',
 'hour',
 'how',
 'hyp',
 'i',
 'imput',
 'in',
 'intellig',
 'is',
 'jerk',
 'jok',
 'knn',
 'lat',
 'lay',
 'learn',
 'leav',
 'link',
 'list',
 'log',
 'lot',
 'machin',
 'me',
 'ml',
 'my',
 'naiv',
 'nam',
 'nb',
 'net',
 'network',
 'neur',
 'no',
 'not',
 'of',
 'olymp',
 'olyp',
 'on',
 'onlin',
 'op',
 'opert',
 'otim',
 'paramet',
 'piec',
 'pleas',
 'pm',
 'problem',
 'prop',
 'random',
 'regress',
 'relu',
 'screw',
 'see',
 'sgd',
 'shit',
 'sigmoid',
 'sl',
 'smart',
 '

In [43]:
#tags
L

['Bot', 'Exit', 'Intro', 'NN', 'Olympus', 'Profane', 'SL', 'Ticket']

In [44]:
Train = []
Target = []

out_empty = [0 for _ in range(len(L))]

#loop to create bag of words and put the frequency count on each word
for x, doc in  enumerate(doc_x):
  bag = []
  w_temp = [stemmer.stem(w.lower()) for w in doc]
  for w in W:
    if w in w_temp:
      bag.append(1)
    else:
      bag.append(0)
    output_row = out_empty[:]
    output_row[L.index(doc_y[x])] = 1

    Train.append(bag)
    Target.append(output_row)

In [45]:
#converting to numpy arrays

Train = numpy.array(Train)
Target = numpy.array(Target)

In [46]:
from tensorflow.python.framework import ops
ops.reset_default_graph()

FCNN = tflearn.input_data(shape=[None, len(Train[0])]) #input layer
FCNN = tflearn.fully_connected(FCNN, 8)
FCNN = tflearn.fully_connected(FCNN, 8)
FCNN = tflearn.fully_connected(FCNN, len(Target[0]), activation='softmax') #output layer with size equal to number of tags

FCNN = tflearn.regression(FCNN)
model = tflearn.DNN(FCNN) #fully connected neural network model

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [47]:
model.fit(Train, Target, n_epoch=100, batch_size=8, show_metric=True)
model.save('model.tflearn')

Training Step: 239999  | total loss: [1m[32m0.00000[0m[0m | time: 14.788s
| Adam | epoch: 100 | loss: 0.00000 - acc: 1.0000 -- iter: 19192/19200
Training Step: 240000  | total loss: [1m[32m0.00000[0m[0m | time: 14.792s
| Adam | epoch: 100 | loss: 0.00000 - acc: 1.0000 -- iter: 19200/19200
--
INFO:tensorflow:/content/drive/My Drive/model.tflearn is not in all_model_checkpoint_paths. Manually adding it.


In [57]:
def bag_of_words(s, W):
  bag = [0 for _ in range(len(W))]

  s_words = nltk.word_tokenize((s))
  s_words = [stemmer.stem(word.lower()) for word in s_words]

  for se in s_words:
    for i,w in enumerate(W):
      if w == se:
        bag[i] = 1
  return numpy.array(bag)

def chat():
  print("Chat with GL Bot (type: stop to quit)")
  print("If answer is not right (type: *)")
  while True:
    inp = input("\n\nYou: ")
    if inp.lower()=="*":
      print("BOT: Please rephrase your question and try again")
    if inp.lower() == "quit":
      break
    results = model.predict([bag_of_words(inp, W)])
    results_index = numpy.argmax(results)
    tag = L[results_index]

    for tg in corpus["intents"]:
      if tg['tag'] == tag:
        responses = tg['responses']
  
    print(random.choice(responses))

In [64]:
chat()

Chat with GL Bot (type: stop to quit)
If answer is not right (type: *)


You: hi
Hello! how can i help you ?


You: who are you
I am your virtual learning assistant


You: i need CNN class material
Link: Machine Learning wiki 


You: create a ticket
Tarnsferring the request to your PM


You: useless bot
Tarnsferring the request to your PM


You: bad
Tarnsferring the request to your PM


You: goodbye
I hope I was able to assist you, Good Bye


You: quit
