**PART 1 - Digital content management**

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd

df = pd.read_csv('/content/drive/My Drive/blogtext.csv')

In [5]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [6]:
#using just 2000 samples as system kept crashing while training the classifier due to RAM issues
df = df.head(2000)

In [7]:
# convert to lowercase
df.text = df.text.apply(lambda x: x.lower())

In [8]:
# remove stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
df.text = df.text.apply(lambda x: ' '.join([text for text in x.split() if text not in stopwords]))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# create lable column according to problem statement
df['labels'] = df.apply(lambda row: [row['gender'], str(row['age']), row['topic'], row['sign']], axis=1)

In [10]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,labels
0,2059027,male,15,Student,Leo,"14,May,2004","info found (+/- 100 pages, 4.5 mb .pdf files) ...","[male, 15, Student, Leo]"
1,2059027,male,15,Student,Leo,"13,May,2004",team members: drewes van der laag urllink mail...,"[male, 15, Student, Leo]"
2,2059027,male,15,Student,Leo,"12,May,2004",het kader van kernfusie op aarde: maak je eige...,"[male, 15, Student, Leo]"
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!,"[male, 15, Student, Leo]"
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoo!'s toolbar 'capture' urls popups....,"[male, 33, InvestmentBanking, Aquarius]"


In [11]:
df = df[['text','labels']]

In [12]:
df.head()

Unnamed: 0,text,labels
0,"info found (+/- 100 pages, 4.5 mb .pdf files) ...","[male, 15, Student, Leo]"
1,team members: drewes van der laag urllink mail...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde: maak je eige...,"[male, 15, Student, Leo]"
3,testing!!! testing!!!,"[male, 15, Student, Leo]"
4,thanks yahoo!'s toolbar 'capture' urls popups....,"[male, 33, InvestmentBanking, Aquarius]"


In [13]:
#split into test and train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.text.values, df.labels.values, test_size=0.20, random_state=42)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
X_train_words = vectorizer.fit_transform(X_train)
X_test_words = vectorizer.transform(X_test)

In [15]:
X_train_words.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [16]:
label_counts = dict()

for labels in df.labels.values:
    for label in labels:
        if label in label_counts:
            label_counts[label] += 1
        else:
            label_counts[label] = 1

In [17]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes=sorted(label_counts.keys()))
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

In [18]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver='lbfgs')
clf = OneVsRestClassifier(clf)

In [19]:
clf.fit(X_train_words, y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [20]:
predicted_labels = clf.predict(X_test_words)
predicted_scores = clf.decision_function(X_test_words)

In [21]:
predicted_labels

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 1, 0, ..., 1, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 1, 0]])

In [22]:
#inverse transform
pred_inversed = mlb.inverse_transform(predicted_labels)
y_test_inversed = mlb.inverse_transform(y_test)

In [23]:
pred_inversed

[('35', 'Aries', 'Technology', 'male'),
 ('male',),
 ('15', 'female', 'indUnk'),
 ('male',),
 ('male',),
 ('35', 'Aries', 'Technology', 'male'),
 ('male',),
 ('male',),
 ('Aquarius', 'InvestmentBanking', 'female'),
 ('female',),
 ('33', 'Aquarius', 'InvestmentBanking', 'male'),
 ('male',),
 ('15', 'Libra', 'Student', 'female'),
 ('24', 'male'),
 ('female',),
 ('female', 'indUnk'),
 ('male',),
 ('Aries', 'female'),
 ('35', 'Aries', 'Technology', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('male',),
 ('35', 'Aries', 'Technology', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('15', 'Libra', 'Student', 'female'),
 ('male',),
 ('male',),
 ('female', 'indUnk'),
 ('male',),
 ('male',),
 ('male',),
 ('male',),
 ('male',),
 ('male',),
 ('35', 'Aries', 'Technology', 'male'),
 ('male',),
 ('male',),
 ('Aries', 'female'),
 ('male',),
 ('15', 'Libra', 'Student', 'female'),
 ('male',),
 ('Student', 'female'),
 ('male',),
 ('male',),
 ('Aries', 'male'),
 ('35', 'Aries', 'Technology', 'mal

**PART 2 - Customer Support**

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('popular', quiet=True)

True

In [27]:
#read corpus
import json

with open('/content/drive/My Drive/GL Bot.json') as file:
    Corpus = json.load(file)

print(Corpus)

{'intents': [{'tag': 'Intro', 'patterns': ['hi', 'how are you', 'is anyone there', 'hello', 'whats up', 'hey', 'yo', 'listen', 'please help me', 'i am learner from', 'i belong to', 'aiml batch', 'aifl batch', 'i am from', 'my pm is', 'blended', 'online', 'i am from', 'hey ya', 'talking to you for first time'], 'responses': ['Hello! how can i help you ?'], 'context_set': ''}, {'tag': 'Exit', 'patterns': ['thank you', 'thanks', 'cya', 'see you', 'later', 'see you later', 'goodbye', 'i am leaving', 'have a Good day', 'you helped me', 'thanks a lot', 'thanks a ton', 'you are the best', 'great help', 'too good', 'you are a good learning buddy'], 'responses': ['I hope I was able to assist you, Good Bye'], 'context_set': ''}, {'tag': 'Olympus', 'patterns': ['olympus', 'explain me how olympus works', 'I am not able to understand olympus', 'olympus window not working', 'no access to olympus', 'unable to see link in olympus', 'no link visible on olympus', 'whom to contact for olympus', 'lot of p

In [51]:
# tokenize 
tokens = [] 
labels = [] 
token_words = [] 
token_labels = [] 

for intent in Corpus['intents']:
    for pattern in intent['patterns']:
        w_temp = nltk.word_tokenize(pattern)
        tokens.extend(w_temp)
        token_words.append(w_temp)
        token_labels.append(intent["tag"])
    
    # Add the mising tag if any    
    if intent['tag'] not in labels:
        labels.append(intent['tag'])

In [52]:
labels

['Intro', 'Exit', 'Olympus', 'SL', 'NN', 'Bot', 'Profane', 'Ticket']

In [53]:
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer() 
Train = [] 
Target = []

out_empty = [0 for i in range(len(labels))]

for x, doc in enumerate(token_words):
    bag = []

    w_temp = [stemmer.stem(w.lower()) for w in doc]

    for w in tokens:
        if w in w_temp:
            bag.append(1)
        else:
            bag.append(0)

    output_row = out_empty[:]
    output_row[labels.index(token_labels[x])] = 1

    Train.append(bag) # List
    Target.append(output_row) # List

In [56]:
import numpy as np
Train = np.array(Train)
Target = np.array(Target)

In [57]:
X_train_part2, X_test_part2, y_train_part2, y_test_part2 = train_test_split(Train, Target, test_size=0.25, random_state=10)

In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

model_RF = RandomForestClassifier(n_estimators=50,
                                    criterion='gini',
                                    max_depth=None,
                                    min_samples_split=2,
                                    min_samples_leaf=1,
                                    min_weight_fraction_leaf=0.0,
                                    max_features='auto',
                                    max_leaf_nodes=None,
                                    min_impurity_decrease=0.0,
                                    min_impurity_split=None,
                                    bootstrap=True,
                                    oob_score=False,
                                    n_jobs=None,
                                    random_state=None,
                                    verbose=0,
                                    ccp_alpha=0.0)
model_RF.fit(X_train_part2, y_train_part2)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [59]:
def bag_of_words(s, words):
    bag = [0 for i in range(len(words))]

    s_words = nltk.word_tokenize(s)
    s_words = [stemmer.stem(word.lower()) for word in s_words]

    for se in s_words:
        for i, w in enumerate(words):
            if w == se:
                bag[i] = 1
            
    return np.array(bag)

In [62]:
def chat():
    print("type: bye to quit")
    while True:
        inp = input("\n\nYou: ")
        if inp.lower()=="*":
            print("BOT: Sorry! I don't understand you.")
        if inp.lower() == "bye":
            break

        results = model_RF.predict([bag_of_words(inp, tokens)])
        results_index = numpy.argmax(results)
        tag = labels[results_index]

        for tg in Corpus["intents"]:
            if tg['tag'] == tag:
                responses = tg['responses']

        print(random.choice(responses))

In [63]:
chat()

type: bye to quit


You: hey
Hello! how can i help you ?


You: olympus
Link: Machine Learning wiki 


You: greta learning
Link: Machine Learning wiki 


You: bye
