# Data processing

In [119]:
# max_words = 25

In [120]:
import json

with open('intents.json', 'r') as f:
    intents = json.load(f)['intents']

X = []
y = []
tag_to_response = {}
classes = []
for intent in intents:
    tag=intent['tag']
    if tag not in classes:
        classes.append(tag)
    tag_to_response[tag] = intent['responses']
    for pattern in intent['patterns']:
        # tokens = tokenizer(pattern)
        # if len(tokens) < max_words:
        #     tokens += (max_words - len(tokens))*[""]
        # else:
        #     tokens = tokens[:max_words]
        X.append(pattern)
        y.append(classes.index(tag))
X[:5]

['Hi', 'Hey', 'How are you', 'Is anyone there?', 'Hello']

In [122]:
import spacy
import numpy as np
# English pipelines include a rule-based lemmatizer
nlp = spacy.load("en_core_web_lg", exclude=["ner","parser"])
docs = nlp.pipe(X)

included_pos_tags = ['ADJ','ADV','INTJ','NOUN', 'PROPN', 'VERB']
excluded_pos_tags = ['PUNCT', 'SYM', 'X', 'AUX']
processed_X = []
for doc in docs:
    token_vectors = [nlp(token.lemma_)[0].vector for token in doc if token.pos_ not in excluded_pos_tags]
    # if len(token_vectors) < max_words:
    #     token_vectors += [nlp(" ")[0].vector]*max_words
    # else:
    #     token_vectors = token_vectors[:max_words]
    
    processed_X.append(np.average(token_vectors, axis=0))

processed_X[0]

array([ 4.3619e+00,  6.4227e-01,  3.6572e+00,  1.2839e-05, -4.9597e+00,
       -3.7551e+00, -7.2010e-01, -3.8699e+00, -6.4149e+00, -2.1970e+00,
       -1.4052e+00,  2.0386e+00, -6.1365e+00,  1.4983e+00,  7.4897e-02,
       -2.8394e+00, -6.8990e-01,  1.0237e+00, -2.3867e+00, -2.4802e+00,
        4.8034e+00,  4.0907e-01, -5.9518e-01, -9.6808e+00,  1.0078e+00,
       -2.6209e+00,  1.6237e+00, -1.2758e+00, -3.3642e+00,  2.5376e+00,
       -8.2612e+00, -2.0550e+00, -3.5853e+00,  2.5571e+00,  3.4179e+00,
        4.8266e+00, -4.7334e+00,  1.4521e+00,  1.9508e+00, -3.5734e+00,
        7.1136e-01, -4.0711e+00, -2.3756e+00,  2.9331e+00, -1.0385e+00,
        8.0737e-01, -1.0357e+01,  1.0074e+00, -2.3647e+00,  4.6682e+00,
       -5.6539e+00, -8.9656e+00, -4.4444e-02,  4.0555e-01,  5.9755e+00,
       -2.6969e+00,  3.1529e+00,  1.7365e+00,  8.0275e-01,  2.4104e+00,
        2.2166e+00,  4.9589e+00, -2.3379e+00, -5.4958e+00,  1.0721e-01,
       -1.8430e+00, -3.3400e+00,  5.7293e+00, -2.7006e+00,  1.65

# Train

In [123]:
batch_size = 16
from torch.utils.data import Dataset, DataLoader
class ChatDataset(Dataset):

    def __init__(self, X, y):
        self.n_samples = len(X)
        self.x_data = X
        self.y_data = y

    # support indexing such that dataset[i] can be used to get i-th sample
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    # we can call len(dataset) to return the size
    def __len__(self):
        return self.n_samples
    
dataset = ChatDataset(processed_X,y)
train_loader = DataLoader(dataset=dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=1)

In [124]:
learning_rate = 0.001
num_epochs = 100
embed_len = 300

import torch
from torch import nn
from model import EmbeddingClassifier

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = EmbeddingClassifier(max_words=1, embed_len=embed_len, num_classes=len(classes)).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
for epoch in range(num_epochs):
    for (words, labels) in train_loader:
        words = words.to(device)
        labels = labels.to(dtype=torch.long).to(device)
        # Forward pass
        outputs = model(words)
        # if y would be one-hot, we must apply
        # labels = torch.max(labels, 1)[1]
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    if (epoch+1) % 10 == 0:
        print (f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


print(f'final loss: {loss.item():.4f}')

Epoch [10/100], Loss: 0.1067
Epoch [20/100], Loss: 0.0045
Epoch [30/100], Loss: 0.0021
Epoch [40/100], Loss: 0.0014
Epoch [50/100], Loss: 0.0011
Epoch [60/100], Loss: 0.0011
Epoch [70/100], Loss: 0.0007
Epoch [80/100], Loss: 0.0006
Epoch [90/100], Loss: 0.0005
Epoch [100/100], Loss: 0.0004
final loss: 0.0004


In [128]:
data = {
"model_state": model.state_dict(),
"classes": classes,
"lr": learning_rate,
"max_words": 1,
"embed_length": embed_len,
'tag_to_response': tag_to_response
}

FILE = "data.pth"
torch.save(data, FILE)

print(f'training complete. file saved to {FILE}')

training complete. file saved to data.pth


# Database

In [126]:
import pandas as pd
df = pd.read_csv('Coursera.csv')
df['Course Description'].iloc[0]

'Write a Full Length Feature Film Script  In this course, you will write a complete, feature-length screenplay for film or television, be it a serious drama or romantic comedy or anything in between. You�ll learn to break down the creative process into components, and you�ll discover a structured process that allows you to produce a polished and pitch-ready script by the end of the course. Completing this project will increase your confidence in your ideas and abilities, and you�ll feel prepared to pitch your first script and get started on your next. This is a course designed to tap into your creativity and is based in "Active Learning". Most of the actual learning takes place within your own activities - that is, writing! You will learn by doing.  Here is a link to a TRAILER for the course. To view the trailer, please copy and paste the link into your browser. https://vimeo.com/382067900/b78b800dc0  Learner review: "Love the approach Professor Wheeler takes towards this course. It\'s

In [131]:
keywords = ['data', 'science']

# Count the number of occurrences of each keyword in each row
counts = df['Course Description'].str.count('|'.join(keywords))

# Sort the DataFrame in descending order by the number of occurrences
sorted_df = df.iloc[(-counts).argsort()]

# Select the top N rows
top_rows = sorted_df.head(10)

# Print the top N rows
top_rows

Unnamed: 0,Course Name,University,Difficulty Level,Course Rating,Course URL,Course Description,Skills
2701,Process Mining: Data science in Action,Eindhoven University of Technology,Beginner,4.8,https://www.coursera.org/learn/process-mining,Process mining is the missing link between mod...,process modeling business process Process Mi...
205,Data-driven Astronomy,The University of Sydney,Advanced,4.8,https://www.coursera.org/learn/data-driven-ast...,"Science is undergoing a data explosion, and as...",Computer Programming Python Programming SQL ...
2355,A Crash Course in Data Science,Johns Hopkins University,Conversant,4.4,https://www.coursera.org/learn/data-science-co...,By now you have definitely heard about data sc...,analysis Machine Learning software Human Le...
1204,Introduction to Data Analytics,IBM,Advanced,4.7,https://www.coursera.org/learn/introduction-to...,This course presents a gentle introduction int...,analytics Data Analysis physics Exploratory...
329,Building a Data Science Team,Johns Hopkins University,Intermediate,4.5,https://www.coursera.org/learn/build-data-scie...,Data science is a team sport. As a data scienc...,Data Analysis team management Team Building ...
2798,Using clinical health data for better healthcare,The University of Sydney,Intermediate,4.6,https://www.coursera.org/learn/healthcare-data,Digital health is rapidly being realised as th...,data integrity health human resources Health...
2710,Big Data Modeling and Management Systems,University of California San Diego,Intermediate,4.4,https://www.coursera.org/learn/big-data-manage...,Once you�ve identified a big data issue to ana...,Databases analytics graphs Data Structures ...
2804,Fundamentals of Scalable Data Science,IBM,Advanced,4.1,https://www.coursera.org/learn/ds,Apache Spark is the de-facto standard for larg...,General Statistics Dimensionality Reduction ...
1580,SQL for Data Science,"University of California, Davis",Conversant,4.5,https://www.coursera.org/learn/sql-for-data-sc...,As data collection has increased exponentially...,modeling Databases data retrieval analysis ...
1148,Excel Basics for Data Analysis,IBM,Advanced,4.7,https://www.coursera.org/learn/excel-basics-da...,This course is designed to provide you with ba...,euler's totient function Pivot Table workshe...


In [141]:
nlp = spacy.load("en_core_web_lg")
doc = nlp("I wanting to learn data science")
keyword_pos_tags = ['NOUN', 'PROPN', 'VERB']
for token in doc:
    if token.pos_ in keyword_pos_tags:
        print(token.lemma_, token.pos_)

want VERB
learn VERB
data NOUN
science NOUN
