In [4]:
#!pip install beautifulsoup4
#!pip install --upgrade nltk
!pip install torch

Collecting torch
  Using cached torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting filelock (from torch)
  Using cached filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (

In [12]:
import numpy as np
import pandas as pd
import nltk
import string
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from html import unescape

np.random.seed(7)

train_on_gpu = torch.cuda.is_available()
if train_on_gpu:
    print('Training on GPU!')
else:
    print('No GPU available, training on CPU; consider making n_epochs very small.')

questions_df = pd.read_csv('dataset/Questions.csv', nrows=5000, encoding='ISO-8859-1')
answers_df = pd.read_csv('dataset/Answers.csv', nrows=5000, encoding='ISO-8859-1')
tags_df = pd.read_csv('dataset/Tags.csv', nrows=5000, encoding='ISO-8859-1')

Training on GPU!


In [13]:
questions_df.head(10)

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...
5,742,189.0,2008-08-03T15:55:28Z,30,Class views in Django,"<p><a href=""http://www.djangoproject.com/"">Dja..."
6,766,1384652.0,2008-08-03T17:44:07Z,20,Python and MySQL,<p>I can get Python to work with Postgresql bu...
7,773,207.0,2008-08-03T18:27:09Z,256,How do I use Python's itertools.groupby()?,<p>I haven't been able to find an understandab...
8,972,145.0,2008-08-04T02:17:51Z,364,Adding a Method to an Existing Object Instance,<p>I've read that it is possible to add a meth...
9,1476,92.0,2008-08-04T18:20:36Z,251,How do you express binary literals in Python?,<p>How do you express an integer as a binary n...


In [14]:
answers_df.head(10)

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,497,50.0,2008-08-02T16:56:53Z,469,4,<p>open up a terminal (Applications-&gt;Utilit...
1,518,153.0,2008-08-02T17:42:28Z,469,2,<p>I haven't been able to find anything that d...
2,536,161.0,2008-08-02T18:49:07Z,502,9,<p>You can use ImageMagick's convert utility f...
3,538,156.0,2008-08-02T18:56:56Z,535,23,<p>One possibility is Hudson. It's written in...
4,541,157.0,2008-08-02T19:06:40Z,535,20,"<p>We run <a href=""http://buildbot.net/trac"">B..."
5,595,116.0,2008-08-03T01:17:36Z,594,25,<p>The canonical way is to use the built-in cu...
6,660,197.0,2008-08-03T12:09:18Z,535,14,<p>Second the Buildbot - Trac integration. You...
7,701,111.0,2008-08-03T14:30:50Z,683,3,"<p>No, you were not dreaming. Python has a pr..."
8,735,145.0,2008-08-03T15:47:22Z,683,-2,<p>I think:</p>\r\n\r\n<pre><code>#!/bin/pytho...
9,745,154.0,2008-08-03T15:59:19Z,683,8,<p>Are you looking to get a list of objects th...


In [15]:
tags_df.head(10)

Unnamed: 0,Id,Tag
0,469,python
1,469,osx
2,469,fonts
3,469,photoshop
4,502,python
5,502,windows
6,502,image
7,502,pdf
8,535,python
9,535,continuous-integration


In [16]:
stemmer = PorterStemmer()

def preprocess_text(text):
    if pd.isnull(text):
        return ''
    text = text.lower()
    text = ''.join([ch for ch in text if ch not in string.punctuation])
    tokens = text.split()
    return ' '.join([stemmer.stem(token) for token in tokens])

questions_df['CleanedQuestion'] = questions_df['Body'].apply(preprocess_text)
answers_df['CleanedAnswer'] = answers_df['Body'].apply(preprocess_text)

allwords = ' '.join(questions_df['CleanedQuestion']).split()
uniquewords = list(set(allwords))

tags_list = list(set(tags_df['Tag']))
num_classes = len(tags_list)

def get_categories_for_question(question_id):
    return tags_df[tags_df['Id'] == question_id]['Tag'].tolist()

questions_df['Categories'] = questions_df['Id'].apply(get_categories_for_question)

y_train = []
for tags in questions_df['Categories']:
    one_hot = [0] * num_classes
    for tag in tags:
        if tag in tags_list:
            one_hot[tags_list.index(tag)] = 1
    y_train.append(one_hot)

def makeTextIntoNumbers(text):
    words = text.lower().split(' ')
    numbers = []
    for word in words:
        try:
            numbers.append(uniquewords.index(word))
        except ValueError:
            numbers.append(0)
    numbers = numbers + [0, 0, 0, 0, 0]
    return numbers[:6]

x_train = []
for text in questions_df['CleanedQuestion']:
    indices = makeTextIntoNumbers(text)
    x_train.append(indices)

x_train = torch.LongTensor(x_train)
y_train = torch.Tensor(y_train)


Model Net(
  (embedding): Embedding(46371, 20)
  (lstm): LSTM(20, 16, batch_first=True)
  (fc1): Linear(in_features=16, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=1154, bias=True)
)
Parameters [927420, 1280, 1024, 64, 64, 4096, 256, 295424, 1154]
Step 0, Loss: 0.2501976490020752
Step 25, Loss: 0.2501925528049469
Step 50, Loss: 0.2501874566078186
Step 75, Loss: 0.2501823902130127
Step 100, Loss: 0.250177264213562
Step 125, Loss: 0.2501721978187561
Step 150, Loss: 0.2501671314239502
Step 175, Loss: 0.2501620352268219
Step 200, Loss: 0.250156968832016
Step 225, Loss: 0.2501518726348877
Step 250, Loss: 0.2501468062400818
Step 275, Loss: 0.2501417100429535
Step 300, Loss: 0.2501366436481476
Step 325, Loss: 0.2501315772533417
Step 350, Loss: 0.2501264810562134
Step 375, Loss: 0.25012141466140747
Step 400, Loss: 0.25011634826660156
Step 425, Loss: 0.25011125206947327
Step 450, Loss: 0.25010615587234497
Step 475, Loss: 0.2501010596752167
Step 500, Loss: 0.250096

Human:  what is python


Predicted Categories: ['dump', 'delete-directory', 'build-automation']
Chatbot: Updated to only delete files and to used the os.path.join() method suggested in the comments. If you also want to remove subdirectories, uncomment the elif statement.
import os, shutil
folder = '/path/to/folder'
for the_file in os.listdir(folder):
    file_path = os.path.join(folder, the_file)
    try:
        if os.path.isfile(file_path):
            os.unlink(file_path)
        #elif os.path.isdir(file_path): shutil.rmtree(file_path)
    except Exception as e:
        print(e)




Human:  where can I find my windows key on the keyboard


Predicted Categories: ['osx-leopard', 'porting', 'ora-00932']
Chatbot: Instead of uninstalling the built-in Python, install the MacPorts version and then modify your $PATH to have the MacPorts version first.
For example, if MacPorts installs /usr/local/bin/python, then modify your .bashrc to include PATH=/usr/local/bin:$PATH at the end.



KeyboardInterrupt: Interrupted by user

In [None]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.embedding = nn.Embedding(len(uniquewords), 20)
        self.lstm = nn.LSTM(input_size=20, hidden_size=16, num_layers=1, batch_first=True)
        self.fc1 = nn.Linear(16, 256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, inp):
        e = self.embedding(inp)
        output, _ = self.lstm(e)
        x = self.fc1(output[:, -1, :])
        x = F.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x

n = Net()
print("Model", n)
print("Parameters", [param.nelement() for param in n.parameters()])

optimizer = torch.optim.SGD(n.parameters(), lr=0.01)
loss_fn = torch.nn.MSELoss()

n_steps = 10000
for i in range(n_steps):
    y_pred_train = n(x_train)
    loss_train = loss_fn(y_pred_train, y_train)
    optimizer.zero_grad()
    loss_train.backward()
    optimizer.step()
    if (i % 25) == 0:
        print(f"Step {i}, Loss: {loss_train.item()}")

def classify(line):
    indices = makeTextIntoNumbers(line)
    tensor = torch.LongTensor([indices])
    output = n(tensor).detach().numpy()
    predicted_tags = np.argsort(output[0])[::-1]
    return predicted_tags

def clean_html(raw_html):
    soup = BeautifulSoup(raw_html, 'html.parser')
    text = soup.get_text()
    text = unescape(text)
    return text

def get_random_answer_by_category(predicted_tags):
    for aindex in predicted_tags:
        tag = tags_list[aindex]
        relevant_questions = questions_df[questions_df['Categories'].apply(lambda tags: tag in tags)]
        if not relevant_questions.empty:
            rand_index = random.choice(relevant_questions.index)
            question_id = relevant_questions.loc[rand_index, 'Id']
            answers = answers_df[answers_df['ParentId'] == question_id]['Body'].values
            if len(answers) > 0:
                answer = random.choice(answers)
                clean_answer = clean_html(answer)
                return clean_answer
    return "No matching question found."

print("Chatbot ready")
user_input = input("Human: ")
while user_input:
    predicted_tags = classify(user_input)
    print(f"Predicted Categories: {[tags_list[i] for i in predicted_tags[:3]]}")
    answer = get_random_answer_by_category(predicted_tags)
    print(f"Chatbot: {answer}")
    user_input = input("Human: ")

Model Net(
  (embedding): Embedding(46371, 20)
  (lstm): LSTM(20, 16, batch_first=True)
  (fc1): Linear(in_features=16, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=1154, bias=True)
)
Parameters [927420, 1280, 1024, 64, 64, 4096, 256, 295424, 1154]
Step 0, Loss: 0.2504802942276001
Step 25, Loss: 0.25042811036109924
Step 50, Loss: 0.2503758668899536
Step 75, Loss: 0.25032368302345276
Step 100, Loss: 0.2502714991569519
Step 125, Loss: 0.2502193748950958
Step 150, Loss: 0.25016725063323975
Step 175, Loss: 0.25011515617370605
Step 200, Loss: 0.25006306171417236
Step 225, Loss: 0.25001099705696106
Step 250, Loss: 0.24995894730091095
Step 275, Loss: 0.24990692734718323
Step 300, Loss: 0.2498549371957779
Step 325, Loss: 0.24980296194553375
Step 350, Loss: 0.24975097179412842
Step 375, Loss: 0.24969899654388428
Step 400, Loss: 0.2496470957994461
Step 425, Loss: 0.24959515035152435
Step 450, Loss: 0.24954326450824738
Step 475, Loss: 0.2494913935661316
Step 500, Los

Human:  what is the purpose of photoshop


Predicted Categories: ['buildbot', 'file', 'instance']
Chatbot: We've had great success with TeamCity as our CI server and using nose as our test runner.  Teamcity plugin for nosetests gives you count pass/fail, readable display for failed test( that can be E-Mailed).  You can even see details of the test failures while you stack is running.  
If of course supports things like running on multiple machines, and it's much simpler to setup and maintain than buildbot.



Human:  is python the best language for machine learning


Predicted Categories: ['buildbot', 'wrap', 'instance']
Chatbot: We've had great success with TeamCity as our CI server and using nose as our test runner.  Teamcity plugin for nosetests gives you count pass/fail, readable display for failed test( that can be E-Mailed).  You can even see details of the test failures while you stack is running.  
If of course supports things like running on multiple machines, and it's much simpler to setup and maintain than buildbot.



Human:  how can I learn binary calculations?


Predicted Categories: ['yield', 'file', 'iis-modules']
Chatbot: Shortcut to Grokking yield
When you see a function with yield statements, apply this easy trick to understand what will happen:

Insert a line result = [] at the start of the function.
Replace each yield expr with result.append(expr).
Insert a line return result at the bottom of the function.
Yay - no more yield statements! Read and figure out code.
Compare function to original definition.

This trick may give you an idea of the logic behind the function, but what actually happens with yield is significantly different that what happens in the list based approach. In many cases the yield approach will be a lot more memory efficient and faster too. In other cases this trick will get you stuck in an infinite loop, even though the original function works just fine. Read on to learn more...
Don't confuse your Iterables, Iterators and Generators
First, the iterator protocol - when you write
for x in mylist:
    ...loop body...

