In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import nltk
from nltk.corpus import stopwords
import string
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
import torch.nn.functional as F

In [2]:
# Read the raw corpus, one example per line.
# The context manager closes the file even if reading fails part-way through.
# The explicit encoding keeps the result independent of the platform default
# (assumes the file is UTF-8 -- TODO confirm).
with open('cooking.stackexchange.txt', 'r', encoding='utf-8') as file:
    text = file.readlines()

In [3]:
# Peek at the first raw line to see how labels and question text are laid out.
text[0]

'__label__sauce __label__cheese How much does potato starch affect a cheese sauce recipe?\n'

In [4]:
# Splitting on "__label__" always leaves an empty string first, and the last
# label stays attached to the front of the sentence text.
text[0].split("__label__")

['',
 'sauce ',
 'cheese How much does potato starch affect a cheese sauce recipe?\n']

**Text-Preprocessing**

In [5]:
sentences=[]#lower-cased question text of every labelled line
all_label=set()#to store all the unique labels
labels=[]#labels[i] is the list of labels that belong to sentences[i]

for line in tqdm(text):#separating labels and sentences of each line
    # "__label__a __label__b some question" -> ['', 'a ', 'b some question']
    parts=line.strip().split("__label__")
    if len(parts)<2:
        #blank or unlabelled line: nothing to learn from, and the parsing
        #below would fail on it, so skip it
        continue

    #everything before the first marker is dropped; the final chunk holds the
    #last label followed by the question text
    *leading_labels,tail=parts[1:]
    last_label,_,question=tail.partition(" ")

    line_labels=[label.strip() for label in leading_labels]+[last_label.strip()]
    all_label.update(line_labels)
    sentences.append(question.lower())
    labels.append(line_labels)

#sorted (instead of list(set)) so each label keeps the same position on every
#run; plain set order of strings can change between interpreter sessions
all_label=sorted(all_label)

100%|██████████| 15404/15404 [00:00<00:00, 123357.17it/s]


In [6]:
def text_process(mess,stop_words=None):
    """Tokenise a sentence: strip punctuation, lower-case, drop stopwords.

    Parameters
    ----------
    mess : str
        The raw sentence.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, NLTK's English stopword
        list is used; it is loaded once and cached on the function, instead
        of being rebuilt for every single word.

    Returns
    -------
    list of str
        The lower-cased words of ``mess`` with punctuation characters removed
        and stopwords filtered out, in their original order.
    """
    if stop_words is None:
        stop_words=getattr(text_process,"_english_stopwords",None)
        if stop_words is None:
            #a frozenset gives constant-time membership tests
            stop_words=frozenset(stopwords.words('english'))
            text_process._english_stopwords=stop_words

    #removing punctuations (every character of string.punctuation is deleted)
    nopunc=mess.translate(str.maketrans('','',string.punctuation))

    #removing stopwords and returning the remaining lower-cased words
    lowered=(word.lower() for word in nopunc.split())
    return [word for word in lowered if word not in stop_words]

In [7]:
# preprocessing all sentences in the corpus
# Only raw strings are tokenised, so re-running this cell does not push the
# already-tokenised word lists through text_process a second time.
sentences=[text_process(sent) if isinstance(sent,str) else sent for sent in tqdm(sentences)]

100%|██████████| 15404/15404 [00:27<00:00, 562.72it/s]


In [8]:
#Creating a list of unique words in all sentences
total_words={word for sent in sentences for word in sent}
print(len(total_words))
#sorted (instead of list(set)) so every word keeps the same bag-of-words
#column from run to run
total_words=sorted(total_words)

9469


In [9]:
#creating bag of words vectors for all sentences
#word -> column lookup, so each sentence costs one pass over its own words
#instead of one scan per vocabulary entry
word_index={word:col for col,word in enumerate(total_words)}

#one row per sentence, one column per vocabulary word; filled in place rather
#than through an intermediate list of Python lists
BOW=np.zeros((len(sentences),len(total_words)),dtype=int)
for row,sent in enumerate(tqdm(sentences)):
    for word in sent:
        col=word_index.get(word)
        if col is not None:#words outside the vocabulary are ignored
            BOW[row,col]+=1

100%|██████████| 15404/15404 [00:46<00:00, 330.78it/s]


In [10]:
# (number of sentences, vocabulary size)
BOW.shape

(15404, 9469)

In [11]:
#converting the labels into multi-hot vectors (a sentence may carry several labels)
label_index={j:i for i,j in enumerate(all_label)}
#width comes from the label set itself instead of a hard-coded 736, so it
#cannot drift from the data; the network's output layer must use the same width
label_onehot=np.zeros([len(labels),len(all_label)])
for i,sentence_labels in enumerate(tqdm(labels)):
    for k in sentence_labels:
        label_onehot[i][label_index[k]]=1

100%|██████████| 15404/15404 [00:00<00:00, 116009.92it/s]


In [12]:
# (number of sentences, number of label columns)
label_onehot.shape

(15404, 736)

In [13]:
#normalizing BOW vectors row by row (sklearn's default norm), then storing
#the result as float32
BOW=preprocessing.normalize(BOW).astype(np.float32)

In [14]:
#applying principal component analysis for dimensionality reduction
#random_state is fixed because, for a matrix this large, PCA's default solver
#choice can fall on the randomized SVD, whose result otherwise varies per run
#NOTE(review): the projection is fitted on every row, including those that
#later land in the test split -- consider fitting on training rows only
pca=PCA(n_components=500,random_state=0)
pca.fit(BOW)
BOW=pca.transform(BOW)

In [15]:
# (number of sentences, number of principal components kept)
BOW.shape

(15404, 500)

**Linear Regression model**

In [16]:
#splitting into testing and training datasets (80% train / 20% test)
#random_state fixes the shuffle so the split, and every score computed from
#it, is the same on each run
sent_train,sent_test,label_train,label_test=train_test_split(BOW,label_onehot,test_size=0.2,random_state=0)

In [17]:
#creating a linear regression model on the training dataset
#the multi-hot label matrix is used directly as the regression target, so the
#model produces one real-valued score per label column
from sklearn.linear_model import LinearRegression

LR_model=LinearRegression()
#fit is left as the last expression so the cell displays the fitted estimator
LR_model.fit(sent_train,label_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [18]:
#one row of label scores per test sentence (indexed per row and per label below)
predictions=LR_model.predict(sent_test)#getting predictions given by the model on testing dataset

In [19]:
#calculating accuracy of the model: a prediction counts as correct when the
#highest-scoring label is one of the sentence's true labels
top_label=np.argmax(predictions,axis=1)
is_hit=label_test[np.arange(len(predictions)),top_label]==1
correct=int(np.count_nonzero(is_hit))
print(100*correct/len(predictions))

64.32976306394028


**Deep Learning model using PyTorch**

In [20]:
class customdataloader(torch.utils.data.Dataset):
    """Dataset that pairs each sentence vector with its label vector.

    ``sent`` and ``lab`` are stored as given and indexed in parallel:
    item ``idx`` is ``(sent[idx], lab[idx])``.
    """

    def __init__(self,sent,lab):
        self.sent,self.lab=sent,lab

    def __len__(self):
        #the label sequence defines the number of samples
        return len(self.lab)

    def __getitem__(self,idx):
        #look the label up first, then the sentence; return (sentence, label)
        label,sentence=self.lab[idx],self.sent[idx]
        return sentence,label

In [21]:
class Net(nn.Module):
    """Feed-forward classifier: Linear -> ReLU -> Dropout -> Linear.

    Parameters
    ----------
    in_features : int, default 500
        Size of each input vector.
    hidden_features : int, default 100
        Number of neurons in the hidden layer.
    out_features : int, default 736
        Number of output scores (one per label).
    dropout : float, default 0.2
        Probability of zeroing a hidden activation during training.

    The defaults reproduce the original fixed 500 -> 100 -> 736 network, so
    ``Net()`` behaves as before.
    """

    def __init__(self,in_features=500,hidden_features=100,out_features=736,dropout=0.2):
        super(Net,self).__init__()

        self.fc1=nn.Linear(in_features,hidden_features)
        self.fc2=nn.Linear(hidden_features,out_features)
        self.dropout=nn.Dropout(dropout)

    def forward(self,x):
        """Map ``x`` of shape [batch_size, in_features] to raw scores of
        shape [batch_size, out_features]; no softmax/sigmoid is applied here.
        """
        #hidden layer followed by ReLU -> [batch_size,hidden_features]
        hidden=F.relu(self.fc1(x))
        #dropout is only active while the module is in training mode
        hidden=self.dropout(hidden)
        #output layer -> [batch_size,out_features]
        return self.fc2(hidden)

In [22]:
def train(model,train_loader,optimizer,epoch):
    """Run one optimisation pass over ``train_loader``.

    ``model`` is switched to training mode; for every batch the gradients are
    reset, the cross-entropy loss is back-propagated and ``optimizer`` takes a
    step. A progress line is printed on every 100th batch (starting with the
    first). ``epoch`` is only used in that progress line.
    """
    model.train()

    for step,(inputs,targets) in enumerate(train_loader):
        #reduce each target row to the index of its largest entry
        #NOTE(review): a row with several labels therefore contributes only
        #one of them to the loss
        class_ids=targets.argmax(dim=1).reshape(-1)

        optimizer.zero_grad()#clear gradients left over from the previous step
        logits=model(inputs)

        #cross_entropy applies log-softmax to the raw scores internally
        batch_loss=F.cross_entropy(logits,class_ids)
        batch_loss.backward()
        optimizer.step()

        if step % 100:
            continue
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, step * len(inputs), len(train_loader.dataset),
            100. * step / len(train_loader), batch_loss.item()))

In [23]:
def test(model,test_loader):
    
    model.eval()#telling the model to preapre for evaluation
    
    correct=0#variable to store total correct predictions
    
    with torch.no_grad():#to ensure gradients are not calculated as calculating gradients is not required for testing
        
        for data,target in test_loader:#getting the batch
            
            output=model(data)#forward pass shape->[batch_size,736]
            pred=output.argmax(dim=1,keepdim=True)#getting the index of max value in the prediction. shape->[batch_size,1]
            
            for i in range(len(pred)):
                if target[i][pred[i]]==1:# checking if the predicted value is one of the targets
                    correct+=1
        print(100*correct/len(test_loader.dataset))

In [24]:
def seed(seed_value):
    """Seed every random number generator used here with ``seed_value``.

    Covers PyTorch (CPU, the current CUDA device and all CUDA devices, the
    latter for multi-GPU runs), NumPy's global generator and Python's
    ``random`` module, so repeated runs draw the same random numbers.
    """
    seeders=(
        torch.cuda.manual_seed_all,
        torch.manual_seed,
        torch.cuda.manual_seed,
        np.random.seed,
        random.seed,
    )
    for set_seed in seeders:
        set_seed(seed_value)

In [25]:
def main(epochs=25,train_batch_size=30,test_batch_size=50,lr=0.001,seed_value=0):
    """Train the network on the notebook's train split and evaluate each epoch.

    Reads the module-level arrays ``sent_train``, ``label_train``,
    ``sent_test`` and ``label_test``.

    Parameters
    ----------
    epochs : int, default 25
        Number of passes over the training data.
    train_batch_size, test_batch_size : int, default 30 and 50
        Batch sizes of the two data loaders.
    lr : float, default 0.001
        Learning rate passed to Adam.
    seed_value : int, default 0
        Seed handed to ``seed`` before anything random happens.

    Returns
    -------
    Net
        The trained model (previously it was discarded when the function
        returned). Calling ``main()`` with no arguments trains exactly as before.
    """
    seed(seed_value)#fixing the randomness of the code

    #wrapping the arrays in datasets
    train_set=customdataloader(sent_train,label_train)
    test_set=customdataloader(sent_test,label_test)

    #training batches are reshuffled every epoch; test order stays fixed
    train_loader=torch.utils.data.DataLoader(train_set,num_workers=0,batch_size=train_batch_size,shuffle=True)
    test_loader=torch.utils.data.DataLoader(test_set,num_workers=0,batch_size=test_batch_size,shuffle=False)

    model=Net()

    optimizer=optim.Adam(model.parameters(),lr=lr)#choosing the optimizer and setting the learning rate

    for epoch in range(1,epochs+1):
        train(model,train_loader,optimizer,epoch)
        test(model,test_loader)

    return model

In [26]:
#run the full train/evaluate loop; the guard keeps it from running when this
#code is imported as a module instead of executed directly
if __name__=="__main__":
    main()

13.209996754300551
30.931515741642325
38.78610840636157
42.648490749756576
47.127555988315486
49.33463161311263
51.0548523206751
52.64524505030834
53.29438493995456
53.74878286270691
54.52775073028238
54.46283674131775
54.75494969165855
54.75494969165855
55.11197663096397
54.98214865303473
55.047062641999354
55.047062641999354
54.94969165855242
54.56020772476469
54.75494969165855
54.62512171372931
54.78740668614086
54.04089581304771
53.521583901330736


In [27]:
'''
Also tried converting the sentences to TF-IDF vectors using the built-in sklearn transformers, but the accuracy was the same.

'''
#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_extraction.text import TfidfTransformer
#bow_transformer=CountVectorizer(analyzer=text_process).fit(sentences)
#BOW=bow_transformer.transform(sentences)
#tfidf_transformer=TfidfTransformer().fit(BOW)
#BOW=tfidf_transformer.transform(BOW)
#from sklearn.decomposition import TruncatedSVD
#svd=TruncatedSVD(800)
#BOW=svd.fit_transform(BOW)