In [None]:
import torch.optim as optim

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger') 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
import pandas as pd
import numpy as np
import regex as re
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
from collections import defaultdict
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import stopwords

from sklearn import model_selection
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
#from pycontractions import pycontractions


np.random.seed(666)
df = pd.read_csv("english_dataset.tsv", sep="\\t", engine='python')

df["text"].dropna(inplace = True)
df["text"] = [re.sub('\d', " ", i) for i in df["text"]]
df["text"] = [re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", i) for i in df['text']]

df["text"] = [word_tokenize(i) for i in df["text"]]

tag_dict = defaultdict(lambda: wn.NOUN)
tag_dict["J"] = wn.ADJ
tag_dict["V"] = wn.VERB
tag_dict["R"] = wn.ADV

for index, val in enumerate(tqdm(df["text"])):
    res = []
    word_lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(val):
        if word not in stopwords.words("english") and word.isalpha():
            word_final = word_lemmatized.lemmatize(word, tag_dict[tag[0]])
            res.append(word_final)
    df.loc[index, "text_final"] = str(res).lower()

Train_X, Test_X, Train_Y1, Test_Y1, Train_Y2, Test_Y2, Train_Y3, Test_Y3 = model_selection.train_test_split(df['text_final'],df['task_1'],df['task_2'],df['task_3'],test_size=0.3)

Encoder = LabelEncoder()
Train_Y1 = Encoder.fit_transform(Train_Y1)
Test_Y1 = Encoder.fit_transform(Test_Y1)
Train_Y2 = Encoder.fit_transform(Train_Y2)
Test_Y2 = Encoder.fit_transform(Test_Y2)
Train_Y3 = Encoder.fit_transform(Train_Y3)
Test_Y3 = Encoder.fit_transform(Test_Y3)

pickle.dump(Train_Y1, open("TrainY1.pkl", "wb"))
pickle.dump(Test_Y1, open("TestY1.pkl", "wb"))
pickle.dump(Train_Y2, open("TrainY2.pkl", "wb"))
pickle.dump(Test_Y2, open("TestY2.pkl", "wb"))
pickle.dump(Train_Y3, open("TrainY3.pkl", "wb"))
pickle.dump(Test_Y3, open("TestY3.pkl", "wb"))

tfidf_vector = TfidfVectorizer(max_features= 5000)
tfidf_vector.fit(df["text_final"])

Train_X_Tfidf = tfidf_vector.transform(Train_X)
Test_X_Tfidf = tfidf_vector.transform(Test_X)

pickle.dump(Train_X_Tfidf, open("TrainXTfidf.pkl", "wb"))
pickle.dump(Test_X_Tfidf, open("TestXTfidf.pkl", "wb"))



100%|██████████| 5852/5852 [00:36<00:00, 159.91it/s]


In [None]:
len(Train_X)

4096

In [None]:
len(Train_X[0])

242

In [None]:
tfidf_vector.get_feature_names()

['aajtak',
 'abandon',
 'abc',
 'ability',
 'able',
 'abortion',
 'about',
 'abpanandatv',
 'abpnewstv',
 'abrandnewdayoutnow',
 'absence',
 'absolute',
 'absolutely',
 'abt',
 'abu',
 'abuse',
 'abuser',
 'abusing',
 'abusive',
 'ac',
 'accept',
 'acceptable',
 'access',
 'accident',
 'accidentally',
 'accomplish',
 'accord',
 'according',
 'account',
 'accountability',
 'accountable',
 'acct',
 'accuse',
 'accused',
 'achieve',
 'achievement',
 'acid',
 'acosta',
 'across',
 'act',
 'actbrigitte',
 'action',
 'activism',
 'activist',
 'activity',
 'actor',
 'actual',
 'actually',
 'ad',
 'adam',
 'adamcbest',
 'add',
 'additional',
 'address',
 'adgpi',
 'adhir',
 'adhirrcinc',
 'adityarajkaul',
 'adivce',
 'admin',
 'administration',
 'admire',
 'admit',
 'admitted',
 'adult',
 'adulterer',
 'advance',
 'advantage',
 'adversary',
 'advice',
 'advisor',
 'advocate',
 'af',
 'affair',
 'affect',
 'afford',
 'afg',
 'afghanistan',
 'afraid',
 'africa',
 'afridarahmanali',
 'after',
 'a

In [None]:
Train_X_Tfidf

<4096x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 53300 stored elements in Compressed Sparse Row format>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):

    def __init__(self, sentence_size, hidden_nodes):
        super(Net, self).__init__()
       
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(sentence_size, hidden_nodes)
        self.fc2 = nn.Linear(hidden_nodes, hidden_nodes//2)
        self.fc3 = nn.Linear(hidden_nodes//2, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        #x = self.dropout(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features


net = Net(sentence_size = 5000, hidden_nodes = 100)

In [None]:
print(net)

Net(
  (fc1): Linear(in_features=5000, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [None]:
type(Train_X_Tfidf.todense())

numpy.matrix

In [None]:
len(Train_Y1)

4096

In [None]:
Train_Y1 = Train_Y1.reshape((len(Train_Y1),1))

In [None]:
epochs = 500
input = Train_X_Tfidf.todense().astype(np.float32)
target = torch.from_numpy(Train_Y1.astype(np.float32))
optimizer = optim.SGD(net.parameters(), lr=0.01)
criterion = nn.MSELoss()
optimizer.zero_grad()
for epoch in range(epochs):
  output = net(torch.from_numpy(input))
  loss = criterion(output, target)
  loss.backward()
  optimizer.step() 
  print(loss)

tensor(0.4850, grad_fn=<MseLossBackward>)
tensor(0.4733, grad_fn=<MseLossBackward>)
tensor(0.4511, grad_fn=<MseLossBackward>)
tensor(0.4204, grad_fn=<MseLossBackward>)
tensor(0.3842, grad_fn=<MseLossBackward>)
tensor(0.3458, grad_fn=<MseLossBackward>)
tensor(0.3090, grad_fn=<MseLossBackward>)
tensor(0.2773, grad_fn=<MseLossBackward>)
tensor(0.2536, grad_fn=<MseLossBackward>)
tensor(0.2402, grad_fn=<MseLossBackward>)
tensor(0.2376, grad_fn=<MseLossBackward>)
tensor(0.2458, grad_fn=<MseLossBackward>)
tensor(0.2640, grad_fn=<MseLossBackward>)
tensor(0.2902, grad_fn=<MseLossBackward>)
tensor(0.3220, grad_fn=<MseLossBackward>)
tensor(0.3564, grad_fn=<MseLossBackward>)
tensor(0.3900, grad_fn=<MseLossBackward>)
tensor(0.4194, grad_fn=<MseLossBackward>)
tensor(0.4420, grad_fn=<MseLossBackward>)
tensor(0.4556, grad_fn=<MseLossBackward>)
tensor(0.4590, grad_fn=<MseLossBackward>)
tensor(0.4520, grad_fn=<MseLossBackward>)
tensor(0.4354, grad_fn=<MseLossBackward>)
tensor(0.4110, grad_fn=<MseLossBac

In [None]:
temp = -1
threshold = -1
for i in np.arange(0, 1, 0.01):
  temp_sum = (sum(target == (output > i)))
  if temp <= temp_sum:
    temp = temp_sum
    threshold = i

In [None]:
y_pred = np.array(output > threshold, dtype = np.float32)

In [None]:
y_pred

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]], dtype=float32)

In [None]:
import sklearn
sklearn.metrics.confusion_matrix(Train_Y1, y_pred)
sklearn.metrics.f1_score(Train_Y1, y_pred)

0.7601029211442409

In [None]:
input = Test_X_Tfidf.todense().astype(np.float32)
Test_Y1  = Test_Y1.reshape((len(Test_Y1),1))
target = torch.from_numpy(Test_Y1.astype(np.float32))
output = net(torch.from_numpy(input))
loss = criterion(output, target)
print(loss)

tensor(0.2781, grad_fn=<MseLossBackward>)


In [None]:
y_pred = np.array(output > threshold, dtype = np.float32)
sklearn.metrics.confusion_matrix(Test_Y1, y_pred)

array([[   0,  676],
       [   0, 1080]])

In [None]:
sklearn.metrics.f1_score(Test_Y1, y_pred)

0.7616361071932299