<a href="https://colab.research.google.com/github/natek-1/NLP-Projects/blob/main/nlp_neural_networks_and_deep_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Networks and Embeddings for Natural Language Processing

Outline:
- Download the Data
- Prepare Data for Training
- Logistic Regression Model
- Feed Forward Neural Network


Dataset: https://www.kaggle.com/c/quora-insincere-questions-classification

## Download the Data

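Upload your `kaggle.json` API token file to the Colab working directory (the next cells point `KAGGLE_CONFIG_DIR` at `.`, so the Kaggle CLI looks for it there).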

In [None]:
import os 

In [None]:
# Tell the Kaggle CLI to look for its credentials file (kaggle.json) in the current working directory.
os.environ['KAGGLE_CONFIG_DIR'] = '.'

In [None]:
# Fetch the three competition files one at a time; each is saved as <name>.zip in the working directory.
!kaggle competitions download -c quora-insincere-questions-classification -f train.csv
!kaggle competitions download -c quora-insincere-questions-classification -f test.csv
!kaggle competitions download -c quora-insincere-questions-classification -f sample_submission.csv

Downloading train.csv.zip to /content
 98% 54.0M/54.9M [00:02<00:00, 33.6MB/s]
100% 54.9M/54.9M [00:02<00:00, 25.4MB/s]
Downloading test.csv.zip to /content
 88% 14.0M/15.8M [00:01<00:00, 21.6MB/s]
100% 15.8M/15.8M [00:01<00:00, 16.4MB/s]
Downloading sample_submission.csv.zip to /content
 73% 3.00M/4.09M [00:00<00:00, 5.56MB/s]
100% 4.09M/4.09M [00:00<00:00, 6.08MB/s]


In [None]:
# Paths of the downloaded archives; they are handed to pd.read_csv as-is, without unzipping first.
train_fname = './train.csv.zip'
test_fname = './test.csv.zip'
sub_fname = './sample_submission.csv.zip'

In [None]:
import pandas as pd

In [None]:
# Load the zipped CSVs directly: the training data (its `question_text` and `target` columns are
# used below), the test questions, and the sample submission file.
raw_df = pd.read_csv(train_fname)
test_df = pd.read_csv(test_fname)
sub_df = pd.read_csv(sub_fname)

In [None]:
# Number of training rows to work with; the vectorizer fit and the train/validation split below
# only ever see this subset of raw_df.
SAMPLE_SIZE = 100_000
# Fixed random_state so the same rows are drawn on every run.
sample_df = raw_df.sample(SAMPLE_SIZE, random_state=42)

## Prepare Data for Training


Outline:
- Convert text to TF-IDF Vectors
- Split training & validation set
- Convert to PyTorch tensors

### Conversion to TF-IDF Vectors

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

In [None]:
# Tokenizer models used by word_tokenize. Recent NLTK releases load the 'punkt_tab' resource
# instead of the legacy 'punkt' pickle, so fetch both to keep the notebook runnable on a fresh
# kernel regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists read by stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Module-level helpers shared by tokenize() and the TF-IDF vectorizer defined below.
stemmer = SnowballStemmer(language='english')  # used by tokenize() to stem every token
english_stopwords = stopwords.words('english')  # passed to TfidfVectorizer as stop_words

In [None]:
def tokenize(text):
  """Tokenize `text` with NLTK's word_tokenize and return the stemmed tokens, in order."""
  stems = []
  for word in word_tokenize(text):
    stems.append(stemmer.stem(word))
  return stems

In [None]:
# TfidfVectorizer applies stop_words *after* tokenize(), i.e. to stemmed tokens. The raw NLTK
# list holds unstemmed forms ('because', "don't") that never match the tokenizer's output
# ('becaus', 'do' + "n't"), so extend it with every stop word run through the same tokenizer.
stemmed_stopwords = sorted(
    set(english_stopwords)
    | {token for word in english_stopwords for token in tokenize(word)}
)
# Keep the 1000 most frequent terms among uni-, bi- and tri-grams; the network's first layer
# is sized for this many input features.
vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words=stemmed_stopwords, max_features=1000, ngram_range=(1,3))

In [None]:
%%time
# Fit the vectorizer on the sampled questions only (not the full training set or the test set).
vectorizer.fit(sample_df.question_text)



CPU times: user 31.2 s, sys: 289 ms, total: 31.5 s
Wall time: 32.2 s


TfidfVectorizer(max_features=1000, ngram_range=(1, 3),
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                tokenizer=<function tokenize at 0x7ff28a04f280>)

### Transform the Questions into Vectors

In [None]:
%%time
# TF-IDF matrix for the sampled training questions; it is densified later via .toarray().
inputs = vectorizer.transform(sample_df.question_text)

CPU times: user 25.8 s, sys: 77.1 ms, total: 25.9 s
Wall time: 25.9 s


In [None]:
%%time
# Encode the full test set with the vectorizer that was fitted on the training sample.
test_inputs = vectorizer.transform(test_df.question_text)

CPU times: user 1min 41s, sys: 293 ms, total: 1min 41s
Wall time: 1min 45s


### Split training and validation set

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the sample for validation. `stratify` keeps the (imbalanced) class ratio the
# same in both splits, and a fixed random_state makes the split reproducible across runs.
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs, sample_df.target, test_size=0.3, random_state=42, stratify=sample_df.target
)

### Convert to PyTorch Tensors

In [None]:
import torch 

In [None]:
# Cast to float32 while the matrices are still sparse so the dense arrays are created directly
# in the dtype the model uses, instead of materialising a float64 array of twice the size first.
train_input_tensors = torch.tensor(train_inputs.astype('float32').toarray())
val_input_tensors = torch.tensor(val_inputs.astype('float32').toarray())
# The binary cross-entropy loss expects floating-point targets, so the 0/1 labels become floats.
train_target_tensors = torch.tensor(train_targets.values).float()
val_target_tensors = torch.tensor(val_targets.values).float()

In [None]:
# Cast to float32 before densifying: the dense test matrix is large, and going through a
# float64 array first would need twice the memory for the same final values.
test_input_tensors = torch.tensor(test_inputs.astype('float32').toarray())

In [None]:
from torch.utils.data import TensorDataset, DataLoader

In [None]:

# Pair each input row with its label so a DataLoader yields (inputs, targets) batches.
train_ds = TensorDataset(train_input_tensors, train_target_tensors)
val_ds = TensorDataset(val_input_tensors, val_target_tensors)
# Only inputs are wrapped here, so batches from this dataset are 1-tuples without targets.
test_ds = TensorDataset(test_input_tensors)

In [None]:
# Single knob for the mini-batch size used by all three loaders.
BATCH_SIZE = 128

# Shuffle only the training data. Evaluation gains nothing from shuffling, and a fixed order
# keeps validation metrics repeatable from one call to the next.
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE)

## Deep Learning Model

In [None]:
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score

In [None]:
class QuoraNet(nn.Module):
  """Feed-forward binary classifier: five linear layers with ReLU between them.

  Layer widths shrink in_features -> 512 -> 256 -> 128 -> 64 -> 1. The final layer has no
  activation, so the output is a raw logit per sample; apply a sigmoid to get a probability.

  Args:
    in_features: number of input features per sample. Must equal the width of the feature
      vectors fed to the model (the TF-IDF vocabulary size). Defaults to 1000.
  """

  def __init__(self, in_features=1000):
    super().__init__()
    self.layer1 = nn.Linear(in_features, 512)
    self.layer2 = nn.Linear(512, 256)
    self.layer3 = nn.Linear(256, 128)
    self.layer4 = nn.Linear(128, 64)
    self.layer5 = nn.Linear(64, 1)

  def forward(self, inputs):
    """Map a (batch, in_features) float tensor to (batch, 1) logits."""
    out = inputs
    # ReLU follows every hidden layer; the output layer is applied separately below.
    for hidden in (self.layer1, self.layer2, self.layer3, self.layer4):
      out = F.relu(hidden(out))
    return self.layer5(out)


In [None]:
# Instantiate the network with freshly initialised (untrained) weights.
model = QuoraNet()

In [None]:
def evaluate(model, dl):
  """Compute loss, accuracy and F1 of `model` over every batch in `dl`.

  Args:
    model: module mapping a float input batch to logits of shape (batch, 1).
    dl: iterable of (inputs, targets) batches; targets are float 0/1 labels.

  Returns:
    Tuple (loss, accuracy, f1) of Python floats. The loss is the mean binary cross-entropy
    per sample. Accuracy and F1 are computed once over all predictions rather than averaged
    across batches. All three are NaN when `dl` yields no samples.
  """
  was_training = model.training
  model.eval()  # switch off train-only behaviour while scoring
  loss_sum, n_samples = 0.0, 0
  all_targets, all_preds = [], []
  try:
    with torch.no_grad():
      for inputs, targets in dl:
        logits = model(inputs)[:, 0]
        # Logit-space BCE stays numerically stable where the sigmoid saturates; summing lets
        # us form an exact per-sample mean even when the last batch is smaller.
        loss_sum += F.binary_cross_entropy_with_logits(logits, targets, reduction='sum').item()
        n_samples += targets.numel()
        # sigmoid(z) > 0.5  <=>  z > 0, so threshold the logits directly.
        all_preds.extend((logits > 0).int().tolist())
        all_targets.extend(targets.int().tolist())
  finally:
    # Leave the model in whatever mode the caller had it in.
    model.train(was_training)

  if n_samples == 0:
    nan = float('nan')
    return nan, nan, nan

  # F1 is not additive across batches (a batch without positives scores 0), so both metrics
  # are computed once on the full set of predictions.
  acc = float(accuracy_score(all_targets, all_preds))
  f1 = float(f1_score(all_targets, all_preds))
  return loss_sum / n_samples, acc, f1



In [None]:
# Sanity check before training: (loss, accuracy, F1) of the untrained model on the training set.
evaluate(model, train_dl)

(0.6947637796401978, 0.06032906845211983, 0.1130298301577568)

In [None]:
# Same baseline on the validation set: (loss, accuracy, F1) of the untrained model.
evaluate(model, val_dl)

(0.6947630643844604, 0.06044991314411163, 0.11316610872745514)

In [None]:
def fit(epochs, lr, model, train_dl, val_dl):
  """Train `model` with Adam and report validation metrics after every epoch.

  Args:
    epochs: number of full passes over `train_dl`.
    lr: learning rate for the Adam optimizer.
    model: module mapping a float input batch to logits of shape (batch, 1).
    train_dl: iterable of (inputs, targets) training batches; targets are float 0/1 labels.
    val_dl: iterable of validation batches, passed to evaluate() after each epoch.

  Returns:
    List with one [loss, accuracy, f1] entry per epoch, as returned by evaluate() on `val_dl`.
  """
  # weight_decay adds a small L2 penalty on the weights.
  optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
  history = []

  for epoch in range(epochs):
    model.train()  # make sure train-time behaviour is active at the start of each epoch
    for inputs, targets in train_dl:
      # Clear gradients before the backward pass so anything accumulated earlier (for
      # example by code that ran before fit) cannot leak into this update.
      optimizer.zero_grad()
      logits = model(inputs)[:, 0]
      # Sigmoid and BCE fused into one numerically stable op on the raw logits.
      batch_loss = F.binary_cross_entropy_with_logits(logits, targets)
      batch_loss.backward()
      optimizer.step()

    val_loss, val_acc, val_f1 = evaluate(model, val_dl)
    print(f'Epoch {epoch+1}, loss: {val_loss:.4}, Accuracy:{val_acc:.4}, f1: {val_f1:.4}')
    history.append([val_loss, val_acc, val_f1])

  return history
  


In [None]:
# Train for 5 epochs with learning rate 1e-3; `history` collects the per-epoch
# [loss, accuracy, f1] measured on the validation loader.
history = fit(5, 0.001, model, train_dl, val_dl)

Epoch 1, loss: 0.2002, Accuracy:0.9394, f10.3817
Epoch 2, loss: 0.2742, Accuracy:0.9325, f10.3778
Epoch 3, loss: 0.3637, Accuracy:0.9351, f10.4012
Epoch 4, loss: 0.3793, Accuracy:0.9397, f10.3595
Epoch 5, loss: 0.3993, Accuracy:0.9368, f10.3549


## Logistic Regression Model

## Feed Forward Neural Network

## Make Predictions and Submit