In [None]:
### Using spaCy for text processing, torchtext for the data pipelines, and pandas for exploring and cleaning our data

# Step 2: Download the dataset using wget
!wget https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip

# Step 3: Unzip the file
!unzip trainingandtestdata.zip

--2024-11-17 17:50:40--  https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81363704 (78M) [application/zip]
Saving to: ‘trainingandtestdata.zip.1’


2024-11-17 17:50:52 (6.34 MB/s) - ‘trainingandtestdata.zip.1’ saved [81363704/81363704]

Archive:  trainingandtestdata.zip
replace testdata.manual.2009.06.14.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
import pandas as pd
# Step 4: Read the training CSV into a DataFrame.
# The dataset file is 'training.1600000.processed.noemoticon.csv'. It is read
# with header=None, so pandas labels the columns with integers starting at 0.
tweetsDF = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    encoding='latin-1',  # Encoding ensures proper loading of special characters
    header=None,
)

# Step 5: Preview the first few rows
tweetsDF.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Count how many rows carry each distinct value of column 0 (the raw sentiment score).
tweetsDF.loc[:, 0].value_counts()

Unnamed: 0_level_0,count
0,Unnamed: 1_level_1
0,800000
4,800000


In [None]:
# Convert the raw score column to a pandas categorical, then expose its integer
# category codes as the numeric label column.
sentiment_as_category = tweetsDF[0].astype("category")
tweetsDF["sentiment_cat"] = sentiment_as_category
tweetsDF["sentiment"] = sentiment_as_category.cat.codes

# Preview the frame with the two new columns appended on the right.
tweetsDF.head()

Unnamed: 0,0,1,2,3,4,5,sentiment_cat,sentiment
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,0
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,0,0
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,0,0
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,0,0
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",0,0


In [None]:
# Write the full frame (including the two sentiment columns) without a header
# row or index column, so the file contains data rows only.
tweetsDF.to_csv("train-processed.csv", header=False, index=False)

# Also write a 10,000-row random sample for quick experiments. The fixed
# random_state makes the sample identical on every run of the notebook.
tweetsDF.sample(10000, random_state=42).to_csv(
    "train-processed-sample.csv", header=False, index=False)

In [None]:
!pip uninstall -y torchtext
!pip install torchtext==0.6.0

Found existing installation: torchtext 0.6.0
Uninstalling torchtext-0.6.0:
  Successfully uninstalled torchtext-0.6.0
Collecting torchtext==0.6.0
  Using cached torchtext-0.6.0-py3-none-any.whl.metadata (6.3 kB)
Using cached torchtext-0.6.0-py3-none-any.whl (64 kB)
Installing collected packages: torchtext
Successfully installed torchtext-0.6.0


In [None]:
# Print the version of the `python` executable found on the shell PATH.
!python --version

Python 3.10.12


In [None]:
import torch
# Display the installed PyTorch version string for reference.
torch.__version__

'2.1.2+cu121'

In [None]:
import torchtext

# Display the imported torchtext version to confirm it matches the pinned install.
torchtext.__version__

'0.6.0'

In [None]:
from torchtext import data

# Field for the tweet text: lower-cases tokens; no tokenizer is passed, so the
# torchtext Field default is used.
TWEET = data.Field(lower=True)

# Field for the classification target.
LABEL = data.LabelField()

In [None]:
# One (name, field) pair per CSV column, in column order. Columns paired with
# None are not loaded into the examples; only 'tweet' and 'label' are kept.
fields = [
 ('score', None),
 ('id', None),
 ('date', None),
 ('query', None),
 ('name', None),
 ('tweet', TWEET),
 ('category', None),
 ('label', LABEL),
]

In [None]:
# Parse the processed CSV into torchtext examples using the column/field
# mapping defined in `fields`. The file has no header row, so nothing is skipped.
twitterDataset = data.TabularDataset(
 fields=fields,
 format="CSV",
 path="train-processed.csv",
 skip_header=False)

In [None]:
# Dataset.split reads split_ratio as the relative sizes of (train, test, valid)
# but returns the datasets in (train, valid, test) order, so unpack in the
# returned order to keep the validation and test sets correctly named.
(train, valid, test) = twitterDataset.split(split_ratio=[0.8, 0.1, 0.1])

# Show the number of examples in each split.
(len(train),len(test),len(valid))

(1280000, 160000, 160000)

In [None]:
# Inspect one training example as a dict of its loaded attributes.
vars(train.examples[7])

{'tweet': ['off', 'to', 'the', 'magical', 'land', 'of', 'work'], 'label': '0'}

In [None]:
# Build the token vocabulary from the training split only, capped at the
# `vocab_size` most frequent tokens.
vocab_size = 20000
TWEET.build_vocab(train, max_size = vocab_size)

# The label field needs its own vocabulary to turn label strings into class
# indices. Without this call, iterating over batches fails with
# "AttributeError: 'LabelField' object has no attribute 'vocab'".
LABEL.build_vocab(train)

In [None]:
# Number of entries in the tweet vocabulary (the recorded run shows 2 more than max_size).
len(TWEET.vocab)

20002

In [None]:
# The ten most frequent tokens in the training split, with their counts.
TWEET.vocab.freqs.most_common(10)

[('i', 597357),
 ('to', 448579),
 ('the', 415213),
 ('a', 301099),
 ('my', 250401),
 ('and', 236517),
 ('you', 190594),
 ('is', 184651),
 ('for', 171475),
 ('in', 167981)]

In [None]:
# Create one iterator per split, batching 32 examples at a time.
# sort_key tells BucketIterator how to measure an example (its token count) so
# that similarly long tweets are batched together. It is also required by the
# validation/test iterators, which sort their examples: without a key they
# would try to compare Example objects directly and fail.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
 (train, valid, test),
 batch_size = 32,
 sort_key = lambda example: len(example.tweet))

In [None]:
import torch.nn as nn

class Lstm(nn.Module):
  """Single-layer LSTM classifier producing two class scores per sequence."""

  def __init__(self, hidden_size, embedding_dim, vocab_size):
    """Build the embedding, LSTM encoder and linear output layers.

    Args:
      hidden_size: size of the LSTM hidden state.
      embedding_dim: size of each token embedding vector.
      vocab_size: number of rows in the embedding table.
    """
    super().__init__()
    self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                  embedding_dim=embedding_dim)
    self.encoder = nn.LSTM(embedding_dim, hidden_size, num_layers=1)
    self.predictor = nn.Linear(in_features=hidden_size, out_features=2)

  def forward(self, seq):
    """Return the class scores computed from the LSTM's final hidden state.

    Args:
      seq: integer token-id tensor; the LSTM is built with the default
        batch_first=False, so it treats dim 0 as the sequence axis.
    """
    embedded = self.embedding(seq)
    _, (final_hidden, _) = self.encoder(embedded)
    # The hidden state has a leading num_layers axis of size 1; drop it so the
    # linear layer receives one vector per sequence.
    return self.predictor(final_hidden.squeeze(0))

# Hidden size 100, embedding size 300. The embedding table is sized from the
# vocabulary that was actually built rather than a hard-coded 20002, so it
# stays correct if max_size or the training data changes.
model = Lstm(100, 300, len(TWEET.vocab))

In [None]:
from torch import optim

# Cross-entropy loss over the model's class scores, optimised with Adam at a
# learning rate of 0.02 over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-2)

# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): `device` is only selected here; none of the cells visible in
# this notebook move the model or the batches onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def train(epochs, model, optimizer, criterion, train_iterator, valid_iterator):
  """Train `model` for `epochs` epochs, printing the mean loss after each one.

  Every batch yielded by the iterators must expose `.tweet` (the model input)
  and `.label` (the targets, one entry per example).

  The printed losses are per-example averages: each batch loss is weighted by
  the number of labels in the batch and the total is divided by the number of
  examples seen. This assumes `criterion` returns the mean loss of a batch,
  which is the default reduction of nn.CrossEntropyLoss.

  Args:
    epochs: number of passes over `train_iterator`.
    model: module called as `model(batch.tweet)`.
    optimizer: optimizer updating the model's parameters.
    criterion: loss called as `criterion(predictions, batch.label)`.
    train_iterator: iterable of training batches.
    valid_iterator: iterable of validation batches.

  Note:
    Defining this function rebinds the notebook-level name `train`, which
    earlier cells use for the training split. Re-run the split cell before
    re-running any cell that needs the dataset under that name.
  """
  for epoch in range(1, epochs + 1):
    training_loss = 0.0
    valid_loss = 0.0
    training_examples = 0
    valid_examples = 0

    model.train()
    for batch in train_iterator:
      optimizer.zero_grad()
      predict = model(batch.tweet)
      loss = criterion(predict, batch.label)
      loss.backward()
      optimizer.step()
      # Weight by the number of examples. batch.label has one entry per
      # example, whereas dim 0 of batch.tweet is the sequence length when the
      # text field is not batch-first.
      batch_examples = batch.label.size(0)
      training_loss += loss.item() * batch_examples
      training_examples += batch_examples

    # max(..., 1) keeps an empty iterator from raising ZeroDivisionError.
    training_loss /= max(training_examples, 1)
    print('Epoch: {}, Training Loss: {:.2f}'.format(epoch, training_loss))

    model.eval()
    # Validation only needs forward passes; skip autograd bookkeeping.
    with torch.no_grad():
      for batch in valid_iterator:
        predict = model(batch.tweet)
        loss = criterion(predict, batch.label)
        batch_examples = batch.label.size(0)
        valid_loss += loss.item() * batch_examples
        valid_examples += batch_examples

    valid_loss /= max(valid_examples, 1)
    print('Epoch: {}, Validation Loss: {:.2f}'.format(epoch, valid_loss))

# Run five training epochs; losses are printed after each one.
train(epochs=5, model=model, optimizer=optimizer, criterion=criterion,
      train_iterator=train_iterator, valid_iterator=valid_iterator)

AttributeError: 'LabelField' object has no attribute 'vocab'

In [None]:
##### back translation for text augmuntation

!pip install googletrans==4.0.0-rc1

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2024.11.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->goog

In [None]:
from googletrans import Translator

# Initialize the translator
translator = Translator()

# Text to translate
sentences = ['The cat sat on the mat']

# Translate each sentence to French. The source language is stated explicitly
# on both hops so the result does not depend on language auto-detection.
translations_fr = [translator.translate(sentence, src='en', dest='fr').text for sentence in sentences]
print("French Translations:", translations_fr)

# Translate the French translations back to English
translations_en = [translator.translate(text, src='fr', dest='en').text for text in translations_fr]
print("Back to English:", translations_en)


French Translations: ['Le chat était assis sur le tapis']
Back to English: ['The cat was sitting on the carpet']


In [None]:
import random
from googletrans import Translator, LANGUAGES

# Initialize the translator
translator = Translator()

# Define the sentences to translate
sentences = ['The cat sat on the mat']

# Select a random pivot language. English is excluded: an English -> English
# round trip would hand the sentence back unchanged and augment nothing.
available_langs = [code for code in LANGUAGES if code != 'en']
random_lang = random.choice(available_langs)

print(f"Translating to {LANGUAGES[random_lang]} ({random_lang})")

# Translate each sentence to the random language, stating the source language
# explicitly instead of relying on auto-detection.
translations = [translator.translate(sentence, src='en', dest=random_lang).text for sentence in sentences]
print("Translated Text:", translations)

# Translate back to English
translations_en = [translator.translate(text, src=random_lang, dest='en').text for text in translations]
print("Back to English:", translations_en)

Translating to yoruba (yo)
Translated Text: ['Ologbo joko lori akete']
Back to English: ['Cats sitting on the couch']
