<a href="https://colab.research.google.com/github/mojtabaSefidi/DataScience-SmallProjects/blob/master/Sentiment_Analysis_Using_DeepLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q wordcloud
!pip install -q tqdm
!pip install -q emoji
!pip install -q swifter
!pip install -q sentence-transformers

In [2]:
import os
import pandas as pd
import numpy as np
import string
import nltk
import re
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn import preprocessing
import tqdm
import emoji
import swifter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import torch

## **Dataset**

### **Capturing dataset from Kaggle**

In [None]:
# !gdown 1R8waoO4GA-0SiyfadnSDcY4FeuNkTV3A
# ! pip install -q kaggle
# ! mkdir ~/.kaggle
# ! cp kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json
# ! kaggle datasets download -d kritanjalijain/amazon-reviews
# !unzip /content/amazon-reviews.zip
# os.rename('test.csv', 'Amazon_Review_Test.csv')
# os.rename('train.csv', 'Amazon_Review_Train.csv')
# os.remove("amazon-reviews.zip")

Downloading...
From: https://drive.google.com/uc?id=1R8waoO4GA-0SiyfadnSDcY4FeuNkTV3A
To: /content/kaggle.json
100% 73.0/73.0 [00:00<00:00, 176kB/s]
Downloading amazon-reviews.zip to /content
100% 1.29G/1.29G [00:40<00:00, 37.0MB/s]
100% 1.29G/1.29G [00:40<00:00, 34.0MB/s]
Archive:  /content/amazon-reviews.zip
  inflating: amazon_review_polarity_csv.tgz  
  inflating: test.csv                
  inflating: train.csv               


In [3]:
!gdown 1R8waoO4GA-0SiyfadnSDcY4FeuNkTV3A
! pip install -q kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d yasserh/twitter-tweets-sentiment-dataset
!unzip /content/twitter-tweets-sentiment-dataset.zip
os.rename('Tweets.csv', 'Sentiment_Analysis_Dataset.csv')
os.remove("/content/twitter-tweets-sentiment-dataset.zip")

Downloading...
From: https://drive.google.com/uc?id=1R8waoO4GA-0SiyfadnSDcY4FeuNkTV3A
To: /content/kaggle.json
  0% 0.00/73.0 [00:00<?, ?B/s]100% 73.0/73.0 [00:00<00:00, 277kB/s]
mkdir: cannot create directory ‘/root/.kaggle’: File exists
Downloading twitter-tweets-sentiment-dataset.zip to /content
  0% 0.00/1.23M [00:00<?, ?B/s]
100% 1.23M/1.23M [00:00<00:00, 90.1MB/s]


### **Introduction**

In [83]:
# Read the tweet CSV, discard every row that contains a missing value and
# renumber the index so that it is contiguous again.
dataset = (
    pd.read_csv('/content/Sentiment_Analysis_Dataset.csv')
    .dropna(axis=0)
    .reset_index(drop=True)
)
print(f'There are {len(dataset)} samples in the dataset.')

There are 27480 samples in the dataset.


In [84]:
# Display the first rows of the frame as a quick sanity check of the columns.
dataset.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


### **Analysis**

## **Pre-Processing**

In [85]:
# Fetch the NLTK stop-word corpus and keep the English word list in the
# module-level name `stopwords`.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words("english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [89]:
def process(text,
            remove_punctuaction=True,
            remove_stopwords=False):
  """Normalise a raw tweet into cleaned, lower-case text.

  Steps, in order: lower-case the text, convert emoji to their textual
  aliases, drop @mentions and http(s) URLs, unify apostrophe variants,
  expand common English contractions, optionally strip punctuation and
  stop words, and finally collapse runs of spaces.

  Args:
    text: the raw string to clean.
    remove_punctuaction: when True, delete every character listed in
      `string.punctuation`. (The parameter keeps its original spelling so
      existing keyword callers continue to work.)
    remove_stopwords: when True, drop every token that appears in the
      module-level `stopwords` list.

  Returns:
    The cleaned string, without leading or trailing spaces.
  """
  text = text.lower()
  text = emoji.demojize(text)
  text = re.sub(r'(?:\@|https?\://)\S+', '', text, flags=re.MULTILINE)
  # The dataset writes apostrophes as back-ticks (e.g. "i`d", "couldn`t"), and
  # curly quotes are common in tweets; map them to "'" so that the contraction
  # rules below actually match instead of being silently skipped.
  text = re.sub(r"[`\u2018\u2019]", "'", text)
  # Order matters: specific forms ("what's", "can't") must be expanded before
  # the generic suffix rules ("'s", "n't") consume them.
  text = re.sub(r"what's", "what is ", text)
  text = re.sub(r"'s", " ", text)
  text = re.sub(r"'ve", " have ", text)
  text = re.sub(r"can't", "can not ", text)
  text = re.sub(r"n't", " not ", text)
  text = re.sub(r"i'm", "i am ", text)
  text = re.sub(r"'re", " are ", text)
  text = re.sub(r"'d", " would ", text)
  text = re.sub(r"'ll", " will ", text)
  if remove_punctuaction:
    text = text.translate(str.maketrans(' ', ' ', string.punctuation))
  if remove_stopwords:
    # Set membership is O(1); the list lookup was linear for every token.
    stopword_set = set(stopwords)
    text = ' '.join(word for word in text.split() if word not in stopword_set)
  # Collapse repeated spaces and trim the edges so that a plain split on ' '
  # downstream does not produce empty tokens.
  text = re.sub(' +', ' ', text).strip()
  return text

In [90]:
def label_encoding(labels):
  """Encode class labels as integer ids.

  Fits a fresh scikit-learn `LabelEncoder` on `labels` and returns a tuple
  `(encoded, class_names)`: the encoded labels and the list of original
  class names taken from the encoder's `classes_` attribute.
  """
  encoder = preprocessing.LabelEncoder()
  encoded = encoder.fit_transform(labels)
  class_names = list(encoder.classes_)
  return encoded, class_names

In [91]:
# Clean every tweet with the default `process` settings, then replace the
# string sentiment labels by their encoded ids; `classes` holds the class names
# returned by `label_encoding`.
# NOTE(review): both columns are overwritten in place, so re-running this cell
# presumably re-encodes already-encoded labels — reload the dataset first.
dataset['text'] = dataset['text'].swifter.apply(lambda x: process(x))
dataset['sentiment'], classes = label_encoding(dataset['sentiment'])
classes

Pandas Apply:   0%|          | 0/27480 [00:00<?, ?it/s]

['negative', 'neutral', 'positive']

In [92]:
# Hold out 20% of the samples for testing, stratified on the sentiment label.
# `random_state` is fixed so that the split (and everything fitted on it
# afterwards) is identical on every Restart & Run All.
x_train_text, x_test_text, y_train, y_test = train_test_split(dataset['text'].to_numpy(),
                                                              dataset['sentiment'].to_numpy(),
                                                              test_size=0.2,
                                                              stratify=dataset['sentiment'].to_numpy(),
                                                              random_state=42)

## **Text Representation**

### **TF-IDF**

In [93]:
# Fit the TF-IDF vectorizer on the training texts only; the test texts are
# merely transformed with the already-fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(x_train_text)
X_test_tfidf = vectorizer.transform(x_test_text)

### **Text2Sequence**

In [94]:
# Build the word index from the training texts only, then turn both splits
# into integer sequences that are post-padded to a fixed length of 300.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train_text)
X_train, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(texts), padding='post', maxlen=300)
    for texts in (x_train_text, x_test_text)
)

### **Continuous Bag of Words (CBOW)**

### **Glove**

In [95]:
!gdown https://huggingface.co/stanfordnlp/glove/resolve/main/glove.42B.300d.zip
!unzip /content/glove.42B.300d.zip

Downloading...
From: https://huggingface.co/stanfordnlp/glove/resolve/main/glove.42B.300d.zip
To: /content/glove.42B.300d.zip
 22% 413M/1.88G [00:04<00:44, 33.2MB/s]Traceback (most recent call last):
  File "/usr/local/bin/gdown", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.10/dist-packages/gdown/cli.py", line 151, in main
    filename = download(
  File "/usr/local/lib/python3.10/dist-packages/gdown/download.py", line 275, in download
    f.write(chunk)
KeyboardInterrupt
 22% 419M/1.88G [00:04<00:17, 85.1MB/s]
^C
Archive:  /content/glove.42B.300d.zip
replace glove.42B.300d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# Parse the GloVe text file into a {word: float32 vector} dictionary.
# Each line is "<word> <v1> <v2> ...", separated by single spaces.
embedding_vector_glove = {}
# `with` guarantees the file handle is closed even if a line fails to parse.
with open('/content/glove.42B.300d.txt', encoding="utf8") as glove_file:
  for line in glove_file:
    # Drop the trailing newline so the last component is a clean number.
    value = line.rstrip().split(' ')
    word = value[0]
    embedding_vector_glove[word] = np.array(value[1:], dtype='float32')

In [None]:
# Represent every training sentence by the mean of the GloVe vectors of its
# tokens; tokens missing from GloVe are ignored.
vocab_size = len(tokenizer.word_index)+1
c=0  # NOTE(review): not used in this cell — kept in case later cells read it.
embedding_matrix_glove = np.zeros((x_train_text.shape[0],300))
for i, sentence in enumerate(x_train_text):
  embedding_vector = np.zeros(300)
  length = 0  # number of tokens that actually have a GloVe vector

  for token in sentence.split():
    token_vector = embedding_vector_glove.get(token)
    # Explicit miss check instead of a bare `except`, which also hid real
    # errors (and KeyboardInterrupt) behind the "unknown word" path.
    if token_vector is None:
      continue
    embedding_vector += token_vector
    length += 1

  if length < 1:
    # No known token: report the sentence and keep its row at zero rather than
    # dividing by zero, which would fill the row with NaN.
    print(sentence)
    continue
  embedding_matrix_glove[i] = embedding_vector/length

In [None]:
# Sanity check: one 300-d row per training sentence is expected.
embedding_matrix_glove.shape

In [None]:
# Shape of the padded token-id matrix built in the Text2Sequence step.
X_train.shape

### **Word2vec**

### **Bert Pretrained Embedding**

In [68]:
# Load the `bert-base-uncased` checkpoint through sentence-transformers,
# placing it on the GPU when CUDA is available and on the CPU otherwise.
bert_model = SentenceTransformer(
    'bert-base-uncased',
    device='cuda' if torch.cuda.is_available() else 'cpu',
)



In [70]:
# Encode all training texts with the BERT model loaded above.
# NOTE(review): the saved output shows this cell was interrupted, so it is
# presumably slow on the full training set — TODO confirm and consider caching.
embedding_matrix_bert = bert_model.encode(x_train_text)

KeyboardInterrupt: ignored

### **Visualization**

## **Baseline Model**

### **Model Architecture**

### **Model Training & Evaluation**

### **Comparison Study**

### **Comparing with pre-trained model**

## **Inference**