In [44]:
# Report the version of the Python interpreter running this notebook.
from platform import python_version

interpreter_version = python_version()
print(interpreter_version)

3.8.15


In [45]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Conv1D, MaxPooling1D
from keras.layers import Embedding

In [46]:
# Display the installed TensorFlow version string.
tf.__version__

'2.10.0'

In [47]:
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from matplotlib import pyplot

In [50]:
from charset_normalizer import from_path

# Dataset 1's encoding is not known in advance, so it is sniffed from the raw
# bytes and the detected codec is handed to pandas.
file_path = "Dataset/SentimentAnalysisDataset1.csv"
result = from_path(file_path).best()
if result is None:
    # `best()` returns None when no candidate encoding fits the file; fail with
    # a clear message instead of an AttributeError on `result.encoding`.
    raise ValueError(f"Could not detect a text encoding for {file_path!r}")
print("Detected encoding:", result.encoding)

df1 = pd.read_csv(file_path, encoding=result.encoding)

print(df1.head())

Detected encoding: iso8859_10
   ItemID  Sentiment                                      SentimentText
0       1          0                       is so sad for my APL frie...
1       2          0                     I missed the New Moon trail...
2       3          1                            omg its already 7:30 :O
3       4          0            .. Omgaga. Im sooo  im gunna CRy. I'...
4       5          0           i think mi bf is cheating on me!!!   ...


In [79]:
# Dataset 2 is re-encoded from cp1252 to UTF-8 into a sibling file, and that
# UTF-8 copy is what pandas loads. Bytes that cp1252 cannot decode are replaced
# rather than raising.
file_path = "Dataset/SentimentAnalysisDataset2.csv"
utf8_path = "Dataset/SentimentAnalysisDataset2_UTF8.csv"

with open(file_path, 'rb') as f:
    content = f.read()

decoded_text = content.decode('cp1252', errors='replace')

with open(utf8_path, 'wb') as f:
    f.write(decoded_text.encode('utf-8'))

df2 = pd.read_csv(utf8_path)
print(df2.head())

   ItemID  Sentiment                                      SentimentText
0       1          0                       is so sad for my APL frie...
1       2          0                     I missed the New Moon trail...
2       3          1                            omg its already 7:30 :O
3       4          0            .. Omgaga. Im sooo  im gunna CRy. I'...
4       5          0           i think mi bf is cheating on me!!!   ...


In [58]:
import pandas as pd

# Stack both sources row-wise into a single frame with a fresh 0..n-1 index.
# NOTE(review): the head() outputs of df1 and df2 are identical, so the two
# files may overlap — check for duplicated rows before training.
frames = [df1, df2]
train_df = pd.concat(frames, ignore_index=True)

print(train_df)

         ItemID  Sentiment                                      SentimentText
0             1          0                       is so sad for my APL frie...
1             2          0                     I missed the New Moon trail...
2             3          1                            omg its already 7:30 :O
3             4          0            .. Omgaga. Im sooo  im gunna CRy. I'...
4             5          0           i think mi bf is cheating on me!!!   ...
...         ...        ...                                                ...
1148559   99996          0  @Cupcake  seems like a repeating problem   hop...
1148560   99997          1  @cupcake__ arrrr we both replied to each other...
1148561   99998          0                     @CuPcAkE_2120 ya i thought so 
1148562   99999          1  @Cupcake_Dollie Yes. Yes. I'm glad you had mor...
1148563  100000          1                    @cupcake_kayla haha yes you do 

[1148564 rows x 3 columns]


In [80]:
# (rows, columns) of the combined frame.
train_df.shape

(1148564, 3)

In [66]:
# Column labels of the combined frame.
train_df.columns

Index(['ItemID', 'Sentiment', 'SentimentText'], dtype='object')

In [67]:
# Preview the first rows of the combined frame.
train_df.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


In [68]:
# Narrow the frame to the tweet text and its label, text column first.
kept_columns = ["SentimentText", "Sentiment"]
train_df = train_df[kept_columns]

In [69]:
# Preview the frame after the column selection.
train_df.head()

Unnamed: 0,SentimentText,Sentiment
0,is so sad for my APL frie...,0
1,I missed the New Moon trail...,0
2,omg its already 7:30 :O,1
3,.. Omgaga. Im sooo im gunna CRy. I'...,0
4,i think mi bf is cheating on me!!! ...,0


In [73]:
# Persist the combined frame to CSV; index=False leaves the row index out of the file.
train_df.to_csv("Dataset/SentimentAnalysisDataset.csv", index=False)

In [74]:
# Distinct label values present in the Sentiment column.
train_df.Sentiment.unique()

array([0, 1])

In [75]:
# Number of rows per label value, to check class balance.
train_df.Sentiment.value_counts()

1    610927
0    537637
Name: Sentiment, dtype: int64

In [76]:
# Label Series taken from the Sentiment column.
Y = train_df['Sentiment']

### Explanation of the Code

Here’s a breakdown of the two lines of code and what they do:

#### **Code:**
```python
train_df['sentence_no_punctuation'] = train_df['SentimentText'].str.replace('[^\w\s]','')
```

**What it does:**
- This creates a new column in the DataFrame called `sentence_no_punctuation`.
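- It removes punctuation and other symbols from the text in the `SentimentText` column.
  - **`str.replace()`**: Applies a string replacement to every row in the column. The pattern is treated as a regular expression only when `regex=True`; that was the default in pandas 1.x (with a `FutureWarning`), but pandas 2.0 changed the default to `regex=False`, so pass `regex=True` explicitly.
  - **`[^\w\s]`**: This is a **regular expression (regex)** pattern that matches any character **except** word characters (`\w`: letters, digits, and underscore) or whitespace (`\s`). Underscores are therefore kept.
  - **`''`**: Replaces the matched characters (punctuation) with an empty string.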

---

#### **Code:**
```python
train_df['sentence_no_punctuation'] = train_df["sentence_no_punctuation"].fillna("fillna")
```

**What it does:**
- This replaces any missing (`NaN`) values in the `sentence_no_punctuation` column with the string `"fillna"`.
- **`fillna("fillna")`**: 
  - Ensures that the column has no `NaN` values, replacing them with the string `"fillna"`.
  - Useful for handling missing data when working with text processing.

---

### **Purpose**
1. **Remove punctuation**: To simplify text data for tasks like sentiment analysis or tokenization.
2. **Handle missing values**: Ensures the new column is consistent and doesn’t contain any `NaN` values, which might cause errors during analysis.

---

### **Result**
After executing this code:
- The column `sentence_no_punctuation` will contain the same text as `SentimentText`, but with all punctuation removed.
- Any `NaN` values in this new column will be replaced by `"fillna"`.

In [77]:
# Build a cleaned copy of the tweet text for tokenization.
# - regex=True is passed explicitly: pandas 1.x only defaulted to it (with a
#   FutureWarning) and pandas >= 2.0 defaults to a literal match, which would
#   leave the text unchanged.
# - The raw string keeps `\w` / `\s` from being read as string escapes.
# - `[^\w\s]` drops everything except word characters and whitespace
#   (underscores count as word characters and are kept).
# - Rows with missing text stay NaN through .str.replace and are then filled
#   with the placeholder string "fillna".
train_df['sentence_no_punctuation'] = (
    train_df['SentimentText']
    .str.replace(r'[^\w\s]', '', regex=True)
    .fillna("fillna")
)

  train_df['sentence_no_punctuation'] = train_df['SentimentText'].str.replace('[^\w\s]','')


In [85]:
# NOTE(review): hard-coded; this equals the len(tok.word_index) value printed
# further down after the tokenizer is fitted, so it goes stale if the data changes.
max_features = 656158
# NOTE(review): maxlen is not used in the cells visible here — presumably the
# padded sequence length; confirm against the later cells.
maxlen = 600

### Explanation of the Code

#### **Code:**
```python
tok = tf.keras.preprocessing.text.Tokenizer(num_words=max_features)
```

**What it does:**
- Creates an instance of the `Tokenizer` class from TensorFlow's `keras.preprocessing.text` module.
- This tokenizer is used for preparing text data for machine learning models, specifically for converting text into numerical sequences.

---

### **Parameters**

- **`num_words=max_features`**: 
  - Specifies the maximum number of words to keep, based on word frequency.
  - Only the `max_features - 1` most frequent words in the text corpus are used (index `0` is reserved and never assigned to a word).
  - Any words beyond this limit are ignored when text is converted to sequences (e.g. by `texts_to_sequences`); `tok.word_index` itself still lists every word seen during fitting.

---

### **Key Features of the `Tokenizer`**
1. **Builds a word index**: Maps each unique word in the text corpus to a unique integer index.
2. **Filters out rare words**: When converting text to sequences, keeps only the most frequent `max_features - 1` words.
3. **Tokenizes text**: Converts sequences of words into sequences of integers based on the word index.
4. **Handles out-of-vocabulary words**: Words outside that limit are replaced with a special token only if `oov_token` is set; otherwise (as in this notebook) they are silently dropped.

---

### **Purpose**
This is typically the first step in processing text data for machine learning tasks like sentiment analysis, text classification, or language modeling. By tokenizing text into numerical sequences, the data becomes compatible with numerical models, such as neural networks.

In [86]:
# Keras text tokenizer capped via num_words=max_features; no oov_token is passed here.
tok = tf.keras.preprocessing.text.Tokenizer(num_words = max_features)

In [87]:
# Build the tokenizer's word index from the punctuation-free text column.
cleaned_texts = train_df['sentence_no_punctuation'].tolist()
tok.fit_on_texts(cleaned_texts)

### Explanation of the Code

#### **Code:**
```python
print(len(tok.word_index))
vocab_size = len(tok.word_index) + 1
```

---

### **Step-by-Step Explanation**

1. **`len(tok.word_index)`**
   - The `tok.word_index` is a dictionary created by the `Tokenizer` that maps each unique word in the text corpus to a unique integer index.
   - `len(tok.word_index)` returns the total number of unique words (or tokens) identified in the text corpus.

---

2. **`vocab_size = len(tok.word_index) + 1`**
   - The vocabulary size (`vocab_size`) is defined as the total number of unique words **plus one**.
   - Why **`+1`**? 
     - The additional "1" accounts for the **reserved index 0**, which is often used in deep learning models to represent padding tokens (for aligning sequences of different lengths).

---

### **Purpose**
- Printing `len(tok.word_index)` helps understand the total vocabulary size identified by the tokenizer.
- Adding 1 ensures the vocabulary size includes a reserved index for padding, which is commonly required in neural networks for sequence processing.

In [88]:
# Count the distinct words the tokenizer indexed, report it, and derive the
# vocabulary size with one extra slot for index 0.
num_unique_words = len(tok.word_index)
print(num_unique_words)
vocab_size = num_unique_words + 1

656158
