In [1]:


import warnings
warnings.filterwarnings("ignore")

%pip install -r requirements.txt
%pip install -U pip setuptools wheel
%pip install -U "spacy[apple]"
!python3 -m spacy download en_core_web_sm




Note: you may need to restart the kernel to use updated packages.
Collecting setuptools
  Using cached setuptools-80.9.0-py3-none-any.whl.metadata (6.6 kB)
Using cached setuptools-80.9.0-py3-none-any.whl (1.2 MB)
Installing collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 58.0.4
    Uninstalling setuptools-58.0.4:
      Successfully uninstalled setuptools-58.0.4
Successfully installed setuptools-80.9.0
Note: you may need to restart the kernel to use updated packages.
Collecting spacy[apple]
  Using cached spacy-3.8.11-cp39-cp39-macosx_10_9_universal2.whl
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy[apple])
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy[apple])
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy[apple])
  Using cached murmurhash-1.0.15-cp39-cp39-macosx_11_0_arm64

In [3]:

import pandas as pd
import numpy as np

import tensorflow as tf

import re
import spacy

from tqdm import tqdm
tqdm.pandas()



# Part 1 - Data Processing

## 1.1 Loading the dataset & 1.2 Data Cleaning

In [4]:

def load_and_setup_data() -> pd.DataFrame:
    df = pd.read_csv("twitter_training.csv")    
    # First column appers to be some sort file or sequence number and the second appears to be the source
    # Those two columns do not have any impact on sentiment analysis
    # Take the last two columns
    df = df.iloc[:, -2:]
    # Swap columns 1 and 2
    df = df[[df.columns[1], df.columns[0]]]
    # Setup column names
    df.columns = ["tweet", "sentiment"]
    # Remove empty rows
    df = df.dropna()
    # Remove diuplicate rows
    df = df.drop_duplicates()
    # Look for tweets where the same tweet is classified as a different sentiment
    # Take the first occurence - this will get us clean data and will not mislead the classifier later during training
    df = df.drop_duplicates(subset=["tweet"], keep="first")
    return df

df = load_and_setup_data()


# Just keep the essentials, remove named entities, parsing and sentence segmentation for speed
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "senter"])


def clean_and_pre_process(text: str) -> str:    
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    # Remove user @ references and '#' from hashtags
    text = re.sub(r"\@\w+|\#", "", text)
    # Remove special characters and numbers (keep only letters)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Convert to lowercase
    text = text.lower()    
    # Tokenization, Stop Word Removal, and Lemmatization via spaCy
    doc = nlp(text)    
    # Filter out stop words and punctuation, then take the lemma
    cleaned_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]    
    # Join back into a string 
    return " ".join(cleaned_tokens)

df["sanitized_tweet"] = df["tweet"].progress_apply(clean_and_pre_process)

100%|██████████| 69490/69490 [00:47<00:00, 1476.60it/s]


## 1.3 Feature Engineering

In [8]:
def create_tokenized_words(df: pd.DataFrame) -> pd.DataFrame:
    texts = df["sanitized_tweet"].values
    # TF-IDF vectorizer
    tfidf_vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=10000,
        output_mode="tf_idf"
    )
    tfidf_vectorizer.adapt(texts)
    # Convert text to TF-IDF vectors
    tfidf_vectors = tfidf_vectorizer(texts)
    # add the tokenized words as a new column
    df["sanitized_tweet_vector"] = list(tfidf_vectors.numpy())
    return df

df = create_tokenized_words(df)
df.head()

Unnamed: 0,tweet,sentiment,sanitized_tweet,tfidf_vector,sanitized_tweet_vector
0,I am coming to the borders and I will kill you...,Positive,come border kill,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,im getting on borderlands and i will kill you ...,Positive,m get borderland kill,"[0.0, 0.0, 0.0, 0.0, 0.0, 2.6871612, 0.0, 2.91...","[0.0, 0.0, 0.0, 0.0, 0.0, 2.6871612, 0.0, 2.91..."
2,im coming on borderlands and i will murder you...,Positive,m come borderland murder,"[0.0, 0.0, 0.0, 0.0, 0.0, 2.6871612, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 2.6871612, 0.0, 0.0,..."
3,im getting on borderlands 2 and i will murder ...,Positive,m get borderland murder,"[0.0, 0.0, 0.0, 0.0, 0.0, 2.6871612, 0.0, 2.91...","[0.0, 0.0, 0.0, 0.0, 0.0, 2.6871612, 0.0, 2.91..."
4,im getting into borderlands and i can murder y...,Positive,m get borderland murder,"[0.0, 0.0, 0.0, 0.0, 0.0, 2.6871612, 0.0, 2.91...","[0.0, 0.0, 0.0, 0.0, 0.0, 2.6871612, 0.0, 2.91..."
