# Semtiment Analysis Data Processing

### Import packages

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

import time

### Load dataset
downloaded from https://www.kaggle.com/paoloripamonti/twitter-sentiment-analysis/data

In [2]:
sentiment_df = pd.read_csv(
    "database/sentiment_training.zip",
    header=None,
    names=["emotion", "text"],
    usecols=[0, 5],
    encoding = "ISO-8859-1"
)

sentiment_df["emotion"] = sentiment_df["emotion"].apply(lambda x: x // 2)
sentiment_df = sentiment_df[["text", "emotion"]]

### Train Test Split

I am thinking about a 10-fold/20-fold cross validation, so I did not explicitly generate a validation set.

In [3]:
X = sentiment_df[["text"]]
y = sentiment_df["emotion"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

### Special accomodations vectorizer cannot do (Runtime: ~3min)

In [4]:
def tweets_special_clean(df):
    def clean(text):
        text = re.compile("(@[\w]+|https?://[\w\./]+|)").sub("", text) # Remove @ and urls, keep hashtags
        text = re.compile("[^\w]+ ").sub(" ", text) # Remove puncutations
        return " ".join([lemmatizer.lemmatize(word) for word in text.split(" ")]) # Lemmatizer
    
    lemmatizer = WordNetLemmatizer()
    return df.assign(text=df["text"].apply(clean))

In [5]:
X_train = tweets_special_clean(X_train)
X_test = tweets_special_clean(X_test)

### Vectorizer (Runtime: ~30sec)

In [6]:
vectorizer = CountVectorizer(
    strip_accents="unicode",
    stop_words=set(stopwords.words('english')),
    ngram_range=(1,3),
    min_df=500,
    binary=True
)

In [11]:
mat_X_train = vectorizer.fit_transform(X_train["text"])
mat_X_test = vectorizer.transform(X_test["text"])
column_names = vectorizer.get_feature_names()

In [14]:
mat_X_train = mat_X_train.loc[:10000]
y_train = y_train.loc[:10000]
mat_X_test = mat_X_test.loc[:10000]
y_test = y_test.loc[:10000]

AttributeError: loc not found

In [9]:
np.savetxt('X_train.csv', mat_X_train.toarray(), delimiter=",")
np.savetxt('X_test.csv', mat_X_test.toarray(), delimiter=",")
np.savetxt('y_train.csv', np.array(y_train), delimiter=",")
np.savetxt('y_test.csv', np.array(y_test), delimiter=",")

MemoryError: Unable to allocate 30.9 GiB for an array with shape (1440000, 2882) and data type int64