# Semtiment Analysis Model

### Import packages

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

import time

### Load dataset
downloaded from https://www.kaggle.com/paoloripamonti/twitter-sentiment-analysis/data

In [2]:
sentiment_df = pd.read_csv(
    "database/sentiment_training.zip",
    header=None,
    names=["emotion", "text"],
    usecols=[0, 5],
    encoding = "ISO-8859-1"
)

sentiment_df["emotion"] = sentiment_df["emotion"].apply(lambda x: x // 2)
sentiment_df = sentiment_df[["text", "emotion"]]

### Special accomodations vectorizer cannot do (Runtime: ~3min)

In [5]:
def tweets_special_clean(df):
    def clean(text):
        text = re.compile("(@[\w]+|https?://[\w\./]+|)").sub("", text) # Remove @ and urls, keep hashtags
        text = re.compile("[^\w]+ ").sub(" ", text) # Remove puncutations
        return " ".join([lemmatizer.lemmatize(word) for word in text.split(" ")]) # Lemmatizer
    
    lemmatizer = WordNetLemmatizer()
    return df.assign(text=df["text"].apply(clean))

In [6]:
df = sentiment_df
df = tweets_special_clean(df)

### Vectorizer (Runtime: ~30sec)

In [12]:
vectorizer = CountVectorizer(
    strip_accents="unicode",
    stop_words=set(stopwords.words('english')),
    ngram_range=(1,3),
    min_df=500,
    binary=True
)

In [13]:
X = vectorizer.fit_transform(df["text"])

In [14]:
vectorizer.get_feature_names()

['00',
 '000',
 '09',
 '10',
 '100',
 '100 follower',
 '100 follower day',
 '1000',
 '11',
 '12',
 '12 hour',
 '13',
 '14',
 '140',
 '15',
 '16',
 '17',
 '18',
 '19',
 '1st',
 '20',
 '200',
 '2009',
 '21',
 '22',
 '23',
 '24',
 '24 hour',
 '25',
 '26',
 '2day',
 '2moro',
 '2morrow',
 '2nd',
 '2nite',
 '30',
 '30am',
 '33',
 '360',
 '3am',
 '3d',
 '3g',
 '3gs',
 '3rd',
 '40',
 '45',
 '4am',
 '4th',
 '50',
 '500',
 '5am',
 '5th',
 '60',
 '6am',
 '70',
 '7am',
 '80',
 '8am',
 '90',
 '99',
 'able',
 'absolutely',
 'abt',
 'ac',
 'access',
 'accident',
 'according',
 'account',
 'ache',
 'across',
 'act',
 'acting',
 'action',
 'actual',
 'actually',
 'ad',
 'adam',
 'add',
 'add everyone',
 'add everyone train',
 'added',
 'addicted',
 'adding',
 'address',
 'admit',
 'adorable',
 'adventure',
 'advice',
 'afford',
 'afraid',
 'afternoon',
 'age',
 'ago',
 'agree',
 'agreed',
 'ah',
 'ah well',
 'aha',
 'ahaha',
 'ahead',
 'ahh',
 'ahhh',
 'ahhhh',
 'aim',
 'aint',
 'air',
 'air france',
 

In [15]:
X

<1600000x3156 sparse matrix of type '<class 'numpy.int64'>'
	with 9383891 stored elements in Compressed Sparse Row format>

### Merge things back

In [16]:
pd.DataFrame(data=X.toarray(), columns=vectorizer.get_feature_names())

MemoryError: Unable to allocate 37.6 GiB for an array with shape (1600000, 3156) and data type int64