# Data Pre-processing - TFIDF - Sentiment Analysis - Big Richard Club

#### Imports

In [1]:
import pandas as pd
import numpy as np

import re 
import nltk 
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

pd.options.mode.chained_assignment = None  # default='warn'

# I. Import Data

In [2]:
df_train = pd.read_csv('data/Corona_NLP_train.csv' , encoding = 'latin_1')
df_val = pd.read_csv('data/Corona_NLP_test.csv' , encoding = 'latin_1')

In [3]:
df_train.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [4]:
df_val.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [5]:
training = df_train[['OriginalTweet', 'Sentiment']]
validation = df_val[['OriginalTweet', 'Sentiment']]

## A) Checking for Missing Values 

In [6]:
training.isnull().sum()

OriginalTweet    0
Sentiment        0
dtype: int64

In [7]:
validation.isnull().sum()

OriginalTweet    0
Sentiment        0
dtype: int64

## B) Renaming Columns and Reducing the Amount of Classes

In [8]:
training.rename({
    "OriginalTweet": "text",
    "Sentiment": "label"
}, axis="columns", inplace=True)

validation.rename({
    "OriginalTweet": "text",
    "Sentiment": "label"
}, axis="columns", inplace=True)

#### From 5 classes to 3

In [9]:
def classes_def(x):
    '''
    Makes the label variable have 3 classes instead of 5
    '''
    
    if x ==  "Extremely Positive":
        return "2"
    elif x == "Extremely Negative":
        return "0"
    elif x == "Negative":
        return "0"
    elif x ==  "Positive":
        return "2"
    else:
        return "1"

training['label'] = training['label'].apply(lambda x:classes_def(x))
validation['label'] = validation['label'].apply(lambda x:classes_def(x))

training.label.value_counts(normalize=True)

2    0.438467
0    0.374128
1    0.187404
Name: label, dtype: float64

# II. Data Cleaning

In [10]:
def tokenize_sentence(df, colname):
    df[colname] = df[colname].str.split()
    return df

In [11]:
def remove_stop_words(df, colname):
    stop_words = stopwords.words('english')
    df[colname] = df[colname].apply(lambda x: [word for word in x if word not in stop_words])
    return df

In [12]:
def reverse_tokenize_sentence(df, colname):
    df[colname] = df[colname].map(lambda word: ' '.join(word))
    return df

In [13]:
def big_removal(df, column):
    """
    Takes in a string of text, lowercases it and removes the following elements:
    1. URLs
    2. Mentions "@" and Usernames
    3. HTML
    4. Numbers
    5. Punctuation
    6. Hashtags
    Optional : 7. Extra Space
    """
    
    df[column] = df[column].str.lower()
    
    for i in range(len(df)):
        df[column][i] = re.sub(r'https?://\S+|www\.\S+', "", df[column][i]) # urls
        df[column][i] = re.sub(r"@\w+", "", df[column][i]) # mentions
        df[column][i] = re.sub(r"<.*?>", "", df[column][i]) # html
        df[column][i] = re.sub(r"\d+", "", df[column][i]) # numbers
        df[column][i] = re.sub(r"[^\w\s\d]", "", df[column][i]) # punctuation
        df[column][i] = re.sub(r"#\w+", "", df[column][i]) # hashtags
        #df[column][i] = re.sub(r"\s+", "", df[column][i]) # extra space
    
    return df

In [14]:
def text_cleaning(df, colname):
    """
    Takes in a string of text, then performs the following:
    1. Tokenize sentences
    2. Remove all stopwords
    3. convert tokenized text to text
    """
    
    df = (
        df
        .pipe(tokenize_sentence, colname)
        .pipe(remove_stop_words, colname)
        .pipe(reverse_tokenize_sentence, colname)
    )
    
    return df

In [15]:
big_removal(training, "text")

Unnamed: 0,text,label
0,and and,1
1,advice talk to your neighbours family to excha...,2
2,coronavirus australia woolworths to give elder...,2
3,my food stock is not the only one which is emp...,2
4,me ready to go at supermarket during the covid...,0
...,...,...
41152,airline pilots offering to stock supermarket s...,1
41153,response to complaint not provided citing covi...,0
41154,you know itâs getting tough when is rationin...,2
41155,is it wrong that the smell of hand sanitizer i...,1


In [16]:
text_cleaning(training, "text")

Unnamed: 0,text,label
0,,1
1,advice talk neighbours family exchange phone n...,2
2,coronavirus australia woolworths give elderly ...,2
3,food stock one empty please dont panic enough ...,2
4,ready go supermarket covid outbreak im paranoi...,0
...,...,...
41152,airline pilots offering stock supermarket shel...,1
41153,response complaint provided citing covid relat...,0
41154,know itâs getting tough rationing toilet paper...,2
41155,wrong smell hand sanitizer starting turn coron...,1


# III. TFIDF Representation

In [17]:
x_train = training['text']
x_val = validation['text']

In [18]:
# Vocabulary of 1000 words

number_of_dimensions = 1000

tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    max_features=number_of_dimensions,
    max_df=10000000,
    min_df=1
).fit(x_train)

In [19]:
vectorized_xtrain = tfidf_vectorizer.transform(x_train).toarray()
vectorized_xval = tfidf_vectorizer.transform(x_val).toarray()

# RUN ONLY ONCE
# np.savetxt("data/tfidf_x_train.txt.gz", vectorized_xtrain) # Save into a file / .gz compresses the file
# np.savetxt("data/tfidf_x_val.txt.gz", vectorized_xval) # Save into a file / .gz compresses the file

In [20]:
print(
    f"TFIDF x_train shape : {vectorized_xtrain.shape}", "\n"
    f"TFIDF x_val shape : {vectorized_xval.shape}"
)

TFIDF x_train shape : (41157, 1000) 
TFIDF x_val shape : (3798, 1000)
