# Data Preprocessing - Sentiment Analysis - Big Richard Club

### Imports

In [25]:
import pandas as pd
import numpy as np
import re 
import nltk 
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import collections

from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches

# Apply the ggplot style sheet to the matplotlib figures drawn in this notebook.
plt.style.use('ggplot')
# Render figures inline, below the cell that produces them.
%matplotlib inline
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the session.
pd.options.mode.chained_assignment = None  # default='warn'

## I. Import Data

In [26]:
# Both files are decoded as Latin-1. The "test" CSV is the one this notebook
# uses as its validation set.
df_train, df_val = [
    pd.read_csv(csv_path, encoding='latin_1')
    for csv_path in ('data/Corona_NLP_train.csv', 'data/Corona_NLP_test.csv')
]

In [27]:
# Preview the first rows of the raw training data.
df_train.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [28]:
# Preview the first rows of the raw validation data.
df_val.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [29]:
# Keep only the tweet text and its sentiment label.
# `.copy()` gives each frame its own data, so the column assignments made in
# the cleaning cells below do not operate on a slice of df_train / df_val
# (the situation pandas reports as SettingWithCopyWarning).
training = df_train[['OriginalTweet', 'Sentiment']].copy()
validation = df_val[['OriginalTweet', 'Sentiment']].copy()

## II. Checking for Missing Values

In [30]:
# Number of missing values in each column of the training subset.
training.isnull().sum()

OriginalTweet    0
Sentiment        0
dtype: int64

In [31]:
# Number of missing values in each column of the validation subset.
validation.isnull().sum()

OriginalTweet    0
Sentiment        0
dtype: int64

## III. Cleaning the Tweets

In [32]:
# Rename the columns to the generic names used by every later cell.
# Reassigning the result (instead of `inplace=True`) avoids mutating a frame
# that was derived from df_train / df_val and keeps the cell safe to re-run.
column_names = {"OriginalTweet": "text", "Sentiment": "label"}
training = training.rename(columns=column_names)
validation = validation.rename(columns=column_names)

#### From 5 classes to 3

In [33]:
# Data has 5 classes, let's convert them to 3

def classes_def(x):
    '''
    Collapse the five sentiment labels into three string codes.

    "Extremely Negative" / "Negative" -> "0"
    "Extremely Positive" / "Positive" -> "2"
    anything else                     -> "1"
    '''
    if x in ("Extremely Negative", "Negative"):
        return "0"
    if x in ("Extremely Positive", "Positive"):
        return "2"
    return "1"

# Map the original labels to the three string codes in both frames.
training['label'] = training['label'].apply(classes_def)
validation['label'] = validation['label'].apply(classes_def)

# Relative frequency of each class in the training set.
training['label'].value_counts(normalize=True)

2    0.438467
0    0.374128
1    0.187404
Name: label, dtype: float64

#### Strip noise from the tweet text (URLs, HTML tags, mentions, hashtags, digits, punctuation, stopwords)

In [34]:
def remove_urls(text):
    '''Return `text` with http(s):// links and www. addresses deleted.'''
    return re.sub(r'https?://\S+|www\.\S+', r'', text)
    
# Drop URLs from the tweets of both frames.
training['text'] = training['text'].apply(remove_urls)
validation['text'] = validation['text'].apply(remove_urls)

In [35]:
def remove_html(text):
    '''Delete every "<...>" span (shortest match) from `text`.'''
    return re.sub(r'<.*?>', r'', text)

# Strip HTML tags from the tweets of both frames.
training['text'] = training['text'].apply(remove_html)
validation['text'] = validation['text'].apply(remove_html)

In [36]:
# Remove Mentions "@"

def remove_mention(x):
    '''Strip mentions ("@" followed by word characters) from `x`.'''
    mention = re.compile(r'@\w+')
    return mention.sub('', x)

# Remove @mentions from the tweets of both frames.
training['text'] = training['text'].apply(remove_mention)
validation['text'] = validation['text'].apply(remove_mention)

In [37]:
# Remove Hashtags

def remove_hash(x):
    '''Drop hashtags from `x`: the "#" together with the word characters after it.'''
    hashtag = re.compile(r'#\w+')
    return hashtag.sub('', x)

# Remove hashtags from the tweets of both frames.
training['text'] = training['text'].apply(remove_hash)
validation['text'] = validation['text'].apply(remove_hash)

In [38]:
# Lower Casing

def lower(text):
    '''Return a lower-cased copy of `text`.'''
    return text.lower()

# Lower-case the tweets of both frames.
training['text'] = training['text'].apply(lower)
validation['text'] = validation['text'].apply(lower)

In [39]:
# Remove Numbers

def remove_num(text):
    '''Delete every run of digits from `text`.'''
    digits = re.compile(r'\d+')
    return digits.sub('', text)

# Remove digits from the tweets of both frames.
training['text'] = training['text'].apply(remove_num)
validation['text'] = validation['text'].apply(remove_num)

In [40]:
# Remove Punctuation

def punct_remove(text):
    '''
    Remove punctuation from `text`.

    Every character that is neither a word character nor whitespace is
    deleted. Underscores are deleted explicitly: `\\w` matches "_", so a
    plain `[^\\w\\s]` class would leave them in the text.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        `text` with punctuation (including "_") removed; letters, digits
        and whitespace are kept as they are.
    '''
    return re.sub(r"[^\w\s]|_", "", text)

# Strip punctuation from the tweets of both frames.
training['text'] = training['text'].apply(punct_remove)
validation['text'] = validation['text'].apply(punct_remove)

In [41]:
# Remove Stopwords

nltk.download('stopwords')

# Punctuation is stripped from the tweets before this cell runs, so a
# contraction such as "don't" reaches this step as "dont". The apostrophe-free
# spelling of every stopword is added so that those tokens are filtered too.
_nltk_stopwords = stopwords.words('english')
STOPWORDS = set(_nltk_stopwords) | {word.replace("'", "") for word in _nltk_stopwords}

def remove_stopwords(text):
    '''
    Return `text` without the whitespace-separated tokens found in STOPWORDS.

    `text` is converted with `str()` first; the remaining tokens are joined
    back together with single spaces.
    '''
    return " ".join(word for word in str(text).split() if word not in STOPWORDS)

# Filter the stopwords out of the tweets of both frames.
training['text'] = training['text'].apply(remove_stopwords)
validation['text'] = validation['text'].apply(remove_stopwords)

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [42]:
# Collapse the extra whitespace left behind by the earlier removal steps

def remove_space(text):
    '''Collapse each run of whitespace to one space and trim both ends of `text`.'''
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

# Normalise the whitespace of the tweets in both frames.
training['text'] = training['text'].apply(remove_space)
validation['text'] = validation['text'].apply(remove_space)

In [43]:
# Hand-picked extra stopwords, filtered in addition to the NLTK list.
more_stop_words = [
    'a', 'and', 'the', 'i', 'me', 'my', 'we', 'ours', 'he', 'his', 'her',
    'what', 'am', 'have', 'has', 'had', 'be', 'was', 'been', 'of', 'at',
    'for', 'to', 'your', 'is',
]

def remove_more_stopwords(text):
    '''Return `text` (converted with str()) without the tokens listed in more_stop_words.'''
    kept = []
    for token in str(text).split():
        if token in more_stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

# Filter the extra stopwords out of the tweets of both frames.
training['text'] = training['text'].apply(remove_more_stopwords)
validation['text'] = validation['text'].apply(remove_more_stopwords)

In [44]:
# Tokenize

def tokenize(text):
    '''Split `text` on whitespace and return the resulting list of tokens.'''
    return text.split()

# Turn each tweet into a list of whitespace-separated tokens.
training['text'] = training['text'].apply(tokenize)
validation['text'] = validation['text'].apply(tokenize)

In [45]:
def reverse_tokenize_sentence(df, colname):
    '''
    Join the token lists stored in `df[colname]` back into space-separated strings.

    The column is overwritten on `df` itself, and that same `df` is returned.
    '''
    joined = df[colname].map(lambda tokens: ' '.join(tokens))
    df[colname] = joined
    return df

# Convert the token lists back into plain strings for the vectorizer.
training = reverse_tokenize_sentence(df=training, colname="text")
validation = reverse_tokenize_sentence(df=validation, colname="text")

## IV. TFIDF

In [46]:
# (rows, columns) of the validation frame.
validation.shape

(3798, 2)

In [23]:
# Features are the cleaned tweet text; targets are the class codes.
x_train, y_train = training['text'], training['label']
x_val, y_val = validation['text'], validation['label']

In [47]:
# Size of the TF-IDF feature space. `max_features` caps the vocabulary at
# this many terms; it does not limit the number of tweets.
number_of_dimensions = 1000

tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),  # single words only
    max_features=number_of_dimensions,
    # An integer max_df is an absolute document count, so this threshold only
    # filters terms in corpora with more than 10 million documents.
    max_df=10000000,
    min_df=1,
)

# Learn the vocabulary and IDF weights from the training tweets only.
tfidf_vectorizer.fit(x_train)

TfidfVectorizer(max_df=10000000, max_features=1000)

In [53]:
# Project both text sets onto the fitted vocabulary and densify the result.
vectorized_xtrain, vectorized_xval = [
    tfidf_vectorizer.transform(texts).toarray()
    for texts in (x_train, x_val)
]

# RUN ONLY ONCE
# np.savetxt("data/tfidf_x_train.txt.gz", vectorized_xtrain) # Save into a file / .gz compresses the file
# np.savetxt("data/tfidf_x_val.txt.gz", vectorized_xval) # Save into a file / .gz compresses the file