# Data Preprocessing - Sentiment Analysis - Big Richard Club

### Imports

In [None]:
import pandas as pd
import numpy as np
import re 
import nltk 
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import collections

from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches

# Apply the 'ggplot' style sheet to every matplotlib/seaborn figure drawn below.
plt.style.use('ggplot')
# IPython magic: render figures inline in the notebook output.
%matplotlib inline
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None  # default='warn'

## I. Import Data

In [None]:
# Load both tweet datasets with the same latin_1 decoding; the file named
# "test" is used as the validation set throughout this notebook.
df_train, df_val = (
    pd.read_csv(csv_path, encoding='latin_1')
    for csv_path in ('data/Corona_NLP_train.csv', 'data/Corona_NLP_test.csv')
)

In [None]:
# Preview the first rows of the raw training frame.
df_train.head()

In [None]:
# Preview the first rows of the raw validation frame.
df_val.head()

In [None]:
# Keep only the tweet text and its sentiment label. Explicit copies make the
# later in-place rename and column assignments act on independent frames
# instead of on slices of df_train / df_val.
training = df_train[['OriginalTweet', 'Sentiment']].copy()
validation = df_val[['OriginalTweet', 'Sentiment']].copy()

#### Checking for Missing Values 

In [None]:
# Number of missing values in each column of the training frame.
training.isnull().sum()

In [None]:
# Number of missing values in each column of the validation frame.
validation.isnull().sum()

## II. Descriptive Statistics

In [None]:
# Count plot of the training rows per 'Sentiment' value.
sns.catplot(x = 'Sentiment', kind = 'count', data = training, height = 5, aspect = 2)

In [None]:
# Count plot of the validation rows per 'Sentiment' value.
sns.catplot(x = 'Sentiment', kind = 'count', data = validation, height = 5, aspect = 2)

## III. Cleaning the Comments

In [None]:
# Give both frames the short column names used by the rest of the notebook.
for frame in (training, validation):
    frame.rename(columns={"OriginalTweet": "text", "Sentiment": "label"}, inplace=True)

#### From 5 classes to 3

In [None]:
# Data has 5 classes, let's convert them to 3

def classes_def(x):
    '''
    Collapse the five sentiment labels into three string codes.

    "Positive" / "Extremely Positive" -> "2"
    "Negative" / "Extremely Negative" -> "0"
    any other value                   -> "1"
    '''
    if x in ("Extremely Positive", "Positive"):
        return "2"
    if x in ("Extremely Negative", "Negative"):
        return "0"
    # Everything that is not explicitly positive or negative falls through here.
    return "1"

# Replace the 5-level labels with the 3-level string codes in both frames.
for frame in (training, validation):
    frame['label'] = frame['label'].apply(classes_def)

# Relative frequency of each collapsed class in the training set.
training['label'].value_counts(normalize=True)

#### Remove useless characters

In [None]:
def remove_urls(text):
    """Return `text` with every http(s):// or www. link deleted.

    A link runs up to the next whitespace character; the surrounding
    whitespace is left as it is.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)
    
# Strip links from the tweets of both frames.
for frame in (training, validation):
    frame['text'] = frame['text'].apply(remove_urls)

In [None]:
def remove_html(text):
    """Return `text` without anything enclosed in '<' ... '>'.

    The match is non-greedy and does not cross line breaks.
    """
    return re.sub(r'<.*?>', '', text)

# Strip HTML-like tags from the tweets of both frames.
for frame in (training, validation):
    frame['text'] = frame['text'].apply(remove_html)

In [None]:
# Lower Casing

def lower(text):
    """Return a lower-cased copy of `text`."""
    return text.lower()

# Lower-case the tweets of both frames.
for frame in (training, validation):
    frame['text'] = frame['text'].apply(lower)

In [None]:
# Remove Numbers

def remove_num(text):
    """Return `text` with every run of digits deleted."""
    return re.sub(r'\d+', '', text)

# Drop digits from the tweets of both frames.
for frame in (training, validation):
    frame['text'] = frame['text'].apply(remove_num)

In [None]:
# Remove Punctuation

def punct_remove(text):
    """Return `text` keeping only word characters and whitespace.

    Everything else (punctuation and symbols such as '@' and '#') is deleted;
    the underscore counts as a word character and therefore survives.
    """
    return re.sub(r"[^\w\s\d]", "", text)

# Strip punctuation from the tweets of both frames.
for frame in (training, validation):
    frame['text'] = frame['text'].apply(punct_remove)

In [None]:
# Remove Stopwords

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
# Keep the English stop words in a set so membership tests in
# remove_stopwords are constant-time.
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` without the whitespace-separated words found in STOPWORDS.

    The input is passed through str() first and the remaining words are
    re-joined with single spaces.
    """
    kept = (token for token in str(text).split() if token not in STOPWORDS)
    return " ".join(kept)

# Drop NLTK stop words from the tweets of both frames.
for frame in (training, validation):
    frame['text'] = frame['text'].apply(remove_stopwords)

In [None]:
# Remove Mentions "@"

def remove_mention(x):
    """Return `x` with every '@' followed by word characters deleted.

    NOTE(review): punct_remove is applied earlier in this notebook and already
    strips '@', so by the time this runs the pattern has nothing left to match
    and the user names stay in the text. Consider running this step first.
    """
    return re.sub(r'@\w+', '', x)

# Apply the mention filter to the tweets of both frames.
for frame in (training, validation):
    frame['text'] = frame['text'].apply(remove_mention)

In [None]:
# Remove Hashtags

def remove_hash(x):
    """Return `x` with every '#' followed by word characters deleted.

    NOTE(review): punct_remove is applied earlier in this notebook and already
    strips '#', so by the time this runs the pattern has nothing left to match
    and the hashtag words stay in the text. Consider running this step first.
    """
    return re.sub(r'#\w+', '', x)

# Apply the hashtag filter to the tweets of both frames.
for frame in (training, validation):
    frame['text'] = frame['text'].apply(remove_hash)

In [None]:
# Remove the extra whitespace left behind by the previous cleaning steps

def remove_space(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

# Normalise whitespace in the tweets of both frames.
for frame in (training, validation):
    frame['text'] = frame['text'].apply(remove_space)

In [None]:
# Hand-picked extra stop words, filtered out in a second pass.
# NOTE(review): these look like entries NLTK's English list already contains -
# confirm whether this second pass removes anything.
more_stop_words = [
    'a', 'and', 'the', 'i', 'me', 'my', 'we', 'ours', 'he', 'his', 'her',
    'what', 'am', 'have', 'has', 'had', 'be', 'was', 'been', 'of', 'at',
    'for', 'to', 'your', 'is',
]


def remove_more_stopwords(text):
    """Return `text` without the words listed in more_stop_words.

    The input is passed through str(), split on whitespace, filtered with an
    exact (case-sensitive) comparison and re-joined with single spaces.
    """
    kept = (token for token in str(text).split() if token not in more_stop_words)
    return " ".join(kept)

# Drop the extra stop words from the tweets of both frames.
for frame in (training, validation):
    frame['text'] = frame['text'].apply(remove_more_stopwords)

In [None]:
# Tokenize

def tokenize(text):
    """Split `text` on whitespace and return the resulting list of tokens."""
    return text.split()

# From here on each 'text' cell holds a list of tokens instead of a string.
for frame in (training, validation):
    frame['text'] = frame['text'].apply(tokenize)

## IV. Bag of Words Representation

In [None]:
# (rows, columns) of the cleaned validation frame.
validation.shape

In [None]:
# Separate the token lists (features) from the class codes (targets).
x_train, y_train = training['text'], training['label']
x_val, y_val = validation['text'], validation['label']

In [None]:
# Cast the validation labels (the string codes produced by classes_def)
# to a float NumPy array, then display it.
y_val_array = np.asarray(y_val.values).astype("float")
y_val_array

# RUN ONLY ONCE
# np.savetxt("data/y_val.txt.gz", y_val_array) # Save into a file / .gz compresses the file

In [None]:
# Cast the training labels (the string codes produced by classes_def)
# to a float NumPy array, then display it.
y_train_array = np.asarray(y_train.values).astype("float")
y_train_array

# RUN ONLY ONCE
# np.savetxt("data/y_train.txt.gz", y_train_array) # Save into a file / .gz compresses the file

### A) Vocabulary

In [None]:
# Count how often each token occurs across all training tweets.
vocabulary = collections.Counter()
len_x_train = len(x_train)

# Iterate over the Series values directly: this visits every tweet in order
# without depending on the index labels being exactly 0..len-1.
for words in x_train:
    vocabulary.update(words)

# RUN ONLY ONCE
    
# Save vocabulary in "lesser_vocabulary.txt" file
#f = open("data/lesser_vocabulary.txt", "w", encoding="utf-8")

#for word, count in vocabulary.most_common(1000): # 1000 most common words
#    print(word, file=f)
#f.close()

In [None]:
def load_vocabulary(filename):
    """Read a whitespace-separated word list and map each word to its position.

    Parameters
    ----------
    filename : path of a UTF-8 text file containing the vocabulary words.

    Returns
    -------
    dict mapping word -> zero-based position in the file. If a word occurs
    more than once, its last position is the one that is kept.
    """
    # The context manager closes the file even if reading raises.
    with open(filename, encoding="utf-8") as f:
        words = f.read().split()

    # Create index for each word
    return {word: index for index, word in enumerate(words)}

In [None]:
# Rebind `vocabulary` (until now a Counter) to the word -> index dict read from disk.
vocabulary = load_vocabulary("data/lesser_vocabulary.txt")
# Peek at the first five (word, index) pairs.
list(vocabulary.items())[:5]

### B) Comment as a BoW

#### Example of a BoW for the 3rd comment

In [None]:
# Token list of the training tweet stored under index label 2.
x_train[2]

In [None]:
words = x_train[2]

# One counter slot per vocabulary word; tokens outside the vocabulary are skipped.
bow = np.zeros(len(vocabulary))
for token in words:
    slot = vocabulary.get(token)
    if slot is not None:  # slot 0 is a valid position, so compare against None
        bow[slot] += 1

In [None]:
def read_comment_bow(comment, voc, data=None):
    """Build the bag-of-words count vector for one tokenised comment.

    Parameters
    ----------
    comment : key used to look the comment up in `data` (a row label or
        position), not the comment text itself.
    voc : dict mapping word -> column index of the output vector.
    data : indexable collection of token lists. When omitted, the
        module-level `x_train` is used, which is the original behaviour.

    Returns
    -------
    numpy array of length len(voc) holding how often each vocabulary word
    occurs in the comment; words missing from `voc` are ignored.
    """
    if data is None:
        data = x_train
    words = data[comment]

    # Bag of Words
    bow = np.zeros(len(voc))
    for word in words:
        index = voc.get(word)
        if index is not None:  # index 0 is valid, so test against None
            bow[index] += 1

    return bow

#### Training Data

In [None]:
# One bag-of-words row per training tweet (read_comment_bow looks the tweet up
# in x_train by default), stacked into a 2-D array.
comments = [
    read_comment_bow(comment=row, voc=vocabulary)
    for row in range(len(x_train))
]

x_train_bow = np.stack(comments)

# RUN ONLY ONCE
# np.savetxt("data/lesser_x_train_bow.txt.gz", x_train_bow) # Save into a file / .gz compresses the file

#### Testing Data

In [None]:
val_comments = []

# Build the vectors straight from x_val. Calling read_comment_bow(comment=i, ...)
# here would look row i up in x_train, so the "validation" matrix would really
# be the bag-of-words of the first len(x_val) training tweets.
for words in x_val:
    val_bow = np.zeros(len(vocabulary))
    for word in words:
        if word in vocabulary:
            val_bow[vocabulary[word]] += 1

    val_comments.append(val_bow)

x_val_bow = np.stack(val_comments)

# RUN ONLY ONCE
# np.savetxt("data/lesser_x_val_bow.txt.gz", x_val_bow) # Save into a file / .gz compresses the file