In [1]:
# Max Todd
# Data preprocessing

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

nltk.download('stopwords')
nltk.download('punkt')




True

Explore the Dataset

In [2]:
# Load the dataset
trainDF = pd.read_csv('./data/original/train.csv', encoding='unicode_escape')
testDF = pd.read_csv('./data/original/train.csv', encoding='unicode_escape')

# Preview the dataset
trainDF.head()




       textID  ... Density (P/Km²)
0  cb774db0d1  ...              60
1  549e992a42  ...             105
2  088c60f138  ...              18
3  9642c003ef  ...             164
4  358bd9e861  ...              26

[5 rows x 10 columns]

In [3]:
# Print dataset information
print('Training:')
trainDF.info()
print('\n')
print(f'Test same schema, length = {len(testDF[list(testDF.columns)[0]])}')

Training:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   textID            27481 non-null  object 
 1   text              27480 non-null  object 
 2   selected_text     27480 non-null  object 
 3   sentiment         27481 non-null  object 
 4   Time of Tweet     27481 non-null  object 
 5   Age of User       27481 non-null  object 
 6   Country           27481 non-null  object 
 7   Population -2020  27481 non-null  int64  
 8   Land Area (Km²)   27481 non-null  float64
 9   Density (P/Km²)   27481 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 2.1+ MB


Test same schema, length = 27481


Modify the datset

In [None]:
def removeStopWords(text):
    '''
    Remove stop words from the text to reduce bias for them
    ex. "is", "the", "and", etc.
    '''

    # error check - return empty string on non string
    if type(text) is not str:
        return ''

    # remove stop words
    noStopWords = ''
    words = str(text).split()
    for word in words:
        if word not in stopwords.words('english'):
            noStopWords += word + ' '

    return noStopWords.strip()


def normalizeText(text):
    '''
    Remove unecessary characters and normalize to reduce variety
    as much as possible across different messages
    '''
    text = re.sub(r'<.*?>', '', str(text))
    text = re.sub(r'[^a-zA-Z0-9\s]', '', str(text))
    text = re.sub(r'\s+', ' ', str(text)).lower()
    text = re.sub(r'[^\w\s]', '', str(text))
    text = re.sub(r'\s+', ' ', str(text)).strip()
    return text


def tokenizeText(text):
    '''
    Tokenize the text
    '''
    tokens = word_tokenize(str(text))
    return tokens

In [None]:
def loadDataset(path, textColumnName):
    '''
    Load the dataset and return a new dataframe containing the
    preprocessed text data

    Parameters
    ----------
    path: str
     path of the CSV file to load the dataset from
    textColumnName: str
     name of the column to perfrom the text preprocessing to

    Returns
    -------
    pandas.Dataframe
     dataframe of the loaded data from csv file inputted and preprocessed text
    '''

    # Load df
    df = pd.read_csv(path, encoding='unicode_escape')

    # Drop nil / null values
    df.dropna(inplace=True)

    # Preprocess the data with NLP
    print('Normalizing...')
    df['normalizedText'] = df[textColumnName].apply(normalizeText)
    print('Removing stop words...')
    df['normalizeNoStop'] = df['normalizedText'].apply(removeStopWords)
    print('Tokenizing...')
    df['tokenized'] = df['normalizeNoStop'].apply(tokenizeText)

    return df

You can save the dataset obtained from the above function to use for models,
or continue from here.

Normalizing...
Removing stop words...
Tokenizing...
