## Kaggle Competition ##  
Lisa Pink and Miguel Novo Villar - DSCC465: Introduction to Statistical Machine Learning 

Packages

In [2]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt    

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
#from nltk.stem.porter import PorterStemmer

# Fetch the NLTK resources used further down in this notebook.
nltk.download('wordnet')  # lexical database for WordNetLemmatizer
nltk.download('punkt')  # tokenizer models for nltk.word_tokenize
nltk.download('stopwords')  # word lists read by stopwords.words('english')
nltk.download("averaged_perceptron_tagger")  # tagger model for nltk.pos_tag

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
np.random.seed(265)

DATA CLEANING

In [4]:
# Cleaning the text.
# Source: https://www.kaggle.com/code/clmentbisaillon/twitter-customer-support-data-cleaning
rare = re.compile(r"\^\S*")       # a caret and everything up to the next whitespace
new_line = re.compile(r"\n+\S*")  # a run of newlines plus the token that follows it
sig = re.compile(r"-\S*")         # a hyphen and everything up to the next whitespace

# Escape sequences left over from the textual repr of a bytes object:
# a literal backslash followed by xNN, n, r or t.
_BYTES_ESCAPE = re.compile(r"\\x[0-9a-fA-F]{2}|\\[nrt]")

# Contraction expansions, applied in this exact order (matching is case-sensitive).
# Source: https://www.kaggle.com/code/gunesevitan/nlp-with-disaster-tweets-eda-cleaning-and-bert/notebook
# Exact duplicate entries of the original chain were dropped: once a pattern has
# been substituted, a second pass over the same pattern cannot match anything.
_CONTRACTIONS = tuple(
    (re.compile(pattern), expansion)
    for pattern, expansion in (
        (r"he's", "he is"),
        (r"there's", "there is"),
        (r"We're", "We are"),
        (r"That's", "That is"),
        (r"won't", "will not"),
        (r"they're", "they are"),
        (r"Can't", "Cannot"),
        (r"wasn't", "was not"),
        (r"don\x89Ûªt", "do not"),
        (r"aren't", "are not"),
        (r"isn't", "is not"),
        (r"What's", "What is"),
        (r"haven't", "have not"),
        (r"hasn't", "has not"),
        (r"There's", "There is"),
        (r"He's", "He is"),
        (r"It's", "It is"),
        (r"You're", "You are"),
        (r"I'M", "I am"),
        (r"shouldn't", "should not"),
        (r"wouldn't", "would not"),
        (r"i'm", "I am"),
        (r"I\x89Ûªm", "I am"),
        (r"I'm", "I am"),
        (r"Isn't", "is not"),
        (r"Here's", "Here is"),
        (r"you've", "you have"),
        (r"you\x89Ûªve", "you have"),
        (r"we're", "we are"),
        (r"what's", "what is"),
        (r"couldn't", "could not"),
        (r"we've", "we have"),
        (r"it\x89Ûªs", "it is"),
        (r"doesn\x89Ûªt", "does not"),
        (r"It\x89Ûªs", "It is"),
        (r"Here\x89Ûªs", "Here is"),
        (r"who's", "who is"),
        (r"I\x89Ûªve", "I have"),
        (r"y'all", "you all"),
        (r"can\x89Ûªt", "cannot"),
        (r"would've", "would have"),
        (r"it'll", "it will"),
        (r"we'll", "we will"),
        (r"wouldn\x89Ûªt", "would not"),
        (r"We've", "We have"),
        (r"he'll", "he will"),
        (r"Y'all", "You all"),
        (r"Weren't", "Were not"),
        (r"Didn't", "Did not"),
        (r"they'll", "they will"),
        (r"they'd", "they would"),
        (r"DON'T", "DO NOT"),
        (r"That\x89Ûªs", "That is"),
        (r"they've", "they have"),
        (r"i'd", "I would"),
        (r"should've", "should have"),
        (r"You\x89Ûªre", "You are"),
        (r"where's", "where is"),
        (r"Don\x89Ûªt", "Do not"),
        (r"we'd", "we would"),
        (r"i'll", "I will"),
        (r"weren't", "were not"),
        (r"They're", "They are"),
        (r"Can\x89Ûªt", "Cannot"),
        (r"you\x89Ûªll", "you will"),
        (r"I\x89Ûªd", "I would"),
        (r"let's", "let us"),
        (r"it's", "it is"),
        (r"can't", "cannot"),
        (r"don't", "do not"),
        (r"you're", "you are"),
        (r"i've", "I have"),
        (r"that's", "that is"),
        (r"doesn't", "does not"),
        (r"didn't", "did not"),
        (r"ain't", "am not"),
        (r"you'll", "you will"),
        (r"I've", "I have"),
        (r"Don't", "do not"),
        (r"I'll", "I will"),
        (r"I'd", "I would"),
        (r"Let's", "Let us"),
        (r"you'd", "You would"),
        (r"Ain't", "am not"),
        (r"Haven't", "Have not"),
        (r"Could've", "Could have"),
        (r"youve", "you have"),
        (r"donå«t", "do not"),
    )
)


def _strip_bytes_wrapper(text):
    """Return ``text`` without a surrounding b'...' or b"..." wrapper, if it has one."""
    if len(text) >= 3 and text[0] == "b" and text[1] in ("'", '"') and text[-1] == text[1]:
        return text[2:-1]
    return text


# Initial preprocessing function
def preprocessor(data, column="full_text"):
    """Clean raw tweet strings into lowercase, letters-only, space-separated text.

    Parameters
    ----------
    data : mapping-like (e.g. pandas.DataFrame or dict)
        Container such that ``data[column]`` is an iterable of strings. Values
        are visited in positional order, so a DataFrame whose index is not
        0..n-1 (filtered, shuffled, ...) is handled correctly.
    column : str, default "full_text"
        Key/column that holds the tweet text.

    Returns
    -------
    list of str
        One cleaned string per input value, in input order.

    Cleaning steps, in order: strip a bytes-repr wrapper if present, drop URLs
    and @mentions, expand contractions, drop a few HTML entities, expand some
    abbreviations, drop retweet markers, drop bytes-repr escape sequences,
    drop punctuation/emoji, apply the module-level ``sig``/``rare``/``new_line``
    patterns, and finally keep only ASCII letters, lowercased and
    whitespace-normalised.
    """
    corpus = []
    for raw_text in data[column]:
        # Only strip the b'...' wrapper when it is really there, so plain text
        # does not lose its first two and last characters.
        tweet = _strip_bytes_wrapper(raw_text)

        # remove urls
        tweet = re.sub(r"http\S+", " ", tweet)

        # remove mentions (whole handle, including digits and underscores, so
        # no fragment of the user name leaks into the corpus)
        tweet = re.sub(r"@\w*", "", tweet)

        # contractions
        for pattern, expansion in _CONTRACTIONS:
            tweet = pattern.sub(expansion, tweet)

        # character entity references
        for entity in ("&gt;", "&lt;", "&amp;"):
            tweet = tweet.replace(entity, "")

        # typos, slang and informal abbreviations; the word boundary keeps
        # "now/then" intact and the trailing space keeps "w/friends" from
        # collapsing into "withfriends"
        tweet = re.sub(r"\bw/e\b", "whatever", tweet)
        tweet = re.sub(r"\bw/", "with ", tweet)
        tweet = tweet.replace("USAgov", "USA government")

        # remove the retweet marker; mentions are already gone at this point,
        # and the word boundary protects words that merely end in "RT"
        tweet = re.sub(r"\bRT\s+", "", tweet)

        # remove bytes-repr escapes (\xNN, \n, \r, \t) while the backslash is
        # still present, instead of deleting every letter "x" afterwards
        tweet = _BYTES_ESCAPE.sub(" ", tweet)

        # remove emoji and any punctuation outside the allowed set
        tweet = re.sub(r"[^\w\s#@/:%.,_-]", "", tweet)

        # NOTE(review): "<", ">" and "^" were just stripped by the step above,
        # so the html-tag and ``rare`` substitutions cannot match any more;
        # they are kept in their original position rather than reordered.
        tweet = re.sub(r"<.*?>", " ", tweet)

        # signatures
        tweet = sig.sub("", tweet)

        # rare
        tweet = rare.sub("", tweet)

        # new line
        tweet = new_line.sub(".", tweet)

        # Digits, remaining punctuation and extra whitespace all become single
        # spaces here: everything that is not an ASCII letter is a separator.
        letters_only = re.sub(r"[^a-zA-Z]", " ", tweet)
        corpus.append(" ".join(letters_only.lower().split()))
    return corpus

#FUNCTIONS APPLIED

def convert_list_of_str_to_list_lists(X):
    """Return a new list in which every element of ``X`` has been passed to ``list()``.

    A string element therefore becomes a list of its characters, and any other
    iterable element becomes a shallow list copy of its items.
    """
    converted = []
    for item in X:
        converted.append(list(item))
    return converted

# Extra tokens filtered out in addition to NLTK's English stop-word list.
_EXTRA_STOP_WORDS = ("amp", "wa", "ta", "ha", "nn", "ie", "ste")
# Built on first use by remove_stopwords() and reused afterwards.
_STOP_WORD_CACHE = None


def remove_stopwords(word_tokens):
    """Return the tokens of ``word_tokens`` that are not stop words.

    A token is dropped when its lowercase form is in NLTK's English stop-word
    list or in ``_EXTRA_STOP_WORDS``. Kept tokens are returned unchanged (their
    original casing is preserved) and in their original order.

    The stop-word collection is built once and cached as a frozenset, so that
    calling this function for every tweet neither rebuilds the list each time
    nor pays for a linear scan on every membership test.
    """
    global _STOP_WORD_CACHE
    if _STOP_WORD_CACHE is None:
        _STOP_WORD_CACHE = frozenset(stopwords.words('english')) | frozenset(_EXTRA_STOP_WORDS)
    return [w for w in word_tokens if w.lower() not in _STOP_WORD_CACHE]

def tokenize_words(X, tweet_tokenizer=True, rmv_stopwords=True):
    """Tokenize every sentence in ``X`` and optionally filter stop words.

    Parameters
    ----------
    X : iterable of str
        Sentences to tokenize.
    tweet_tokenizer : bool, default True
        When true, use ``TweetTokenizer().tokenize``; otherwise use
        ``nltk.word_tokenize``.
    rmv_stopwords : bool, default True
        When true, pass each token list through ``remove_stopwords``.

    Returns
    -------
    list of list
        One token list per sentence, in input order.
    """
    splitter = TweetTokenizer().tokenize if tweet_tokenizer else nltk.word_tokenize
    tokenized = [splitter(text) for text in X]

    if not rmv_stopwords:
        return tokenized
    return [remove_stopwords(sentence_tokens) for sentence_tokens in tokenized]

def pos_tagging_words(X):
    """Run ``nltk.pos_tag`` on every element of ``X``.

    Each result is wrapped in a single-element list, so the return value has
    the shape ``[[tags_for_first], [tags_for_second], ...]``.
    """
    tagged = []
    for tokens in X:
        tagged.append([nltk.pos_tag(tokens)])
    return tagged

def lemmatize_words(X, lemmatizer):
    """Lemmatize every word of every sentence in ``X``.

    Parameters
    ----------
    X : iterable of iterables
        Sentences, each an iterable of words.
    lemmatizer : object
        Anything exposing a ``lemmatize(word)`` method.

    Returns
    -------
    list
        One list per sentence; every lemma is wrapped in its own
        single-element list, e.g. ``[[["run"], ["dog"]], ...]``.
    """
    return [
        [[lemmatizer.lemmatize(token)] for token in tokens]
        for tokens in X
    ]

def prepare_tokens_for_vectorize(X):
    """Join the first item of every entry in each sentence into one string.

    For each sentence in ``X`` the element at index 0 of every entry is taken,
    and those elements are joined with single spaces.

    Returns
    -------
    list of str
        One space-joined string per sentence, in input order.
    """
    return [
        " ".join([entry[0] for entry in sentence_tok])
        for sentence_tok in X
    ]

Load data

In [5]:
# Read the training and test sets from CSV files in the current working directory.
# NOTE(review): the cell output echoes the training read_csv line, which looks like
# the tail of a pandas warning for that call — confirm whether explicit dtypes
# should be passed here.
data_train = pd.read_csv("training_data.csv")
data_test = pd.read_csv("test_data(1).csv")

  data_train = pd.read_csv("training_data.csv")


In [6]:
# Display the training frame (the rendered output is truncated to the first and last rows).
data_train

Unnamed: 0,text,reply_to_screen_name,is_quote,is_retweet,hashtags,country
0,Remember the #WuhanCoronaVirus? The pandemic w...,,False,True,WuhanCoronaVirus KillerCuomo,us
1,My sources @WhiteHouse say 2 tactics will be u...,,False,True,Trump,us
2,I'll venture a wild guess: If you were running...,,False,True,COVID19,us
3,#Pakistan (#GreenStimulus = #Nature protection...,,False,True,Pakistan GreenStimulus Nature Green,us
4,🇺🇸 Pandémie de #coronavirus: 30 pasteurs améri...,,False,True,coronavirus COVID__19 COVIDー19,us
...,...,...,...,...,...,...
239995,"Aa Likes, Retweets yentra 🙏\n🔥🔥🔥\n#Mastеr",,TRUE,TRUE,Mastеr,new_zealand
239996,Very interesting\nAny thoughts?\n\n#TheFive #T...,,FALSE,TRUE,TheFive Trump2020 KAG2020 mondaythoughts COVID...,new_zealand
239997,As we deal with #COVID19 don't forget that #Ch...,,TRUE,TRUE,COVID19 Christians persecution Nigeria,new_zealand
239998,"While we hit 150,000 in #COVID19 deaths, the P...",,FALSE,TRUE,COVID19,new_zealand
