In [1]:
import h2o
from h2o.estimators.word2vec import H2OWord2vecEstimator
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
import pandas as pd
import numpy as np
import re
import readability
import os
from functools import partial


In [2]:
# Preparations
# Move into the directory that holds slang.txt and the CSV inputs, because the
# later cells open them by relative path. The location can be overridden with the
# FEATURE_EXTRACTION_DATA_DIR environment variable so the notebook is not tied to
# a single machine; the original hard-coded path is kept as the default.
#os.chdir('/home/mackenzie/workspace/PycharmProjects/DAADRISE_AbusiveLangProject/featureExtraction/')
os.chdir(os.environ.get('FEATURE_EXTRACTION_DATA_DIR', 'C:\\Users\\mikec\\Documents'))
# Opening the slang text file for counting number of slang words.
# Each non-blank line is split at its first tab: the text before the tab becomes
# the key and the text after it the value (both stripped of surrounding whitespace).
with open('slang.txt') as file:
    slang_map = {
        slang.strip(): meaning.strip()
        for slang, _, meaning in (line.partition('\t') for line in file if line.strip())
    }
# Longest keys first, so the regex alternation below tries longer entries before
# shorter ones.
slang_words = sorted(slang_map, key=len, reverse=True)
# One alternation of all (escaped) slang keys, matched only at word boundaries.
regex = re.compile(r"\b({})\b".format("|".join(map(re.escape, slang_words))))
# replaceSlang(text) substitutes every matched slang key with its mapped value.
replaceSlang = partial(regex.sub, lambda m: slang_map[m.group(1)])

''' This is for h2o word2vec!!
# Prepare stopwords, rt is refers to ReTweet
STOP_WORDS = set(stopwords.words('english'))
new_stopwords = ['rt'] 
STOP_WORDS = STOP_WORDS.union(new_stopwords)
'''


" This is for h2o word2vec!!\n# Prepare stopwords, rt is refers to ReTweet\nSTOP_WORDS = set(stopwords.words('english'))\nnew_stopwords = ['rt'] \nSTOP_WORDS = STOP_WORDS.union(new_stopwords)\n"

In [3]:
# All Feature Extraction Functions
''' This word2vec includes nan values.
# functions for H2o Word2Vec
def tokenizeFunc(sentences, stop_word = STOP_WORDS):
    df = sentences.as_data_frame()
    df = df.astype(str)
    sentence = h2o.H2OFrame(python_obj=df, column_types=["string"])
    tokenized = sentence.tokenize("\\W+")
    tokenized_lower = tokenized.tolower()
    tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()),:]
    tokenized_words = tokenized_filtered[tokenized_filtered.grep("[0-9]",invert=True,output_logical=True),:]
    tokenized_words = tokenized_words[(tokenized_words.isna()) | (~ tokenized_words.isin(STOP_WORDS)),:]
    return tokenized_words

def h2o_w2vec(data, str):
    print("Break " + str + " into sequence of words")
    words = tokenizeFunc(data)
    print("Build word2vec model for " + str)
    w2v_model = H2OWord2vecEstimator(sent_sample_rate=0.0, epochs=10)
    w2v_model.train(training_frame=words)
    vecs = w2v_model.transform(words, aggregate_method="AVERAGE")
    return vecs
'''

# NGrams Freq function
def ngrams(text, min_n, max_n, str):
    """Build a word n-gram count matrix for a collection of documents.

    A CountVectorizer with ``ngram_range=(min_n, max_n)`` and
    ``max_features=1000`` is fitted on ``text`` itself, so the resulting
    columns depend on the documents passed in.

    Parameters
    ----------
    text : iterable of str
        Documents to vectorize.
    min_n, max_n : int
        Lower and upper bound of the n-gram sizes.
    str : str
        Label used only in the progress message (e.g. 'train' or 'test').

    Returns
    -------
    pandas.DataFrame
        One row per document, one column per n-gram; every column name
        carries the suffix '_nw'.
    """
    print("Completing ngram generation for " + str)
    bv = CountVectorizer(ngram_range=(min_n, max_n), max_features=1000)
    bv_matrix = bv.fit_transform(text).toarray()
    # get_feature_names() was removed from newer scikit-learn releases in favour
    # of get_feature_names_out(); fall back to the old name on older installs.
    if hasattr(bv, 'get_feature_names_out'):
        bv_vocab = bv.get_feature_names_out()
    else:
        bv_vocab = bv.get_feature_names()
    return pd.DataFrame(bv_matrix, columns=[term + '_nw' for term in bv_vocab])

# Char-NGrams Freq function
def char_ngrams(text, min_n, max_n, str):
    """Build a character n-gram count matrix for a collection of documents.

    A CountVectorizer with ``analyzer='char_wb'``,
    ``ngram_range=(min_n, max_n)`` and ``max_features=1000`` is fitted on
    ``text`` itself, so the resulting columns depend on the documents passed in.

    Parameters
    ----------
    text : iterable of str
        Documents to vectorize.
    min_n, max_n : int
        Lower and upper bound of the character n-gram sizes.
    str : str
        Label used only in the progress message (e.g. 'train' or 'test').

    Returns
    -------
    pandas.DataFrame
        One row per document, one column per character n-gram; every column
        name carries the suffix '_nc'.
    """
    print("Completing char-ngram generation for " + str)
    bv = CountVectorizer(ngram_range=(min_n, max_n), max_features=1000, analyzer='char_wb')
    bv_matrix = bv.fit_transform(text).toarray()
    # get_feature_names() was removed from newer scikit-learn releases in favour
    # of get_feature_names_out(); fall back to the old name on older installs.
    if hasattr(bv, 'get_feature_names_out'):
        bv_vocab = bv.get_feature_names_out()
    else:
        bv_vocab = bv.get_feature_names()
    return pd.DataFrame(bv_matrix, columns=[term + '_nc' for term in bv_vocab])

# TFIDF function
def tfidf(text, min_n, max_n, str):
    """Build a word n-gram TF-IDF matrix for a collection of documents.

    A TfidfVectorizer with ``ngram_range=(min_n, max_n)`` and
    ``max_features=1000`` is fitted on ``text`` itself, so the resulting
    columns depend on the documents passed in. Weights are rounded to two
    decimals.

    Parameters
    ----------
    text : iterable of str
        Documents to vectorize.
    min_n, max_n : int
        Lower and upper bound of the n-gram sizes.
    str : str
        Label used only in the progress message (e.g. 'train' or 'test').

    Returns
    -------
    pandas.DataFrame
        One row per document, one column per n-gram; every column name
        carries the suffix '_tw'.
    """
    print("Completing tfidf+ngram generation for " + str)
    tv = TfidfVectorizer(ngram_range=(min_n, max_n), max_features=1000)
    tv_matrix = tv.fit_transform(text).toarray()
    # get_feature_names() was removed from newer scikit-learn releases in favour
    # of get_feature_names_out(); fall back to the old name on older installs.
    if hasattr(tv, 'get_feature_names_out'):
        tv_vocab = tv.get_feature_names_out()
    else:
        tv_vocab = tv.get_feature_names()
    return pd.DataFrame(np.round(tv_matrix, 2), columns=[term + '_tw' for term in tv_vocab])

# Char-TFIDF function
def char_tfidf(text, min_n, max_n, str):
    """Build a character n-gram TF-IDF matrix for a collection of documents.

    A TfidfVectorizer with ``analyzer='char_wb'``,
    ``ngram_range=(min_n, max_n)`` and ``max_features=1000`` is fitted on
    ``text`` itself, so the resulting columns depend on the documents passed
    in. Weights are rounded to two decimals.

    Parameters
    ----------
    text : iterable of str
        Documents to vectorize.
    min_n, max_n : int
        Lower and upper bound of the character n-gram sizes.
    str : str
        Label used only in the progress message (e.g. 'train' or 'test').

    Returns
    -------
    pandas.DataFrame
        One row per document, one column per character n-gram; every column
        name carries the suffix '_tc'.
    """
    print("Completing char-tfidf+ngram generation for " + str)
    tv = TfidfVectorizer(ngram_range=(min_n, max_n), max_features=1000, analyzer='char_wb')
    tv_matrix = tv.fit_transform(text).toarray()
    # get_feature_names() was removed from newer scikit-learn releases in favour
    # of get_feature_names_out(); fall back to the old name on older installs.
    if hasattr(tv, 'get_feature_names_out'):
        tv_vocab = tv.get_feature_names_out()
    else:
        tv_vocab = tv.get_feature_names()
    return pd.DataFrame(np.round(tv_matrix, 2), columns=[term + '_tc' for term in tv_vocab])


# Sentiment Analysis function
def sentimentAnalyzer(tweets, str):
    """Score every tweet with NLTK's SentimentIntensityAnalyzer.

    Tweets are consumed in iteration order rather than looked up by integer
    label, so a pandas Series whose index is not 0..n-1 works as well as a
    list. The result is built in one step, which gives the column a numeric
    dtype instead of the object dtype produced by cell-by-cell assignment.

    Parameters
    ----------
    tweets : iterable of str
        Texts to score.
    str : str
        Label used only in the progress message (e.g. 'train' or 'test').

    Returns
    -------
    pandas.DataFrame
        A single column 'sentiment' holding the 'compound' entry of
        ``polarity_scores`` for each tweet, indexed 0..n-1 in input order.
    """
    sid = SentimentIntensityAnalyzer()
    print("Completing the sentiment analysis for " + str)
    compound_scores = [sid.polarity_scores(tweet).get('compound') for tweet in tweets]
    return pd.DataFrame({'sentiment': compound_scores})

# Linguistic Feature Extraction 
def text_length(text):
    """Return the number of characters in ``text``."""
    character_count = len(text)
    return character_count

def number_of_tokens(text):
    """Return how many tokens ``nltk.word_tokenize`` produces for ``text``."""
    return len(nltk.word_tokenize(text))

def is_retweet(text):
    """Return 1 if the token 'RT' occurs among the NLTK tokens of ``text``, else 0."""
    return 1 if 'RT' in nltk.word_tokenize(text) else 0

def number_of_mentions(text):
    """Count the non-overlapping matches of '@' followed by non-whitespace in ``text``."""
    return sum(1 for _ in re.finditer(r"@\S+", text))

def number_of_hashtags(text):
    """Count the non-overlapping matches of '#' followed by non-whitespace in ``text``."""
    hashtag_pattern = re.compile(r"#\S+")
    matches = hashtag_pattern.findall(text)
    return len(matches)

def number_of_links(text):
    """Count the non-overlapping matches of 'http' followed by non-whitespace in ``text``."""
    return sum(1 for _ in re.finditer(r"http\S+", text))

def number_of_elongated(text):
    """Count whitespace-separated words containing a character repeated three times in a row."""
    triple_repeat = re.compile(r"(.)\1{2}")
    return sum(1 for word in text.split() if triple_repeat.search(word))

def number_of_slangs(text):
    """Count the NLTK tokens of ``text`` that are keys of the slang lexicon.

    Membership is tested against the ``slang_map`` dict instead of the
    ``slang_words`` list. ``slang_words`` is just the sorted keys of
    ``slang_map``, so the result is the same, but each lookup is a hash probe
    rather than a scan of the whole list.

    Tokens are compared exactly as produced by the tokenizer (no case
    folding is applied).
    """
    return sum(1 for word in nltk.word_tokenize(text) if word in slang_map)

def number_of_emoticons(text):
    """Count the non-overlapping matches of '&#' followed by non-whitespace in ``text``."""
    return sum(1 for _ in re.finditer(r"&#\S+", text))

def linguisticFeatures(tweets, str):
    """Compute the per-tweet linguistic counts and return them as one frame.

    Tweets are consumed in iteration order rather than looked up by integer
    label, so a pandas Series whose index is not 0..n-1 works as well as a
    list; rows of the result are numbered 0..n-1 in input order, like the
    frames returned by the vectorizer-based feature functions.

    Parameters
    ----------
    tweets : iterable of str
        Raw tweet texts.
    str : str
        Label used only in the progress message (e.g. 'train' or 'test').

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns 'text length', 'number of words',
        'retweet', 'number of mentions', 'number of hashtags',
        'number of links', 'number of elongated', 'number of slangs' and
        'number of emoticons', in that order.
    """
    print("Completing the liguistic feature extraction for " + str)
    # (output column, extractor) pairs, listed in the output column order.
    extractors = (
        ('text length', text_length),
        ('number of words', number_of_tokens),
        ('retweet', is_retweet),
        ('number of mentions', number_of_mentions),
        ('number of hashtags', number_of_hashtags),
        ('number of links', number_of_links),
        ('number of elongated', number_of_elongated),
        ('number of slangs', number_of_slangs),
        ('number of emoticons', number_of_emoticons),
    )
    column_names = [column for column, _ in extractors]
    rows = [[extract(tweet) for _, extract in extractors] for tweet in tweets]
    # Passing columns explicitly keeps the schema even when there are no tweets.
    return pd.DataFrame(rows, columns=column_names)

# Readability score extraction
def remove_user_names(text):
    """Return ``text`` with every '@' + non-whitespace run deleted."""
    return re.compile(r'@\S+').sub('', text)

def remove_hashtags(text):
    """Return ``text`` with every '#' + non-whitespace run deleted."""
    return re.compile(r'#\S+').sub('', text)

def remove_links(text):
    """Return ``text`` with every 'http' + non-whitespace run deleted."""
    return re.compile(r'http\S+').sub('', text)

def remove_underscore(text):
    """Return ``text`` with every underscore character deleted."""
    return ''.join(text.split('_'))

def remove_emojis(text):
    """Return ``text`` with every '&#' + non-whitespace run deleted.

    The previous pattern was written as ``\\&#S+``, which matches '&#'
    followed by literal capital 'S' characters and therefore left numeric
    entities such as '&#128512;' in place. The pattern now uses ``\\S+``
    (non-whitespace), the same expression number_of_emoticons counts with.
    """
    return re.sub(r'&#\S+', '', text)

def readabilityScores(tweets, str):
    """Compute readability grades and sentence statistics for every tweet.

    Each tweet is first passed through remove_user_names, remove_hashtags,
    remove_links, remove_underscore and remove_emojis (in that order) and then
    handed to ``readability.getmeasures(..., lang='en')``.

    Tweets are consumed in iteration order rather than looked up by integer
    label, so a pandas Series whose index is not 0..n-1 works as well as a
    list; rows of the result are numbered 0..n-1 in input order, like the
    frames returned by the vectorizer-based feature functions.

    Parameters
    ----------
    tweets : iterable of str
        Raw tweet texts.
    str : str
        Label used only in the progress message (e.g. 'train' or 'test').

    Returns
    -------
    pandas.DataFrame
        One row per tweet. The first nine columns come from the
        'readability grades' section of the measures, the remaining eleven
        from its 'sentence info' section.
    """
    print("Completing the readability scores extraction for " + str)
    # Keys of measures['readability grades']; each one is also the column name.
    grade_keys = (
        'Kincaid',
        'ARI',
        'Coleman-Liau',
        'FleschReadingEase',
        'GunningFogIndex',
        'LIX',
        'SMOGIndex',
        'RIX',
        'DaleChallIndex',
    )
    # (output column, key of measures['sentence info']) pairs.
    sentence_columns = (
        ('Characters per word', 'characters_per_word'),
        ('Syllables per word', 'syll_per_word'),
        ('Words per sentence', 'words_per_sentence'),
        ('Type toke ratio', 'type_token_ratio'),
        ('Characters', 'characters'),
        ('Syllables', 'syllables'),
        ('Words', 'words'),
        ('Wordtypes', 'wordtypes'),
        ('Long words', 'long_words'),
        ('Complex words', 'complex_words'),
        ('Complex words dc', 'complex_words_dc'),
    )
    column_names = list(grade_keys) + [column for column, _ in sentence_columns]
    # Cleaning steps, applied in this order before the text is measured.
    cleaners = (remove_user_names, remove_hashtags, remove_links, remove_underscore, remove_emojis)

    rows = []
    for tweet in tweets:
        for clean in cleaners:
            tweet = clean(tweet)
        measures = readability.getmeasures(tweet, lang='en')
        grades = measures['readability grades']
        sentence_info = measures['sentence info']
        rows.append(
            [grades[key] for key in grade_keys]
            + [sentence_info[key] for _, key in sentence_columns]
        )

    # Passing columns explicitly keeps the schema even when there are no tweets.
    return pd.DataFrame(rows, columns=column_names)

In [None]:
# create an h2o instance
#h2o.init()
#nltk.download('stopwords')  # might need if running nltk + stopwords for the first time

In [4]:
# Prepare data for feature extraction; check that the paths are correct.
# Both paths are relative, so they are resolved against the current working
# directory at the time this cell runs.
filepath_train = "EnglishCleanedTrainingData.csv"
filepath_test = "EnglishCleanedTestingData.csv"

# Disabled: H2O frames, only needed by the commented-out word2vec step.
#train_data_h2o = h2o.upload_file(filepath_train) 
#test_data_h2o = h2o.upload_file(filepath_test)

# Load both splits and keep their "labels" column for the final concatenation.
train_data = pd.read_csv(filepath_train)
test_data = pd.read_csv(filepath_test)
train_labels = train_data["labels"]
test_labels = test_data["labels"]

In [5]:
# Word2Vec generation resulting in pandas data frames
#vecs_train = (h2o_w2vec(train_data_h2o['cleaned_tweet'], 'train')).as_data_frame()
#vecs_test = (h2o_w2vec(test_data_h2o['cleaned_tweet'], 'test')).as_data_frame()

In [6]:
# Word n-gram (1-3) generation + frequency calculation -- NOTE no header column
# NOTE(review): ngrams() fits a new vectorizer on whatever text it receives, so
# the train and test frames come from independently fitted vocabularies and
# their columns are not guaranteed to match -- confirm this is intended.
ngram_train = ngrams(train_data['cleaned_tweet'], 1, 3, 'train') 
ngram_test = ngrams(test_data['cleaned_tweet'], 1, 3, 'test')
# Character n-gram (2-5) generation + frequency calculation -- NOTE no header column
char_ngram_train = char_ngrams(train_data['cleaned_tweet'], 2, 5, 'train') 
char_ngram_test = char_ngrams(test_data['cleaned_tweet'], 2, 5, 'test')

Completing ngram generation for train
Completing ngram generation for test
Completing char-ngram generation for train
Completing char-ngram generation for test


In [7]:
# Word n-gram (1-3) TFIDF generation -- NOTE no header column
# NOTE(review): tfidf() fits a new vectorizer on whatever text it receives, so
# the train and test frames come from independently fitted vocabularies and
# their columns are not guaranteed to match -- confirm this is intended.
tfidf_train = tfidf(train_data['cleaned_tweet'], 1, 3, 'train')
tfidf_test = tfidf(test_data['cleaned_tweet'], 1, 3, 'test')
# Character n-gram (2-5) TFIDF generation -- NOTE no header column
char_tfidf_train = char_tfidf(train_data['cleaned_tweet'], 2, 5, 'train')
char_tfidf_test = char_tfidf(test_data['cleaned_tweet'], 2, 5, 'test')

Completing tfidf+ngram generation for train
Completing tfidf+ngram generation for test
Completing char-tfidf+ngram generation for train
Completing char-tfidf+ngram generation for test


In [8]:
# Sentiment analysis of the cleaned tweets; each result is a pandas dataframe
# with a single 'sentiment' column.
sentiment_train = sentimentAnalyzer(train_data['cleaned_tweet'], 'train') 
sentiment_test = sentimentAnalyzer(test_data['cleaned_tweet'], 'test')

Completing the sentiment analysis for train
Completing the sentiment analysis for test


In [9]:
# Linguistic feature extraction; uses the 'tweet' column rather than
# 'cleaned_tweet', unlike the n-gram, TFIDF and sentiment cells.
linguistic_train = linguisticFeatures(train_data['tweet'], 'train')
linguistic_test = linguisticFeatures(test_data['tweet'], 'test')

Completing the liguistic feature extraction for train
Completing the liguistic feature extraction for test


In [10]:
# Readability scores; computed from the 'tweet' column (readabilityScores strips
# mentions, hashtags, links and underscores itself before measuring).
readability_train = readabilityScores(train_data['tweet'], 'train')
readability_test = readabilityScores(test_data['tweet'], 'test')

Completing the readability scores extraction for train
Completing the readability scores extraction for test


In [11]:
# Combine all features into one pandas dataframe each for train and test.
# axis=1 places the frames side by side, with the labels as the last column;
# pandas aligns the rows on the index when concatenating this way.
print("Concatenating in process")
training_data = pd.concat([ngram_train, char_ngram_train, tfidf_train, char_tfidf_train, sentiment_train, 
                           linguistic_train, readability_train, train_labels], axis=1)

testing_data = pd.concat([ngram_test, char_ngram_test, tfidf_test, char_tfidf_test, sentiment_test, 
                           linguistic_test, readability_test, test_labels], axis=1) 
print("Did it!")


Concatenating in process
Did it!


In [12]:
def getTrainData():
    """Return the module-level ``training_data`` frame (the same object, not a copy)."""
    return training_data
def getTestData():
    """Return the module-level ``testing_data`` frame (the same object, not a copy)."""
    return testing_data