In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
import pandas as pd
import numpy as np
import re
import readability
import os
from polyglot.text import Text

In [None]:
# Preparations
# The input CSV files are opened by bare file name later in the notebook, so
# the working directory is switched to the folder that contains them.
# Previously used location, kept for reference:
#   /home/mackenzie/workspace/PycharmProjects/DAADRISE_AbusiveLangProject/featureExtraction/
# NOTE(review): this is a machine-specific absolute path; adjust it before
# running the notebook anywhere else.
DATA_DIRECTORY = 'C:\\Users\\mikec\\Documents'
os.chdir(DATA_DIRECTORY)

In [None]:
# All Feature Extraction Functions
# NGrams Freq function
def ngrams(text, min_n, max_n, str):
    """Build a word n-gram count table for a collection of documents.

    A new ``CountVectorizer`` (``max_features=1000``) is fitted on ``text``
    and the resulting counts are returned as a DataFrame whose column names
    are the n-grams with the suffix ``_nw`` appended.

    Args:
        text: Iterable of document strings passed to ``fit_transform``.
        min_n: Lower bound of the n-gram range.
        max_n: Upper bound of the n-gram range.
        str: Label of the data split; only used in the progress message.
            The name shadows the builtin ``str`` inside this function and is
            kept unchanged for backward compatibility.

    Returns:
        pandas.DataFrame of n-gram counts, one column per selected n-gram.
    """
    print("Completing ngram generation for " + str)
    bv = CountVectorizer(ngram_range=(min_n, max_n), max_features=1000)
    bv_matrix = bv.fit_transform(text).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(bv, 'get_feature_names_out'):
        bv_vocab = bv.get_feature_names_out()
    else:
        bv_vocab = bv.get_feature_names()
    return pd.DataFrame(bv_matrix, columns=[term + '_nw' for term in bv_vocab])

# Char-NGrams Freq function
def char_ngrams(text, min_n, max_n, str):
    """Build a character n-gram count table for a collection of documents.

    A new ``CountVectorizer`` with ``analyzer='char_wb'`` and
    ``max_features=1000`` is fitted on ``text``. Column names are the
    character n-grams with the suffix ``_nc`` appended.

    Args:
        text: Iterable of document strings passed to ``fit_transform``.
        min_n: Lower bound of the n-gram range.
        max_n: Upper bound of the n-gram range.
        str: Label of the data split; only used in the progress message.
            The name shadows the builtin ``str`` inside this function and is
            kept unchanged for backward compatibility.

    Returns:
        pandas.DataFrame of character n-gram counts.
    """
    print("Completing char-ngram generation for " + str)
    bv = CountVectorizer(ngram_range=(min_n, max_n), max_features=1000, analyzer='char_wb')
    bv_matrix = bv.fit_transform(text).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(bv, 'get_feature_names_out'):
        bv_vocab = bv.get_feature_names_out()
    else:
        bv_vocab = bv.get_feature_names()
    return pd.DataFrame(bv_matrix, columns=[gram + '_nc' for gram in bv_vocab])

# TFIDF function
def tfidf(text, min_n, max_n, str):
    """Build a word n-gram TF-IDF table for a collection of documents.

    A new ``TfidfVectorizer`` (``max_features=1000``) is fitted on ``text``.
    Weights are rounded to two decimals and column names are the n-grams with
    the suffix ``_tw`` appended.

    Args:
        text: Iterable of document strings passed to ``fit_transform``.
        min_n: Lower bound of the n-gram range.
        max_n: Upper bound of the n-gram range.
        str: Label of the data split; only used in the progress message.
            The name shadows the builtin ``str`` inside this function and is
            kept unchanged for backward compatibility.

    Returns:
        pandas.DataFrame of rounded TF-IDF weights.
    """
    print("Completing tfidf+ngram generation for " + str)
    tv = TfidfVectorizer(ngram_range=(min_n, max_n), max_features=1000)
    tv_matrix = tv.fit_transform(text).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(tv, 'get_feature_names_out'):
        tv_vocab = tv.get_feature_names_out()
    else:
        tv_vocab = tv.get_feature_names()
    return pd.DataFrame(np.round(tv_matrix, 2), columns=[term + '_tw' for term in tv_vocab])

# Char-TFIDF function
def char_tfidf(text, min_n, max_n, str):
    """Build a character n-gram TF-IDF table for a collection of documents.

    A new ``TfidfVectorizer`` with ``analyzer='char_wb'`` and
    ``max_features=1000`` is fitted on ``text``. Weights are rounded to two
    decimals and column names are the n-grams with the suffix ``_tc``.

    Args:
        text: Iterable of document strings passed to ``fit_transform``.
        min_n: Lower bound of the n-gram range.
        max_n: Upper bound of the n-gram range.
        str: Label of the data split; only used in the progress message.
            The name shadows the builtin ``str`` inside this function and is
            kept unchanged for backward compatibility.

    Returns:
        pandas.DataFrame of rounded character-level TF-IDF weights.
    """
    print("Completing char-tfidf+ngram generation for " + str)
    tv = TfidfVectorizer(ngram_range=(min_n, max_n), max_features=1000, analyzer='char_wb')
    tv_matrix = tv.fit_transform(text).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(tv, 'get_feature_names_out'):
        tv_vocab = tv.get_feature_names_out()
    else:
        tv_vocab = tv.get_feature_names()
    return pd.DataFrame(np.round(tv_matrix, 2), columns=[gram + '_tc' for gram in tv_vocab])

# Sentiment Analysis function
def sentimentAnalyzer(tweets, str):
    """Compute the VADER compound sentiment score of every tweet.

    Args:
        tweets: Iterable of tweet strings (for example a list or a pandas
            Series). Elements are consumed in iteration order.
        str: Label of the data split; only used in the progress message.
            The name shadows the builtin ``str`` inside this function and is
            kept unchanged for backward compatibility.

    Returns:
        pandas.DataFrame with a single column ``sentiment`` holding the
        ``'compound'`` entry of ``polarity_scores`` for each tweet, indexed
        0..n-1 in input order.
    """
    sid = SentimentIntensityAnalyzer()
    print("Completing the sentiment analysis for " + str)
    # Iterate over the values instead of indexing with range(len(...)): on a
    # pandas Series ``tweets[i]`` is a label lookup, which fails when the
    # index is not 0..n-1 (e.g. after filtering rows). Building the column in
    # one step also avoids growing the frame one cell at a time.
    scores = [sid.polarity_scores(tweet).get('compound') for tweet in tweets]
    return pd.DataFrame({'sentiment': scores}, columns=['sentiment'])

# Linguistic Feature Extraction
def text_length(text):
    """Return ``len(text)``, i.e. the number of characters for a string."""
    n_chars = len(text)
    return n_chars

def number_of_tokens(text):
    """Return how many tokens ``nltk.word_tokenize`` produces for ``text``."""
    return len(nltk.word_tokenize(text))

def number_of_mentions(text):
    """Count the non-overlapping matches of ``@`` followed by non-whitespace."""
    return sum(1 for _ in re.finditer(r"@\S+", text))

def number_of_hashtags(text):
    """Count the non-overlapping matches of ``#`` followed by non-whitespace."""
    return sum(1 for _ in re.finditer(r"#\S+", text))

def number_of_links(text):
    """Count the non-overlapping matches of ``http`` followed by non-whitespace."""
    return sum(1 for _ in re.finditer(r"http\S+", text))

def number_of_emoticons(text):
    """Count the characters of ``text`` that are members of ``emoji.UNICODE_EMOJI``.

    NOTE(review): ``emoji`` does not appear among the imports visible in this
    notebook; confirm it is imported before this function is called on
    non-empty text.
    """
    return sum(1 for symbol in text if symbol in emoji.UNICODE_EMOJI)

def linguisticFeatures(tweets, str):
    """Compute simple surface statistics for every tweet.

    Args:
        tweets: Iterable of tweet strings (for example a list or a pandas
            Series). Elements are consumed in iteration order.
        str: Label of the data split; only used in the progress message.
            The name shadows the builtin ``str`` inside this function and is
            kept unchanged for backward compatibility.

    Returns:
        pandas.DataFrame indexed 0..n-1 with the columns ``text length``,
        ``number of words``, ``number of mentions``, ``number of hashtags``,
        ``number of links`` and ``number of emoticons``, in that order.
    """
    print("Completing the liguistic feature extraction for " + str)
    # Output column name -> per-tweet extractor, in output column order.
    extractors = [
        ('text length', text_length),
        ('number of words', number_of_tokens),
        ('number of mentions', number_of_mentions),
        ('number of hashtags', number_of_hashtags),
        ('number of links', number_of_links),
        ('number of emoticons', number_of_emoticons),
    ]
    # Take the values positionally instead of using ``tweets[i]``: on a pandas
    # Series that is a label lookup and fails when the index is not 0..n-1.
    texts = list(tweets)
    data = {name: [extract(tweet) for tweet in texts] for name, extract in extractors}
    return pd.DataFrame(data, columns=[name for name, _ in extractors])

# Readability score extraction
def remove_user_names(text):
    """Return ``text`` with every ``@`` + non-whitespace run deleted."""
    mention_pattern = re.compile(r'@\S+')
    return mention_pattern.sub('', text)

def remove_hashtags(text):
    """Return ``text`` with every ``#`` + non-whitespace run deleted."""
    hashtag_pattern = re.compile(r'#\S+')
    return hashtag_pattern.sub('', text)

def remove_links(text):
    """Return ``text`` with every ``http`` + non-whitespace run deleted."""
    link_pattern = re.compile(r'http\S+')
    return link_pattern.sub('', text)

def remove_underscore(text):
    """Return ``text`` with all underscore characters dropped."""
    pieces = text.split('_')
    return ''.join(pieces)

def remove_emojis(text):
    """Return ``text`` without the characters that are members of ``emoji.UNICODE_EMOJI``.

    NOTE(review): ``emoji`` does not appear among the imports visible in this
    notebook; confirm it is imported before this function is called on
    non-empty text.
    """
    kept = []
    for symbol in text:
        if symbol not in emoji.UNICODE_EMOJI:
            kept.append(symbol)
    return ''.join(kept)

def readabilityScores(tweets, str, lang='de'):
    """Compute readability grades and sentence statistics for every tweet.

    Each tweet is first stripped of user names, hashtags, links, underscores
    and emojis (in that order) and then passed to ``readability.getmeasures``.

    Args:
        tweets: Iterable of tweet strings (for example a list or a pandas
            Series). Elements are consumed in iteration order.
        str: Label of the data split; only used in the progress message.
            The name shadows the builtin ``str`` inside this function and is
            kept unchanged for backward compatibility.
        lang: Language code forwarded to ``readability.getmeasures``.
            Defaults to ``'de'``, the value that used to be hard-coded.
            NOTE(review): this notebook loads files named
            ``EnglishCleaned...Data.csv``; confirm whether callers should
            pass ``lang='en'`` instead.

    Returns:
        pandas.DataFrame indexed 0..n-1 with nine readability-grade columns
        followed by eleven sentence-statistic columns.
    """
    print("Completing the readability scores extraction for " + str)
    # Keys under measures['readability grades']; they double as column names.
    grade_keys = [
        'Kincaid', 'ARI', 'Coleman-Liau', 'FleschReadingEase',
        'GunningFogIndex', 'LIX', 'SMOGIndex', 'RIX', 'DaleChallIndex',
    ]
    # (output column name, key under measures['sentence info'])
    sentence_columns = [
        ('Characters per word', 'characters_per_word'),
        ('Syllables per word', 'syll_per_word'),
        ('Words per sentence', 'words_per_sentence'),
        ('Type toke ratio', 'type_token_ratio'),
        ('Characters', 'characters'),
        ('Syllables', 'syllables'),
        ('Words', 'words'),
        ('Wordtypes', 'wordtypes'),
        ('Long words', 'long_words'),
        ('Complex words', 'complex_words'),
        ('Complex words dc', 'complex_words_dc'),
    ]
    column_order = grade_keys + [name for name, _ in sentence_columns]
    collected = {name: [] for name in column_order}

    # Iterate over the values instead of using ``tweets[i]``: on a pandas
    # Series that is a label lookup and fails when the index is not 0..n-1.
    for tweet in tweets:
        tweet = remove_user_names(tweet)
        tweet = remove_hashtags(tweet)
        tweet = remove_links(tweet)
        tweet = remove_underscore(tweet)
        tweet = remove_emojis(tweet)
        measures = readability.getmeasures(tweet, lang=lang)
        grades = measures['readability grades']
        sentence_info = measures['sentence info']
        for key in grade_keys:
            collected[key].append(grades[key])
        for name, key in sentence_columns:
            collected[name].append(sentence_info[key])

    return pd.DataFrame(collected, columns=column_order)

In [None]:
# Load the cleaned train and test splits and pull out their label columns.
# Both file names are relative to the current working directory, so check the
# paths first if a read fails.
filepath_train = "EnglishCleanedTrainingData.csv"
train_data = pd.read_csv(filepath_train)
train_labels = train_data["labels"]

filepath_test = "EnglishCleanedTestingData.csv"
test_data = pd.read_csv(filepath_test)
test_labels = test_data["labels"]

In [None]:
# Word n-gram counts over 1- to 3-grams for each split.
# NOTE(review): every call fits its own vectorizer, so the train and test
# frames are not guaranteed to share the same feature columns.
ngram_train = ngrams(text=train_data['cleaned_tweet'], min_n=1, max_n=3, str='train')
ngram_test = ngrams(text=test_data['cleaned_tweet'], min_n=1, max_n=3, str='test')
# Character n-gram counts over 2- to 5-grams for each split.
char_ngram_train = char_ngrams(text=train_data['cleaned_tweet'], min_n=2, max_n=5, str='train')
char_ngram_test = char_ngrams(text=test_data['cleaned_tweet'], min_n=2, max_n=5, str='test')

In [None]:
# Word-level TF-IDF features over 1- to 3-grams for each split.
# NOTE(review): every call fits its own vectorizer, so the train and test
# frames are not guaranteed to share the same feature columns.
tfidf_train = tfidf(text=train_data['cleaned_tweet'], min_n=1, max_n=3, str='train')
tfidf_test = tfidf(text=test_data['cleaned_tweet'], min_n=1, max_n=3, str='test')
# Character-level TF-IDF features over 2- to 5-grams for each split.
char_tfidf_train = char_tfidf(text=train_data['cleaned_tweet'], min_n=2, max_n=5, str='train')
char_tfidf_test = char_tfidf(text=test_data['cleaned_tweet'], min_n=2, max_n=5, str='test')

In [None]:
# Compound sentiment score per cleaned tweet, one DataFrame per split.
sentiment_train = sentimentAnalyzer(tweets=train_data['cleaned_tweet'], str='train')
sentiment_test = sentimentAnalyzer(tweets=test_data['cleaned_tweet'], str='test')

In [None]:
# Linguistic feature extraction on the 'tweet' column (not 'cleaned_tweet').
linguistic_train = linguisticFeatures(tweets=train_data['tweet'], str='train')
linguistic_test = linguisticFeatures(tweets=test_data['tweet'], str='test')

In [None]:
# Readability scores computed from the 'tweet' column (not 'cleaned_tweet').
readability_train = readabilityScores(tweets=train_data['tweet'], str='train')
readability_test = readabilityScores(tweets=test_data['tweet'], str='test')

In [None]:
# Join every training feature frame column-wise, with the labels last.
print("Concatenating in process")
feature_frames = [
    ngram_train,
    char_ngram_train,
    tfidf_train,
    char_tfidf_train,
    sentiment_train,
    linguistic_train,
    readability_train,
    train_labels,
]  # vecs_train was taken out
training_data = pd.concat(feature_frames, axis=1)
print("Did it!")

# The test-set frame is not assembled; the earlier attempt is kept for reference.
#testing_data = pd.concat([vecs_test, ngram_test, tfidf_test, sentiment_test, test_labels], axis=1) 
#export_csv2 = testing_data.to_csv('english_test_data.csv', index = None, header=True, encoding='utf-8')


In [None]:
def getData():
    """Return the module-level ``training_data`` object."""
    combined = training_data
    return combined