In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
import pandas as pd
import numpy as np
import re
import readability
import os

In [2]:
# Preparations
# Switch the working directory to the folder that holds the input CSV.
# The folder can be overridden without editing the notebook by setting the
# FEATURE_EXTRACTION_DATA_DIR environment variable; when it is unset the
# original author's local path is used, so existing setups keep working.
DATA_DIR = os.environ.get('FEATURE_EXTRACTION_DATA_DIR', 'C:\\Users\\mikec\\Documents')
os.chdir(DATA_DIR)

In [3]:
# All Feature Extraction Functions
# NGrams Freq function
def ngrams(text, min_n, max_n, str):
    """Build word n-gram count features for a collection of documents.

    Args:
        text: Iterable of document strings handed to
            ``CountVectorizer.fit_transform``.
        min_n: Smallest n-gram size to extract.
        max_n: Largest n-gram size to extract.
        str: Name of the data set; only used in the progress message.
            (Shadows the built-in ``str``; the name is kept so existing
            callers are unaffected.)

    Returns:
        pandas.DataFrame with one row per document and one count column per
        retained n-gram (``max_features=1000``); every column name carries
        the suffix ``_nw``.
    """
    print("Completing ngram generation for " + str)
    bv = CountVectorizer(ngram_range=(min_n, max_n), max_features=1000)
    bv_matrix = bv.fit_transform(text).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; use the
    # replacement when it exists and fall back for older installations.
    if hasattr(bv, 'get_feature_names_out'):
        bv_vocab = bv.get_feature_names_out()
    else:
        bv_vocab = bv.get_feature_names()
    return pd.DataFrame(bv_matrix, columns=[col + '_nw' for col in bv_vocab])

# Char-NGrams Freq function
def char_ngrams(text, min_n, max_n, str):
    """Build character n-gram count features for a collection of documents.

    Args:
        text: Iterable of document strings handed to
            ``CountVectorizer.fit_transform``.
        min_n: Smallest character n-gram size to extract.
        max_n: Largest character n-gram size to extract.
        str: Name of the data set; only used in the progress message.
            (Shadows the built-in ``str``; the name is kept so existing
            callers are unaffected.)

    Returns:
        pandas.DataFrame with one row per document and one count column per
        retained character n-gram (``analyzer='char_wb'``,
        ``max_features=1000``); every column name carries the suffix ``_nc``.
    """
    print("Completing char-ngram generation for " + str)
    bv = CountVectorizer(ngram_range=(min_n, max_n), max_features=1000, analyzer='char_wb')
    bv_matrix = bv.fit_transform(text).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; use the
    # replacement when it exists and fall back for older installations.
    if hasattr(bv, 'get_feature_names_out'):
        bv_vocab = bv.get_feature_names_out()
    else:
        bv_vocab = bv.get_feature_names()
    return pd.DataFrame(bv_matrix, columns=[col + '_nc' for col in bv_vocab])

# TFIDF function
def tfidf(text, min_n, max_n, str):
    """Build word n-gram TF-IDF features for a collection of documents.

    Args:
        text: Iterable of document strings handed to
            ``TfidfVectorizer.fit_transform``.
        min_n: Smallest n-gram size to extract.
        max_n: Largest n-gram size to extract.
        str: Name of the data set; only used in the progress message.
            (Shadows the built-in ``str``; the name is kept so existing
            callers are unaffected.)

    Returns:
        pandas.DataFrame with one row per document and one column per
        retained n-gram (``max_features=1000``). Weights are rounded to two
        decimals and every column name carries the suffix ``_tw``.
    """
    print("Completing tfidf+ngram generation for " + str)
    tv = TfidfVectorizer(ngram_range=(min_n, max_n), max_features=1000)
    tv_matrix = tv.fit_transform(text).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; use the
    # replacement when it exists and fall back for older installations.
    if hasattr(tv, 'get_feature_names_out'):
        tv_vocab = tv.get_feature_names_out()
    else:
        tv_vocab = tv.get_feature_names()
    return pd.DataFrame(np.round(tv_matrix, 2), columns=[col + '_tw' for col in tv_vocab])

# Char-TFIDF function
def char_tfidf(text, min_n, max_n, str):
    """Build character n-gram TF-IDF features for a collection of documents.

    Args:
        text: Iterable of document strings handed to
            ``TfidfVectorizer.fit_transform``.
        min_n: Smallest character n-gram size to extract.
        max_n: Largest character n-gram size to extract.
        str: Name of the data set; only used in the progress message.
            (Shadows the built-in ``str``; the name is kept so existing
            callers are unaffected.)

    Returns:
        pandas.DataFrame with one row per document and one column per
        retained character n-gram (``analyzer='char_wb'``,
        ``max_features=1000``). Weights are rounded to two decimals and every
        column name carries the suffix ``_tc``.
    """
    print("Completing char-tfidf+ngram generation for " + str)
    tv = TfidfVectorizer(ngram_range=(min_n, max_n), max_features=1000, analyzer='char_wb')
    tv_matrix = tv.fit_transform(text).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; use the
    # replacement when it exists and fall back for older installations.
    if hasattr(tv, 'get_feature_names_out'):
        tv_vocab = tv.get_feature_names_out()
    else:
        tv_vocab = tv.get_feature_names()
    return pd.DataFrame(np.round(tv_matrix, 2), columns=[col + '_tc' for col in tv_vocab])

# Sentiment Analysis function
def sentimentAnalyzer(tweets, str):
    """Score every tweet with VADER and return the compound sentiment value.

    Args:
        tweets: Iterable of tweet strings (list, pandas Series, ...). The
            tweets are visited in positional order, so a Series whose index
            is not 0..n-1 is handled as well.
        str: Name of the data set; only used in the progress message.
            (Shadows the built-in ``str``; the name is kept so existing
            callers are unaffected.)

    Returns:
        pandas.DataFrame with a single column ``sentiment`` holding the
        ``compound`` entry of ``polarity_scores`` for each tweet, one row per
        tweet, indexed 0..n-1.
    """
    sid = SentimentIntensityAnalyzer()
    print("Completing the sentiment analysis for " + str)
    # Collect all scores first and build the frame once; assigning cell by
    # cell with .at grows the frame row by row and leaves an object column.
    scores = [sid.polarity_scores(tweet).get('compound') for tweet in tweets]
    return pd.DataFrame({'sentiment': scores})

# Linguistic Feature Extraction 
def text_length(text):
    """Return the length of ``text`` as reported by ``len``."""
    n_chars = len(text)
    return n_chars

def number_of_tokens(text):
    """Return how many tokens ``nltk.word_tokenize`` yields for ``text``."""
    return len(nltk.word_tokenize(text))

def number_of_mentions(text):
    """Count the substrings of ``text`` that match ``@`` followed by non-whitespace."""
    mention_hits = re.finditer(r"@\S+", text)
    return sum(1 for _ in mention_hits)

def number_of_hashtags(text):
    """Count the substrings of ``text`` that match ``#`` followed by non-whitespace."""
    hashtag_hits = re.finditer(r"#\S+", text)
    return sum(1 for _ in hashtag_hits)

def number_of_links(text):
    """Count the substrings of ``text`` that match ``http`` followed by non-whitespace."""
    link_hits = re.finditer(r"http\S+", text)
    return sum(1 for _ in link_hits)

def number_of_emoticons(text):
    """Count the characters of ``text`` that are members of ``emoji.UNICODE_EMOJI``.

    NOTE(review): ``emoji`` is not imported in the notebook's import cell, so
    this raises NameError for any non-empty text until ``import emoji`` is
    added there. The layout of ``UNICODE_EMOJI`` also appears to differ
    between releases of the emoji package -- confirm against the installed
    version.
    """
    return sum(1 for symbol in text if symbol in emoji.UNICODE_EMOJI)

def linguisticFeatures(tweets, str):
    """Compute simple surface statistics for every tweet.

    Args:
        tweets: Sized iterable of tweet strings (list, pandas Series, ...).
            Tweets are visited in positional order, so a Series whose index
            is not 0..n-1 is handled as well.
        str: Name of the data set; only used in the progress message.
            (Shadows the built-in ``str``; the name is kept so existing
            callers are unaffected.)

    Returns:
        pandas.DataFrame with one row per tweet (indexed 0..n-1) and the
        columns ``text length``, ``number of words``, ``number of mentions``,
        ``number of hashtags``, ``number of links`` and
        ``number of emoticons``, in that order.
    """
    print("Completing the liguistic feature extraction for " + str)
    # (output column, extractor) pairs; the list order is the column order.
    extractors = [
        ('text length', text_length),
        ('number of words', number_of_tokens),
        ('number of mentions', number_of_mentions),
        ('number of hashtags', number_of_hashtags),
        ('number of links', number_of_links),
        ('number of emoticons', number_of_emoticons),
    ]
    # Iterate over the values instead of tweets[i]: on a pandas Series,
    # tweets[i] is a label lookup and fails once the index has gaps.
    rows = [[extract(tweet) for _, extract in extractors] for tweet in tweets]
    return pd.DataFrame(rows, columns=[column for column, _ in extractors])

# Readability score extraction
def remove_user_names(text):
    """Return ``text`` with every ``@`` + non-whitespace run deleted."""
    mention_pattern = re.compile(r'@\S+')
    return mention_pattern.sub('', text)

def remove_hashtags(text):
    """Return ``text`` with every ``#`` + non-whitespace run deleted."""
    hashtag_pattern = re.compile(r'#\S+')
    return hashtag_pattern.sub('', text)

def remove_links(text):
    """Return ``text`` with every ``http`` + non-whitespace run deleted."""
    link_pattern = re.compile(r'http\S+')
    return link_pattern.sub('', text)

def remove_underscore(text):
    """Return ``text`` with all underscore characters dropped."""
    pieces = text.split('_')
    return ''.join(pieces)

def remove_emojis(text):
    """Return ``text`` without the characters found in ``emoji.UNICODE_EMOJI``.

    NOTE(review): ``emoji`` is not imported in the notebook's import cell, so
    this raises NameError for any non-empty text until ``import emoji`` is
    added there. The layout of ``UNICODE_EMOJI`` also appears to differ
    between releases of the emoji package -- confirm against the installed
    version.
    """
    kept = []
    for symbol in text:
        if symbol not in emoji.UNICODE_EMOJI:
            kept.append(symbol)
    return ''.join(kept)

def readabilityScores(tweets, str):
    """Compute readability grades and sentence statistics for every tweet.

    Each tweet is stripped of user mentions, hashtags, links, underscores and
    emojis (in that order) and then passed to
    ``readability.getmeasures(..., lang='de')``.

    Args:
        tweets: Sized iterable of tweet strings (list, pandas Series, ...).
            Tweets are visited in positional order, so a Series whose index
            is not 0..n-1 is handled as well.
        str: Name of the data set; only used in the progress message.
            (Shadows the built-in ``str``; the name is kept so existing
            callers are unaffected.)

    Returns:
        pandas.DataFrame with one row per tweet (indexed 0..n-1). The first
        nine columns come from ``measures['readability grades']``, the
        remaining eleven from ``measures['sentence info']``.
    """
    print("Completing the readability scores extraction for " + str)

    # Keys of measures['readability grades']; also used as output column names.
    grade_keys = [
        'Kincaid', 'ARI', 'Coleman-Liau', 'FleschReadingEase',
        'GunningFogIndex', 'LIX', 'SMOGIndex', 'RIX', 'DaleChallIndex',
    ]
    # (output column, key in measures['sentence info']).
    # 'Type toke ratio' is spelled as in the original output so that any
    # consumer relying on that column name keeps working.
    sentence_columns = [
        ('Characters per word', 'characters_per_word'),
        ('Syllables per word', 'syll_per_word'),
        ('Words per sentence', 'words_per_sentence'),
        ('Type toke ratio', 'type_token_ratio'),
        ('Characters', 'characters'),
        ('Syllables', 'syllables'),
        ('Words', 'words'),
        ('Wordtypes', 'wordtypes'),
        ('Long words', 'long_words'),
        ('Complex words', 'complex_words'),
        ('Complex words dc', 'complex_words_dc'),
    ]
    cleaners = (remove_user_names, remove_hashtags, remove_links,
                remove_underscore, remove_emojis)

    rows = []
    # Iterate over the values instead of tweets[i]: on a pandas Series,
    # tweets[i] is a label lookup and fails once the index has gaps.
    for tweet in tweets:
        for clean in cleaners:
            tweet = clean(tweet)
        measures = readability.getmeasures(tweet, lang='de')
        grades = measures['readability grades']
        sentence_info = measures['sentence info']
        rows.append([grades[key] for key in grade_keys]
                    + [sentence_info[key] for _, key in sentence_columns])

    columns = grade_keys + [column for column, _ in sentence_columns]
    return pd.DataFrame(rows, columns=columns)

In [4]:
# Load the tweet data set for feature extraction. The path is relative to the
# current working directory, so check that the directory is set correctly.
# Later cells read the 'cleaned_tweet', 'tweet' and 'labels' columns.
filepath_data = "GermanCleanedData.csv"
data = pd.read_csv(filepath_data)
# Keep the label column separately so it can be appended to the feature matrix.
labels = data["labels"]

In [5]:
# Word n-gram counts (1- to 3-grams) over the cleaned tweets -- NOTE no header column
ngram_data = ngrams(data['cleaned_tweet'], 1, 3, 'data') 
# Character n-gram counts (2- to 5-grams) over the cleaned tweets -- NOTE no header column
char_ngram_data = char_ngrams(data['cleaned_tweet'], 2, 5, 'data') 

Completing ngram generation for data
Completing char-ngram generation for data


In [6]:
# Word-level TF-IDF features (1- to 3-grams) over the cleaned tweets -- NOTE no header column
tfidf_data = tfidf(data['cleaned_tweet'], 1, 3, 'data')
# Character-level TF-IDF features (2- to 5-grams) over the cleaned tweets -- NOTE no header column
char_tfidf_data = char_tfidf(data['cleaned_tweet'], 2, 5, 'data')

Completing tfidf+ngram generation for data
Completing char-tfidf+ngram generation for data


In [None]:
# Sentiment analysis: one 'sentiment' score per cleaned tweet, as a pandas DataFrame
sentiment_data = sentimentAnalyzer(data['cleaned_tweet'], 'data')

In [None]:
# Linguistic feature extraction; runs on the 'tweet' column rather than
# 'cleaned_tweet' (the features count mentions, hashtags, links and emojis)
linguistic_data = linguisticFeatures(data['tweet'], 'data')

In [None]:
# Readability scores; computed from the 'tweet' column, which readabilityScores
# strips of mentions, hashtags, links, underscores and emojis itself
readability_data = readabilityScores(data['tweet'], 'data')

In [None]:
# Combine all feature frames and the labels column-wise into one pandas
# DataFrame; the train/test split is derived from it afterwards.
# NOTE(review): pd.concat(axis=1) aligns rows on the index -- this presumably
# relies on every frame and `labels` sharing the same 0..n-1 index; confirm.
print("Concatenating in process")
featured_data = pd.concat([ngram_data, char_ngram_data, tfidf_data, char_tfidf_data, sentiment_data, 
                           linguistic_data, readability_data, labels], axis=1)
print("Finished it!")

In [None]:
# Stratified train/test split: every label group contributes its first 80% of
# rows to the training set and the remainder to the test set.
TRAIN_FRACTION = 0.8
SPLIT_SEED = 42  # fixed seed so the shuffled row order is reproducible
split_rng = np.random.RandomState(SPLIT_SEED)

# Split the data set into three data sets based on the labels.
# The comprehension keeps its loop variables local, so the `labels` Series
# defined earlier is no longer overwritten and the concat cell can be re-run.
# Groups are keyed by str(label), so integer labels 0/1/2 and string labels
# '0'/'1'/'2' both resolve to data_0 / data_1 / data_2.
frames_by_label = {str(label_value): label_frame
                   for label_value, label_frame in featured_data.groupby('labels')}
data_0 = frames_by_label['0']
data_1 = frames_by_label['1']
data_2 = frames_by_label['2']

# Find the 80% cut-line for each data set
cut_0 = round(len(data_0.index) * TRAIN_FRACTION)
cut_1 = round(len(data_1.index) * TRAIN_FRACTION)
cut_2 = round(len(data_2.index) * TRAIN_FRACTION)

# Construct train and test data sets, then shuffle the rows of each
train = pd.concat([data_0.iloc[:cut_0, :], data_1.iloc[:cut_1, :], data_2.iloc[:cut_2, :]])
train = train.reindex(split_rng.permutation(train.index))
test = pd.concat([data_0.iloc[cut_0:, :], data_1.iloc[cut_1:, :], data_2.iloc[cut_2:, :]])
test = test.reindex(split_rng.permutation(test.index))

In [None]:
def getTrainData():
    """Return the notebook-level ``train`` DataFrame (the shuffled training split)."""
    return train
def getTestData():
    """Return the notebook-level ``test`` DataFrame (the shuffled test split)."""
    return test