In [None]:
import pandas as pd
import numpy as np
import pickle
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import nltk
from nltk.stem.cistem import Cistem
import string
import re
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
from textstat.textstat import *
import matplotlib.pyplot as plt
from imblearn.under_sampling import (ClusterCentroids, RandomUnderSampler,
                                     NearMiss,
                                     InstanceHardnessThreshold)
import seaborn
from textblob_de import TextBlobDE as TextBlob
import warnings
import h2o
from h2o.automl import H2OAutoML
import random 
import os
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
%matplotlib inline

In [None]:
# Raw data
# The corpus directory can be overridden through the GERMEVAL_DATA_DIR
# environment variable; the default is the location used originally.
data_dir = os.environ.get("GERMEVAL_DATA_DIR", "/home/mackenzie/Downloads")
column_names = ['tweet', 'coarse', 'labels']
train_data = pd.read_csv(os.path.join(data_dir, "GermanTrainingData.txt"), sep='\t', names=column_names)
test_data = pd.read_csv(os.path.join(data_dir, "GermanTestingData.txt"), sep='\t', names=column_names)
# Both files are pooled into one frame; a new train/test split is drawn later on.
df = pd.concat([train_data, test_data], ignore_index=True)
del train_data
del test_data

In [None]:
# Tweet text column of the combined frame; the tf-idf matrix and the
# hand-crafted features below are both computed from it.
tweets=df.tweet

# Transliteration table: German umlauts and sharp s -> ASCII spelling
_UMLAUT_TABLE = str.maketrans({
    'ä': 'ae', 'ö': 'oe', 'ü': 'ue',
    'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue',
    'ß': 'ss',
})

def convert_umlauts(text):
    """Return `text` with ä/ö/ü/Ä/Ö/Ü/ß spelled out as ae/oe/ue/Ae/Oe/Ue/ss.

    All other characters are passed through unchanged.
    """
    return text.translate(_UMLAUT_TABLE)

# Transliterate every tweet in one vectorised pass. Rebinding `tweets` to the
# mapped Series replaces the element-wise `.iloc` writes into a column view of
# `df`, which were slow and depended on pandas' chained-assignment /
# copy-on-write behaviour. Only `tweets` is read by the later cells.
tweets = tweets.map(convert_umlauts)

In [None]:
# NLTK's German stop words plus their ASCII transliterations: the tweets have
# been passed through convert_umlauts(), so a stop word such as "für" shows up
# as "fuer" in the text and would otherwise never be filtered.
# dict.fromkeys() removes duplicates while keeping the order.
german_stopwords = nltk.corpus.stopwords.words("german")
stopwords = list(dict.fromkeys(
    german_stopwords + [convert_umlauts(word) for word in german_stopwords]
))

# Corpus-specific tokens to drop (presumably the line-break marker and a
# quotation mark found in the raw tweets -- confirm against the data).
other_exclusions = ["lbr", "|lbr|", "»"]
stopwords.extend(other_exclusions)

# German stemmer used by tokenize()
stemmer = Cistem()

def preprocess(text_string):
    """
    Clean a raw tweet before it is tokenised.

    Steps, in this order:
    1) every run of whitespace is collapsed to a single space
    2) urls are deleted
    3) @mentions are deleted

    Urls and mentions are removed outright (no placeholder token is left
    behind). Because the deletions happen after whitespace was collapsed,
    a removed url/mention can leave two neighbouring spaces in the result.

    Parameters:
        text_string (str): raw tweet text.

    Returns:
        str: the cleaned text.
    """
    # Raw string literals: the backslash sequences used below are invalid
    # escapes in ordinary string literals and trigger warnings on recent Pythons.
    space_pattern = r'\s+'
    giant_url_regex = (r'http[s]?://(?:[a-ßA-ß]|[0-9]|[$-_@.&+]|'
        r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    return parsed_text

def tokenize(tweet):
    """Lowercase `tweet`, keep only its runs of letters and stem each run.

    Parameters:
        tweet (str): (pre-processed) tweet text.

    Returns:
        list[str]: stemmed tokens, in order of appearance.
    """
    # Explicit letter class. The former range `a-ßA-ß` covered the code points
    # 0x41-0xDF, so characters such as [ \ ] ^ _ { | } ~ and Latin-1 symbols
    # like » stayed inside tokens, while ä/ö/ü (which lie above ß) split words.
    words = re.findall("[a-zäöüß]+", tweet.lower())
    tokens = [stemmer.stem(t) for t in words]
    return tokens

def basic_tokenize(tweet):
    """Same as tokenize but without the stemming; . , ! ? are kept as token characters.

    Parameters:
        tweet (str): tweet text.

    Returns:
        list[str]: lowercase tokens made of letters and the punctuation . , ! ?
    """
    # Explicit letter class instead of the range `a-ßA-ß`, which spanned the
    # code points 0x41-0xDF and therefore kept symbols such as | _ » in tokens
    # while treating ä/ö/ü as separators.
    return re.findall("[a-zäöüß.,!?]+", tweet.lower())

# Tf-idf over stemmed word n-grams; the custom preprocessor and tokenizer
# defined above replace scikit-learn's defaults.
vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    preprocessor=preprocess,
    ngram_range=(1, 3),  # unigrams, bigrams and trigrams
    stop_words=stopwords,
    use_idf=True,
    smooth_idf=False,
    norm=None,  # no row normalisation of the tf-idf weights
    decode_error='replace',
    max_features=3000,  # cap on the vocabulary size
    min_df=5,  # ignore terms found in fewer than 5 tweets
    max_df=0.75  # ignore terms found in more than 75% of the tweets
    )

In [None]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
#Construct tfidf matrix and get relevant scores
tfidf = vectorizer.fit_transform(tweets).toarray()
# get_feature_names() no longer exists in current scikit-learn releases; use its
# replacement get_feature_names_out() when available and fall back otherwise.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_terms = vectorizer.get_feature_names_out()
else:
    feature_terms = vectorizer.get_feature_names()
vocab = {term: index for index, term in enumerate(feature_terms)}  # term -> column index
idf_vals = vectorizer.idf_
idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores

In [None]:
#Now get other features
# NOTE(review): this VADER analyzer is not referenced by any cell shown in this
# notebook; other_features() takes its sentiment score from TextBlobDE instead.
sentiment_analyzer = VS()

def count_twitter_objs(text_string):
    """
    Count the urls, mentions and hashtags contained in a tweet.

    Whitespace runs are collapsed first; urls, mentions and hashtags are then
    substituted one kind after the other (in that order) and the number of
    substitutions of each kind is reported. Counting the substitutions,
    rather than searching the final text for the placeholder words, keeps
    the result correct when the tweet itself contains a placeholder word or
    when a later substitution swallows an earlier placeholder.

    Parameters:
        text_string (str): raw tweet text.

    Returns:
        tuple[int, int, int]: (number of urls, number of mentions,
        number of hashtags).
    """
    # Raw string literals avoid invalid-escape warnings for the backslash sequences.
    space_pattern = r'\s+'
    giant_url_regex = (r'http[s]?://(?:[a-ßA-ß]|[0-9]|[$-_@.&+]|'
        r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'
    hashtag_regex = r'#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    # re.subn returns (new_text, number_of_substitutions)
    parsed_text, url_count = re.subn(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text, mention_count = re.subn(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text, hashtag_count = re.subn(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return (url_count, mention_count, hashtag_count)

def other_features(tweet):
    """Compute the hand-crafted (non tf-idf) features of one tweet.

    Parameters:
        tweet (str): raw tweet text.

    Returns:
        list: [FKRA, FRE, syllables, avg_syl, num_chars, num_chars_total,
        num_terms, num_words, num_unique_terms, sentiment score,
        number of hashtags, number of mentions, number of urls].
    """
    # Score the tweet itself: the previous code passed the literal string
    # "tweet", which produced the same sentiment value for every row.
    sentiment = TextBlob(tweet).sentiment

    words = preprocess(tweet) #Get text only

    syllables = textstat.syllable_count(words, lang='de_DE')
    # `words` is a string, so this is the character count of the cleaned text
    num_chars = len(words)
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    # The 0.001 offsets keep the ratio defined when there are no words
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))

    # NOTE(review): the constants below are those of the English Flesch
    # formulas -- confirm they are intended for German text.
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)

    # count_twitter_objs returns (urls, mentions, hashtags); the feature
    # vector lists them as hashtags, mentions, urls.
    twitter_objs = count_twitter_objs(tweet)
    # sentiment[0] is the first component of the TextBlobDE sentiment result
    # (presumably the polarity -- confirm).
    features = [FKRA, FRE, syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms, sentiment[0],twitter_objs[2], twitter_objs[1], twitter_objs[0]]
    return features

def get_feature_array(tweets):
    """Stack the other_features() vectors of all `tweets` into one numpy array.

    Rows follow the iteration order of `tweets`.
    """
    return np.array([other_features(single_tweet) for single_tweet in tweets])

# Changing string labels to numeric labels
def string_to_numeric(x):
    """Translate a label string into its class id.

    'OTHER' and 'PROFANITY' -> 0, 'INSULT' -> 1, 'ABUSE' -> 2.
    Any other value yields None.
    """
    label_groups = (('OTHER', 'PROFANITY'), ('INSULT',), ('ABUSE',))
    for class_id, members in enumerate(label_groups):
        if x in members:
            return class_id
    return None

In [None]:
# Column names for the vectors returned by other_features(), in the same order.
other_features_names = [
    "FKRA",
    "FRE",
    "num_syllables",
    "avg_syl_per_word",
    "num_chars",
    "num_chars_total",
    "num_terms",
    "num_words",
    "num_unique_words",
    "sentiment",
    "num_hashtags",
    "num_mentions",
    "num_urls",
]

In [None]:
# Hand-crafted feature matrix: one row per tweet, one column per entry of other_features().
feats = get_feature_array(tweets)

In [None]:
#Now join them all up
# Column-wise join: the tf-idf columns come first, then the hand-crafted features.
# Both matrices were built by iterating over `tweets`, so their rows line up.
M = np.concatenate([tfidf,feats],axis=1)

In [None]:
M.shape

In [None]:
#Finally get a list of variable names
# vocab maps each term to its tf-idf column index, so ordering the terms by
# that index yields the column names in matrix order.
variables = [term for term, _ in sorted(vocab.items(), key=lambda entry: entry[1])]

feature_names = variables+other_features_names

In [None]:
# Feature matrix as a DataFrame with named columns, plus the numeric class labels.
X = pd.DataFrame(M)
# string_to_numeric returns None for any label it does not know, so `y` can
# contain missing values if the label column holds unexpected entries.
y = df['labels'].apply(string_to_numeric)
X.columns = feature_names

In [None]:
'''
# Univariate feature selection from sklearn
bestfeatures = SelectKBest(score_func=f_classif, k=1000)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

# concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print('Univariate Selection features found, use getUnivariateData() to get the features')
# Extract the top n features
uni_selected_feat = featureScores.nlargest(1000,'Score')
print(uni_selected_feat) # print out the top n features selected

# Saving the top n features to a data frame
#print(a.iloc[0].name) # how to get the column # for the ith feature
#print(a.iloc[0][0]) # how to get the header column
best_features = pd.DataFrame()
for i in range(0, 1000):
    curr_column_vals = X.iloc[:, uni_selected_feat.iloc[i].name]
    curr_column_name = uni_selected_feat.iloc[i][0]
    best_features[curr_column_name] = curr_column_vals

X = pd.DataFrame(best_features)
X.columns = best_features.columns
print(X.columns)
feature_names = best_features.columns
'''

In [None]:
'''
# Feature Selection
model = ExtraTreesClassifier(n_estimators=10) #n_estimators=300
model.fit(X,y)
print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers

#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
top_feat_impt = feat_importances.nlargest(500) 
print(top_feat_impt) # prints out the n best features

# Saving the top n features to a dataframe
list_names = top_feat_impt.axes 
best_features = pd.DataFrame()
#print(X.columns.get_loc(list[0][0])) # how to get the index of the column/name from the feature selected names
for i in range(0, 500):
    curr_column_name = list_names[0][i]
    curr_column_index = X.columns.get_loc(curr_column_name)
    curr_column_vals = X.iloc[:, curr_column_index]
    best_features[curr_column_name] = curr_column_vals

X = pd.DataFrame(best_features)
feature_names = best_features.columns
X.columns = feature_names
'''

In [None]:
print(len(X))
print(len(y))

In [None]:
# Hold out 10% of the rows for evaluation; the fixed random_state makes the split repeatable.
# NOTE(review): no `stratify=` argument is passed, so the class proportions of
# the two parts are left to chance -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.1)

In [None]:
# Wrap the label Series in single-column DataFrames named 'labels'
# (the column name is used as the response column further down).
y_train = y_train.to_frame(name='labels')
y_test = y_test.to_frame(name='labels')

In [None]:
# Random Undersampling from imblearn
print("Random Undersampling")
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)
# Rebuild labelled DataFrames from the resampled output.
# NOTE(review): X_train / y_train are overwritten here, so any sampler cell
# run after this one works on this cell's output, not on the original split.
X_train = pd.DataFrame(X_res)
X_train.columns = feature_names
y_train = pd.DataFrame(y_res)
y_train.columns = ['labels']

In [None]:
# Cluster Centroid undersampling
print("Cluster Centroids")
cc = ClusterCentroids(random_state=2)
X_res, y_res = cc.fit_resample(X_train, y_train)
# Rebuild labelled DataFrames from the resampled output; this overwrites the
# current X_train / y_train (whatever the previously executed cells left there).
X_train = pd.DataFrame(X_res)
X_train.columns = feature_names
y_train = pd.DataFrame()
y_train['labels'] = y_res

In [None]:
# Nearmiss Undersampling
print("Nearmiss")
# NearMiss has no `random_state` parameter in current imbalanced-learn
# releases, and passing one raises a TypeError there.
nm = NearMiss()
X_res, y_res = nm.fit_resample(X_train, y_train)
# Rebuild labelled DataFrames from the resampled output; this overwrites the
# current X_train / y_train.
X_train = pd.DataFrame(X_res)
X_train.columns = feature_names
y_train = pd.DataFrame()
y_train['labels'] = y_res

In [None]:
# Instance Hardness Threshold
print("Instance Harness Threshold")
# scikit-learn only accepts 'balanced', a dict or None for class_weight;
# the former value 'balance' is rejected when the estimator is fitted.
iht = InstanceHardnessThreshold(random_state=0, estimator=LogisticRegression(class_weight='balanced'))
X_res, y_res = iht.fit_resample(X_train, y_train)
# Rebuild labelled DataFrames from the resampled output; this overwrites the
# current X_train / y_train.
X_train = pd.DataFrame(X_res)
X_train.columns = feature_names
y_train = pd.DataFrame()
y_train['labels'] = y_res

In [None]:
h2o.init()

In [None]:
# BUG: when we convert to h2o dataframe an extra row is added
# The four pandas objects are uploaded to the H2O cluster and the names are
# rebound to the resulting H2OFrames (the pandas versions are no longer reachable).
X_train = h2o.H2OFrame(X_train)
y_train = h2o.H2OFrame(y_train)
X_test = h2o.H2OFrame(X_test)
y_test = h2o.H2OFrame(y_test) 

In [None]:
# NOTE(review): only the feature frames are filtered here; y_train / y_test are
# left as they are. If na_omit() drops a row that has a label, the later
# cbind() would pair features with the wrong labels -- verify with the row
# counts printed in the next cell.
X_train = X_train.na_omit() # pandas-->h2o results in an error where a nan row is added to X data, so this deals with that
X_test = X_test.na_omit()

In [None]:
# Double check X and y have same number of rows
print(len(y_train))
print(len(X_train))
print(len(y_test))
print(len(X_test))

In [None]:
X_train.shape

In [None]:
# preparing the train and test data sets
# Append the label column to the feature columns so that each H2OFrame holds
# predictors and response side by side (cbind pairs rows by position).
train = X_train.cbind(y_train)
test = X_test.cbind(y_test)

In [None]:
# more on data prep
x = train.columns         # x: A list/vector of predictor column names or indexes. 
                          # This argument only needs to be specified if the user wants to exclude columns from the 
                          # set of predictors. If all columns (other than the response) should be used in prediction, 
                          # then this does not need to be set.

# NOTE: from here on `y` is the response column NAME; it replaces the label
# Series that was bound to `y` when X and y were built.
y = "labels"              # This argument is the name (or index) of the response column
x.remove(y)

# need to set train and test
# Converting the response to a factor makes H2O treat the task as classification.
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

In [None]:
# now the AUTO-ML piece comes in
# The search is bounded by wall-clock time only (1800 seconds).
# NOTE(review): no `seed` is passed, so repeated runs presumably produce
# different leaderboards -- confirm whether reproducibility is required.
aml = H2OAutoML(max_runtime_secs=1800) #sort_metric=auc, max_runtime_secs=10800, class_sampling_factors = sample_factors, max_models=?, balance_classes=True
aml.train(x=x, y=y, training_frame=train)

In [None]:
# View the AutoML Leaderboard
lb = aml.leaderboard
# NOTE(review): this call is not the last expression of the cell, so its
# result is presumably not displayed by the notebook -- confirm.
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)
# pandas copy of the leaderboard; it is written to modelinfo.csv in the last cell
lb_pd = lb.as_data_frame()

In [None]:
# The leader model is stored here
aml.leader

In [None]:
# predictions!
# Score the held-out frame with the AutoML object.
preds = aml.predict(test)
print(preds)
# Predicted class next to the true label, for a quick visual comparison
var = preds["predict"].cbind(test[y])
print(var)

In [None]:
# Create a new folder to hold statistics for a run
def createFolder(directory):
    """Create `directory`, including missing parent directories.

    An already existing directory is left untouched. The call is best
    effort: an OSError is reported on stdout and not re-raised.

    Parameters:
        directory (str): path of the directory to create.

    Returns:
        None
    """
    try:
        # exist_ok=True replaces the separate os.path.exists() check, which
        # could race with another process creating the same directory.
        os.makedirs(directory, exist_ok=True)
    except OSError:
        print ('Error: Creating directory. ' +  directory)

# create a random folder number to reference the directory
# random.randint(0, 1000) includes both end points, i.e. 1001 possible suffixes.
# NOTE(review): two runs can draw the same number; createFolder() then reuses
# the existing directory and the files written later overwrite the earlier ones.
rand = random.randint(0, 1000)
directory = '/home/mackenzie/workspace/PycharmProjects/DAADRISE_AbusiveLangProject/Results_Aug6/TestRecursiveUndSamp_' + str(rand)
print('The results for this run will be stored in ' + directory)
createFolder(directory)

In [None]:
# metrics and results!
y_test = h2o.as_list(test[y], use_pandas=True)
y_pred = h2o.as_list(preds["predict"])
# Computed once and stored under its own name, so that the `confusion_matrix`
# function imported from sklearn.metrics is not shadowed by a variable.
conf_mat = metrics.confusion_matrix(y_test, y_pred)
print("Confusion Matrix: ")
print(conf_mat)
print("Accuracy Score: ")
print(metrics.accuracy_score(y_test, y_pred))
print("F1 Score: ")
print(metrics.f1_score(y_test, y_pred, average="weighted"))
print("Recall: ")
# Row-normalise the matrix: every row is one true class, so the diagonal holds
# the per-class recall. A class without any true sample keeps an all-zero row
# instead of triggering a division by zero.
row_totals = conf_mat.sum(axis=1, keepdims=True)
matrix_proportions = np.divide(
    conf_mat,
    row_totals,
    out=np.zeros(conf_mat.shape, dtype=float),
    where=row_totals != 0,
)
# The three names correspond to the class ids 0, 1, 2 from string_to_numeric;
# the DataFrame constructor fails if the matrix is not 3x3.
names=['Other','Insult','Abuse']
confusion_df = pd.DataFrame(matrix_proportions, index=names,columns=names)
plt.figure(figsize=(5,5))
seaborn.heatmap(confusion_df,annot=True,annot_kws={"size": 12},cmap='gist_gray_r',cbar=False, square=True,fmt='.2f')
plt.ylabel(r'True categories',fontsize=14)
plt.xlabel(r'Predicted categories',fontsize=14)
plt.tick_params(labelsize=12)
plt.savefig(directory + '/recall.png')

In [None]:
# Saves the classification report info
report = metrics.classification_report(y_test, y_pred)
print(report)
# The context manager closes the file even if one of the metric calls raises.
with open(directory+'/classification_report.txt',"w+") as f:
    f.write(report)
    f.write(str(metrics.confusion_matrix(y_test,y_pred)))
    f.write('\n')
    f.write('accuracy: '+ str(metrics.accuracy_score(y_test, y_pred)))
    f.write('\n')
    # ': ' separator added so the line has the same shape as the accuracy line
    f.write('f1-score: ' + str(metrics.f1_score(y_test, y_pred, average="weighted")))
    f.write('\n')

In [None]:
# Persist the AutoML leaderboard of this run.
# (to_csv returns None when a path is given; the two names are kept only for
# compatibility with the original cell.)
file = lb_pd.to_csv(path_or_buf=directory+'/modelinfo.csv')
# `best_features` is created only by the optional feature-selection cells,
# which are currently disabled (wrapped in string literals). Referencing it
# unconditionally raised a NameError, so export it only when it exists.
featuresSelected = None
if 'best_features' in globals():
    featuresSelected = best_features.to_csv(path_or_buf=directory+'/feature_selection.csv')
else:
    print('No feature selection was run; feature_selection.csv was not written.')