In [1]:
import pandas as pd
import numpy as np
import pickle
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.stem.cistem import Cistem
import string
import re
from textstat.textstat import *
import seaborn
from textblob_de import TextBlobDE as TextBlob
import autosklearn.classification
import sklearn.metrics
import random
import os
import warnings
%matplotlib inline

  self.re = re.compile( self.reString )
  from numpy.core.umath_tests import inner1d


In [2]:
# Raw data
train_data = pd.read_csv("/home/mackenzie/Downloads/GermanTrainingData.txt", sep='\t', names=['tweet', 'coarse', 'labels'])
test_data = pd.read_csv("/home/mackenzie/Downloads/GermanTestingData.txt", sep='\t', names=['tweet', 'coarse', 'labels'])
df = pd.concat([train_data, test_data], ignore_index=True)
del train_data
del test_data

In [3]:
tweets=df.tweet

In [4]:
stopwords=stopwords = nltk.corpus.stopwords.words("german")

other_exclusions = ["lbr"]
stopwords.extend(other_exclusions)

stemmer = Cistem()

def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-ßA-ß]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    return parsed_text

def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens."""
    tweet = " ".join(re.split("[^a-ßA-ß]+", tweet.lower())).strip()
    tokens = [stemmer.stem(t) for t in tweet.split()]
    return tokens

def basic_tokenize(tweet):
    """Same as tokenize but without the stemming"""
    tweet = " ".join(re.split("[^a-ßA-ß.,!?]+", tweet.lower())).strip()
    return tweet.split()

vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    preprocessor=preprocess,
    ngram_range=(1, 3),
    stop_words=stopwords,
    use_idf=True,
    smooth_idf=False,
    norm=None,
    decode_error='replace',
    max_features=3000,
    min_df=5,
    max_df=0.75
    )

In [5]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [6]:
#Construct tfidf matrix and get relevant scores
tfidf = vectorizer.fit_transform(tweets).toarray()
vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}
idf_vals = vectorizer.idf_
idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores

In [7]:
#Get POS tags for tweets and save as a string
tweet_tags = []
for t in tweets:
    tokens = basic_tokenize(preprocess(t))
    tags = nltk.pos_tag(tokens)
    tag_list = [x[1] for x in tags]
    tag_str = " ".join(tag_list)
    tweet_tags.append(tag_str)

In [8]:
#We can use the TFIDF vectorizer to get a token matrix for the POS tags
pos_vectorizer = TfidfVectorizer(
    tokenizer=None,
    lowercase=False,
    preprocessor=None,
    ngram_range=(1, 3),
    stop_words=None,
    use_idf=False,
    smooth_idf=False,
    norm=None,
    decode_error='replace',
    max_features=2000,
    min_df=5,
    max_df=0.75,
    )

In [9]:
#Construct POS TF matrix and get vocab dict
pos = pos_vectorizer.fit_transform(pd.Series(tweet_tags)).toarray()
pos_vocab = {v:i for i, v in enumerate(pos_vectorizer.get_feature_names())}

In [10]:
#Now get other features

def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    
    Returns counts of urls, mentions, and hashtags.
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-ßA-ß]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

def other_features(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    sentiment = TextBlob("tweet").sentiment
    
    words = preprocess(tweet) #Get text only
    
    syllables = textstat.syllable_count(words, lang='de_DE')
    num_chars = sum(len(w) for w in words)
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    twitter_objs = count_twitter_objs(tweet)
    features = [FKRA, FRE,syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms, sentiment[0],twitter_objs[2], twitter_objs[1], twitter_objs[0]]
    #features = pandas.DataFrame(features)
    return features

def get_feature_array(tweets):
    feats=[]
    for t in tweets:
        feats.append(other_features(t))
    return np.array(feats)

# Changing string labels to numeric labels
def string_to_numeric(x):
    if x == 'OTHER' or x == 'PROFANITY':
        return 0
    if x == 'INSULT':
        return 1
    if x == 'ABUSE':
        return 2

In [11]:
other_features_names = ["FKRA", "FRE","num_syllables", "avg_syl_per_word", "num_chars", "num_chars_total", \
                        "num_terms", "num_words", "num_unique_words", "sentiment", "num_hashtags", "num_mentions", "num_urls"]

In [12]:
feats = get_feature_array(tweets)

In [13]:
#Now join them all up
M = np.concatenate([tfidf,pos,feats],axis=1)

In [14]:
M.shape

(8407, 3918)

In [15]:
#Finally get a list of variable names
variables = ['']*len(vocab)
for k,v in vocab.items():
    variables[v] = k

pos_variables = ['']*len(pos_vocab)
for k,v in pos_vocab.items():
    pos_variables[v] = k

feature_names = variables+pos_variables+other_features_names

In [16]:
X = pd.DataFrame(M)
y = df['labels'].apply(string_to_numeric)
X.columns = feature_names

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.1)

In [20]:
y_train = y_train.to_frame(name='labels')
y_test = y_test.to_frame(name='labels')

In [21]:
# can't run with columns more than 5000 or so it seems
print("start automl!")
automl = autosklearn.classification.AutoSklearnClassifier(ml_memory_limit=10000, time_left_for_this_task=3600, per_run_time_limit=360) #ensemble_size=0)
automl.fit(X_train, y_train, dataset_name='english_data') # feat_type=classification is another param we might use?

start automl!


  y = self._check_y(y)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)




  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)




  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
  Y_train_

1
['/tmp/autosklearn_tmp_9844_6004/.auto-sklearn/ensembles/1.0000000000.ensemble', '/tmp/autosklearn_tmp_9844_6004/.auto-sklearn/ensembles/1.0000000001.ensemble', '/tmp/autosklearn_tmp_9844_6004/.auto-sklearn/ensembles/1.0000000002.ensemble', '/tmp/autosklearn_tmp_9844_6004/.auto-sklearn/ensembles/1.0000000003.ensemble', '/tmp/autosklearn_tmp_9844_6004/.auto-sklearn/ensembles/1.0000000004.ensemble', '/tmp/autosklearn_tmp_9844_6004/.auto-sklearn/ensembles/1.0000000005.ensemble', '/tmp/autosklearn_tmp_9844_6004/.auto-sklearn/ensembles/1.0000000006.ensemble', '/tmp/autosklearn_tmp_9844_6004/.auto-sklearn/ensembles/1.0000000007.ensemble', '/tmp/autosklearn_tmp_9844_6004/.auto-sklearn/ensembles/1.0000000008.ensemble', '/tmp/autosklearn_tmp_9844_6004/.auto-sklearn/ensembles/1.0000000009.ensemble', '/tmp/autosklearn_tmp_9844_6004/.auto-sklearn/ensembles/1.0000000010.ensemble', '/tmp/autosklearn_tmp_9844_6004/.auto-sklearn/ensembles/1.0000000011.ensemble', '/tmp/autosklearn_tmp_9844_6004/.auto

AutoSklearnClassifier(delete_output_folder_after_terminate=True,
           delete_tmp_folder_after_terminate=True,
           disable_evaluator_output=False, ensemble_memory_limit=1024,
           ensemble_nbest=50, ensemble_size=50, exclude_estimators=None,
           exclude_preprocessors=None, get_smac_object_callback=None,
           include_estimators=None, include_preprocessors=None,
           initial_configurations_via_metalearning=25, logging_config=None,
           metadata_directory=None, ml_memory_limit=10000, n_jobs=None,
           output_folder=None, per_run_time_limit=360,
           resampling_strategy='holdout',
           resampling_strategy_arguments=None, seed=1, shared_mode=False,
           smac_scenario_args=None, time_left_for_this_task=3600,
           tmp_folder=None)

In [22]:
y_hat = automl.predict(X_test)

In [25]:
print("Accuracy Score:", sklearn.metrics.accuracy_score(y_test, y_hat))

Accuracy Score: 0.7443519619500595


In [26]:
automl.sprint_statistics() 

'auto-sklearn results:\n  Dataset name: english_data\n  Metric: accuracy\n  Best validation score: 0.724870\n  Number of target algorithm runs: 40\n  Number of successful target algorithm runs: 26\n  Number of crashed target algorithm runs: 6\n  Number of target algorithms that exceeded the time limit: 4\n  Number of target algorithms that exceeded the memory limit: 4\n'

In [27]:
print("Confusion Matrix: ")
print(sklearn.metrics.confusion_matrix(y_test, y_hat))

Confusion Matrix: 
[[548   9  30]
 [ 66  18   9]
 [ 96   5  60]]


In [32]:
# Create a new folder to hold statistics for a run
def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print ('Error: Creating directory. ' +  directory)

# create a random folder number to reference the directory
rand = random.randint(0, 1000)*random.randint(0,10)
directory = '/home/mackenzie/workspace/PycharmProjects/DAADRISE_AbusiveLangProject/TestResults/Test_' + str(rand)
print('The results for this run will be stored in ' + directory)
createFolder(directory)

The results for this run will be stored in /home/mackenzie/workspace/PycharmProjects/DAADRISE_AbusiveLangProject/TestResults/Test_0


In [33]:
# Saves the cv results to a csv in the folder specified
results = automl.cv_results_ #A dict with keys as column headers and values as columns, 
                             #that can be imported into a pandas DataFrame

# dict --> pandas dataframe
cv_results = pd.DataFrame.from_dict(results)
print('CV Results partially shown below')
print(cv_results.head())
cv_results.to_csv(directory+'/cv_results.csv')

CV Results partially shown below
   mean_test_score  mean_fit_time  \
0         0.706448      13.814042   
1         0.000000     360.117545   
2         0.000000       4.236160   
3         0.000000       4.051167   
4         0.000000     360.135373   

                                              params  rank_test_scores  \
0  {'balancing:strategy': 'none', 'categorical_en...                 5   
1  {'balancing:strategy': 'none', 'categorical_en...                27   
2  {'balancing:strategy': 'weighting', 'categoric...                27   
3  {'balancing:strategy': 'weighting', 'categoric...                27   
4  {'balancing:strategy': 'weighting', 'categoric...                27   

    status param_balancing:strategy param_categorical_encoding:__choice__  \
0  Success                     none                      one_hot_encoding   
1  Timeout                     none                      one_hot_encoding   
2   Memout                weighting                      one_hot_enc

In [34]:
# Saves the model info to the directory specified
models = automl.get_models_with_weights() # Return a list of the final ensemble found by auto-sklearn.
model_info = pd.DataFrame.from_dict([models[0], models[1], models[2], models[3], models[4], models[5], models[6], models[7]])
                            #models[8], models[9], models[10], models[11]])
model_info.to_csv(directory+'/model_info.csv')
print('Model Results partially shown below')
print(models)

Model Results partially shown below
[(0.26, SimpleClassificationPipeline({'balancing:strategy': 'none', 'categorical_encoding:__choice__': 'one_hot_encoding', 'classifier:__choice__': 'passive_aggressive', 'imputation:strategy': 'mean', 'preprocessor:__choice__': 'liblinear_svc_preprocessor', 'rescaling:__choice__': 'none', 'categorical_encoding:one_hot_encoding:use_minimum_fraction': 'True', 'classifier:passive_aggressive:C': 6.772234112608738, 'classifier:passive_aggressive:average': 'True', 'classifier:passive_aggressive:fit_intercept': 'True', 'classifier:passive_aggressive:loss': 'hinge', 'classifier:passive_aggressive:tol': 0.0005430564287789773, 'preprocessor:liblinear_svc_preprocessor:C': 0.06949344912611048, 'preprocessor:liblinear_svc_preprocessor:dual': 'False', 'preprocessor:liblinear_svc_preprocessor:fit_intercept': 'True', 'preprocessor:liblinear_svc_preprocessor:intercept_scaling': 1, 'preprocessor:liblinear_svc_preprocessor:loss': 'squared_hinge', 'preprocessor:liblinea