In [1]:
import re
import numpy as np
import pandas as pd
from gensim import utils
from markdown import markdown
from bs4 import BeautifulSoup
from sklearn.svm import LinearSVC
from sklearn import preprocessing
from sklearn.model_selection import KFold
import gensim.parsing.preprocessing as gsp
from sklearn.feature_extraction import text
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, cross_val_predict
import scipy as sp
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
import random

# Seed both RNGs: the stdlib `random` module and NumPy's global generator.
# pandas' DataFrame.sample (with random_state=None) draws from NumPy's global
# RNG, so seeding only `random` leaves the shuffling unseeded.
random.seed(42)
np.random.seed(42)

In [3]:
# Input CSV files, given relative to the notebook's working directory.
TRAINING_TYPOLOGY_PATH = "training_typology.csv"
TESTING_TYPOLOGY_PATH = "step3_rules.csv"

# Load both files with pandas' default CSV parsing options.
training_df = pd.read_csv(TRAINING_TYPOLOGY_PATH)
testing_df = pd.read_csv(TESTING_TYPOLOGY_PATH)

In [6]:
# Preview the first rows of training_df.
training_df.head()

Unnamed: 0,UID,domain,communityID,lineID,text,lang,ref,timestamp,change_type,codify_stage,...,rule_norm_strategy,position_type,boundary_type,aggregation_type,payoff_type,information_type,communication_type,choice_type,scope_type,textID
0,private_wow_arguswow.com_7.1_NGOS_20200417_201...,private_wow,arguswow.com,7,"To pretend you're a gamemaster, threaten with ...",en,https://web.archive.org/web/20190101183548/for...,20190101,changed_before,wowdiff,...,rule,0,1,0,0,0,0,1,1,private_wow_arguswow.com_7.1_20190101
1,private_wow_arguswow.com_7.1_zaras_20200502_20...,private_wow,arguswow.com,7,"To pretend you're a gamemaster, threaten with ...",en,https://web.archive.org/web/20190101183548/for...,20190101,changed_before,wowdiff,...,norm,0,1,0,0,0,0,1,1,private_wow_arguswow.com_7.1_20190101
2,private_wow_arguswow.com_7.1_PASCALISJ_2020041...,private_wow,arguswow.com,7,"To pretend you're a gamemaster, threaten with ...",en,https://web.archive.org/web/20190101183548/for...,20190101,changed_before,wowdiff,...,rule,0,1,0,0,0,0,1,1,private_wow_arguswow.com_7.1_20190101
3,private_wow_arguswow.com_7.1_STEPHANOA_2020170...,private_wow,arguswow.com,7,"To pretend you're a gamemaster, threaten with ...",en,https://web.archive.org/web/20190101183548/for...,20190101,changed_before,wowdiff,...,rule,0,1,0,0,0,0,1,1,private_wow_arguswow.com_7.1_20190101
4,private_wow_arguswow.com_11.1_NGOS_20200417_20...,private_wow,arguswow.com,11,To trade characters in-game or selling them fo...,en,https://web.archive.org/web/20190101183548/for...,20190101,changed_before,wowdiff,...,rule,0,1,0,0,0,0,1,1,private_wow_arguswow.com_11.1_20190101


## Data preprocessing
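- remove useless characters, whitespace, stopwords
- lowercasing
- stemming
- strip surrounding whitespace from the `rule_norm_strategy`, `reg_const` and `domain` labels (no explicit one-hot encoding is applied in the cells below; the string labels are passed to the classifiers as-is)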

In [4]:
def strip_html_markdown(s):
    """Render Markdown/HTML to plain text, collapse separators and lowercase.

    The input is rendered with ``markdown``, parsed with BeautifulSoup, and all
    text nodes are joined with spaces. Runs of the separator characters
    (space, ``_ < > , . ! | : # *``, newline, ``[ ] ?``) are then replaced by a
    single space, and the result is lowercased and stripped.

    Parameters
    ----------
    s : str or any
        Raw text. Anything that is not a ``str`` (``None``, NaN, numbers, ...)
        is treated as missing.

    Returns
    -------
    str or None
        The normalised text, or ``None`` when ``s`` is not a string.
    """
    # Only strings can be rendered; NaN cells arrive as floats and are skipped.
    if not isinstance(s, str):
        return None

    soup = BeautifulSoup(markdown(s), "html.parser")
    # `find_all(string=True)` is the current spelling of `findAll(text=True)`.
    visible_text = " ".join(soup.find_all(string=True))
    # Raw string: avoids the invalid-escape warnings of "\[", "\]" and "\?".
    tokens = re.split(r"[ _<>,.!|:#*\n\[\]\?]+", visible_text)
    return " ".join(tokens).lower().strip()


def whitespace_removal(df):
    """Strip leading/trailing whitespace from the label and domain columns.

    The columns ``rule_norm_strategy``, ``reg_const`` and ``domain`` are
    overwritten on ``df`` itself, and the same frame is returned.

    Uses the vectorised ``.str.strip()`` so missing values (NaN/None) are left
    as missing instead of raising ``AttributeError``; non-string entries in an
    otherwise string column become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three columns above.

    Returns
    -------
    pandas.DataFrame
        ``df``, modified in place.
    """
    for column in ("rule_norm_strategy", "reg_const", "domain"):
        df[column] = df[column].str.strip()
    return df


# gensim preprocessing steps. clean_text applies them to a string one after
# another in the order listed here, so stop-word removal sees the words before
# stem_text (the last step) has stemmed them.
filters = [
    gsp.strip_tags,
    gsp.strip_punctuation,
    gsp.strip_multiple_whitespaces,
    gsp.strip_numeric,
    gsp.remove_stopwords,
    gsp.strip_short,
    gsp.stem_text,
]


def clean_text(s):
    """Lowercase ``s`` and run it through every function in ``filters``.

    Returns ``None`` when ``s`` is ``None`` or its exact type is ``int`` or
    ``float``; otherwise returns the fully filtered string.
    """
    # Guard clause: missing or numeric cells are not processed.
    if s is None or type(s) in [int, float]:
        return None

    cleaned = utils.to_unicode(s.lower())
    for transform in filters:
        cleaned = transform(cleaned)
    return cleaned


def randomShuffle(training_df, testing_df, random_state=42):
    """Return row-shuffled copies of both frames with a fresh 0..n-1 index.

    Parameters
    ----------
    training_df, testing_df : pandas.DataFrame
        Frames to shuffle; neither input is modified.
    random_state : int, numpy RandomState/Generator or None, default 42
        Forwarded to ``DataFrame.sample`` so the permutation is the same on
        every run. Pass ``None`` to draw from NumPy's global RNG instead.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(shuffled_training_df, shuffled_testing_df)``.
    """
    return (
        training_df.sample(frac=1, random_state=random_state).reset_index(drop=True),
        testing_df.sample(frac=1, random_state=random_state).reset_index(drop=True),
    )

In [7]:
# Columns kept from the training frame. Every active entry ends with a comma so
# that re-enabling one of the commented-out columns below cannot silently
# concatenate two adjacent string literals into a single column name.
col_list = [
    "text",
    "domain",
    "communityID",
    "IS",
    "reg_const",
    "rule_norm_strategy",
#    "position_type",
#    "boundary_type",
#    "aggregation_type",
#    "payoff_type",
#    "information_type",
#    "communication_type",
#    "choice_type",
#    "scope_type",
]

# NOTE(review): only training_df is passed through whitespace_removal and
# restricted to col_list; testing_df is shuffled as loaded — confirm its label
# columns need no stripping.
training_df = whitespace_removal(training_df)

training_df, testing_df = randomShuffle(training_df[col_list], testing_df)

In [8]:
len(training_df), len(testing_df)  # (training rows, testing rows)

(4816, 358026)

In [9]:
# Columns of training_df after the col_list selection.
training_df.columns

Index(['text', 'domain', 'communityID', 'IS', 'reg_const',
       'rule_norm_strategy'],
      dtype='object')

In [10]:
# First five rows of the shuffled training frame.
training_df[0:5]

Unnamed: 0,text,domain,communityID,IS,reg_const,rule_norm_strategy
0,"12, 24, 48 or 72 hours depends on situation.",private_wow,retro-wow.com - wb,0,none,none
1,"- Trading characters and accounts for gold, it...",private_wow,wargate-project.org,0,none,none
2,You may submit multiple images at once in an a...,reddit,analog,1,regulatory,norm
3,You may post your own at your own risk.,reddit,gaybros,1,regulatory,norm
4,No explanation needed.,reddit,mechanicalkeyboards,1,regulatory,norm


## Data transformation
- generate corpus 
- transform the corpus to a normalized tf-idf representation

In [11]:
def corpusGen(df):
    """Build the cleaned-text corpus for ``df.text``.

    Each entry is passed through ``strip_html_markdown`` and then
    ``clean_text``. Both helpers return ``None`` for missing or non-text
    input; such rows become the empty string rather than the literal text
    ``"None"``/``"nan"``, which would otherwise end up as a spurious token in
    the corpus.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column.

    Returns
    -------
    list of str
        One cleaned document per row, in row order.
    """
    cleaned = df.text.apply(strip_html_markdown).apply(clean_text)
    return [doc if isinstance(doc, str) else "" for doc in cleaned]


# Store the preprocessed text as a new column on both frames.
training_df['clean_text'] = corpusGen(training_df)
testing_df['clean_text'] = corpusGen(testing_df)
# Training rows first, then testing rows; the cleaned and raw lists share this order.
IS_corpus = training_df['clean_text'].tolist() + testing_df['clean_text'].tolist()
IS_source = training_df['text'].tolist() + testing_df['text'].tolist()

In [12]:
# First five rows, now including the clean_text column.
training_df[0:5]

Unnamed: 0,text,domain,communityID,IS,reg_const,rule_norm_strategy,clean_text
0,"12, 24, 48 or 72 hours depends on situation.",private_wow,retro-wow.com - wb,0,none,none,hour depend situat
1,"- Trading characters and accounts for gold, it...",private_wow,wargate-project.org,0,none,none,trade charact account gold item
2,You may submit multiple images at once in an a...,reddit,analog,1,regulatory,norm,submit multipl imag album count singl photo post
3,You may post your own at your own risk.,reddit,gaybros,1,regulatory,norm,post risk
4,No explanation needed.,reddit,mechanicalkeyboards,1,regulatory,norm,explan need


In [13]:
# Word-level TF-IDF over the cleaned text.
vec_word = TfidfVectorizer()
X_vec_word = vec_word.fit_transform(IS_corpus)
# Character 1- to 3-gram TF-IDF over the raw text, with lowercasing disabled.
vec_char = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)
X_vec_char = vec_char.fit_transform(IS_source)
# Column-wise concatenation of both feature blocks into one sparse matrix.
# NOTE(review): both vectorizers are fitted on the training and testing rows
# together, before any cross-validation split, so the vocabulary and IDF
# weights have seen the held-out folds — confirm this is acceptable.
X_IS = sp.sparse.hstack([X_vec_word, X_vec_char])

In [14]:
# (rows, features) of the stacked TF-IDF matrix.
X_IS.shape

(362842, 344999)

In [16]:
def get_precision_recall_f1(l, category, accuracy):
    """Tabulate precision/recall/F1 rows together with a type and accuracy.

    ``l`` holds one ``(precision, recall, f1, support)`` tuple per row. The
    support value is discarded, and ``category`` and ``accuracy`` are placed
    in front as the ``Type`` and ``Accuracy`` columns.

    Returns a DataFrame with the columns
    ``Type, Accuracy, Precision, Recall, F1 Score``.
    """
    metric_columns = ["Precision", "Recall", "F1 Score", "Support"]
    scores = pd.DataFrame(l, columns=metric_columns).drop(columns="Support")
    scores.insert(0, "Type", category)
    scores.insert(1, "Accuracy", accuracy)
    return scores

## IS Detector
- train clf_IS
- predict IS column on texts from wow, minecraft and reddit

In [17]:
l_IS = []
acc_IS = []

# 10-fold split, shuffled with a fixed seed so the folds are repeatable.
kfold = KFold(n_splits=10, shuffle = True, random_state=42)
# Labels ordered training rows first, then testing rows, matching how IS_corpus is assembled.
y_IS = training_df.IS.values.tolist() + testing_df.IS.values.tolist()
clf_IS = OneVsRestClassifier(LinearSVC())

# `accuracy` is the mean of the per-fold scores (presumably accuracy, the
# estimator's default score — confirm); the predictions below are out-of-fold.
# NOTE(review): cross_val_score and cross_val_predict each run the full
# 10-fold fit, so every fold is trained twice.
accuracy = cross_val_score(clf_IS, X_IS, y_IS, cv=kfold).mean()
y_IS_pred = cross_val_predict(clf_IS, X_IS, y_IS, cv=kfold)

acc_IS.append(accuracy)
# Precision/recall/F1 over the pooled out-of-fold predictions, weighted averaging.
l_IS.append(precision_recall_fscore_support(y_IS, y_IS_pred, average='weighted'))
IS_scores = get_precision_recall_f1(l_IS, "IS", acc_IS)
IS_scores

Unnamed: 0,Type,Accuracy,Precision,Recall,F1 Score
0,IS,0.913421,0.911502,0.913423,0.911844


In [12]:
# word_wow = vec_word.transform(wow_unlabeled.clean_text)
# word_mc = vec_word.transform(minecraft_unlabeled.clean_text)
# word_red = vec_word.transform(reddit_unlabeled.clean_text)

# char_wow = vec_char.transform(wow_unlabeled.text)
# char_mc = vec_char.transform(minecraft_unlabeled.text)
# char_red = vec_char.transform(reddit_unlabeled.text)

# IS_X_wow, IS_X_minecraft, IS_X_reddit = (
#     sp.sparse.hstack([word_wow, char_wow]),
#     sp.sparse.hstack([word_mc, char_mc]),
#     sp.sparse.hstack([word_red, char_red])
# )

# clf_IS.fit(X_IS, y_IS)
# wow_unlabeled["IS"] = clf_IS.predict(IS_X_wow)
# minecraft_unlabeled["IS"] = clf_IS.predict(IS_X_minecraft)
# reddit_unlabeled["IS"] = clf_IS.predict(IS_X_reddit)

In [16]:
# # predict on the coded texts
# coded_unlabeled = training_df.append(testing_df)
# coded_unlabeled = coded_unlabeled[["text", "communityID", "domain", "clean_text"]]
# coded_unlabeled["IS"] = clf_IS.predict(X_IS)

## Rule Typology Detector
- filter out texts with IS = 0
- train on institutional texts 

In [18]:
def domain_selection(train, test, domain_option=None):
    """Keep the rows usable for rule-typology training.

    A row is kept when ``IS == 1`` and neither ``reg_const`` nor
    ``rule_norm_strategy`` equals ``'none'``. When ``domain_option`` is given,
    rows are additionally restricted to that value of ``domain``.

    Returns ``(filtered_train, filtered_test)``; the original row indices are
    preserved.
    """

    def _select(frame):
        # Single boolean mask instead of successive filtered copies.
        keep = (
            (frame["IS"] == 1)
            & (frame["reg_const"] != "none")
            & (frame["rule_norm_strategy"] != "none")
        )
        if domain_option is not None:
            keep = keep & (frame["domain"] == domain_option)
        return frame[keep]

    return _select(train), _select(test)

    
def gen_X(train, test):
    """Build the stacked word + character TF-IDF matrix for the rule texts.

    Rows are ordered ``train`` first, then ``test``. The word features come
    from the ``clean_text`` column and the character features from ``text``.

    Side effect: the module-level ``vec_word`` and ``vec_char`` are refitted
    (``fit_transform``) on these texts, replacing whatever they were fitted on
    before.
    """
    frames = (train, test)
    rules_corpus = [doc for frame in frames for doc in frame['clean_text'].tolist()]
    rules_source = [doc for frame in frames for doc in frame['text'].tolist()]

    # Word vectorizer is fitted first, then the character vectorizer.
    return sp.sparse.hstack(
        [vec_word.fit_transform(rules_corpus), vec_char.fit_transform(rules_source)]
    )


# Shared estimator for the rule-typology labels; used by get_scores and label_data.
clf_rules = OneVsRestClassifier(LinearSVC())

def get_scores(train, test, X_rules, IS_detector_scores):
    """Cross-validate ``clf_rules`` on each label column and tabulate the scores.

    For every column in ``categories`` the labels of ``train`` and ``test``
    are concatenated (train first) and evaluated against ``X_rules`` with the
    module-level ``kfold`` splitter: the mean fold score from
    ``cross_val_score`` and the weighted precision/recall/F1 of the
    ``cross_val_predict`` output. The rows are appended below
    ``IS_detector_scores`` and the combined frame is returned.
    """
    # Other label columns, currently not evaluated:
    #    'aggregation_type', 'boundary_type', 'choice_type',
    #    'communication_type', 'information_type', 'payoff_type',
    #    'position_type', 'scope_type'
    categories = ['reg_const', 'rule_norm_strategy']

    fold_accuracies = []
    prf_rows = []
    for label_col in categories:
        targets = train[label_col].values.tolist() + test[label_col].values.tolist()

        fold_accuracies.append(
            cross_val_score(clf_rules, X_rules, targets, cv=kfold).mean()
        )
        predicted = cross_val_predict(clf_rules, X_rules, targets, cv=kfold)
        prf_rows.append(
            precision_recall_fscore_support(targets, predicted, average='weighted')
        )

    rules_scores = get_precision_recall_f1(prf_rows, categories, fold_accuracies)
    return pd.concat([IS_detector_scores, rules_scores])

In [51]:
def label_data(train, test, X_rules, domain):
    """Fit ``clf_rules`` per label column, label the unlabeled sets and save them.

    For each column in ``categories`` the module-level ``clf_rules`` is fitted
    on ``X_rules`` against the concatenated ``train``/``test`` labels (train
    first) and used to predict that column for the four unlabeled frames.
    Each unlabeled frame is then appended below its labeled counterpart and
    written to ``"<prefix>_" + domain`` without the index.

    NOTE(review): this function reads module-level names that must exist
    before it is called — ``wow_unlabeled_IS``, ``minecraft_unlabeled_IS``,
    ``reddit_unlabeled_IS``, ``coded_unlabeled_IS``, ``X_wow``,
    ``X_minecraft``, ``X_reddit``, ``X_coded``, ``wow``, ``minecraft``,
    ``reddit`` and ``coded``. Confirm they are defined, and that the ``X_*``
    matrices were produced by the same fitted vectorizers as ``X_rules``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames holding the label columns listed in ``categories``.
    X_rules : sparse matrix
        Features row-aligned with ``train`` followed by ``test``.
    domain : str
        Suffix appended to the output file names.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(wow_labeled, minecraft_labeled, reddit_labeled, coded_labeled)``.
    """
    # Other label columns, currently not predicted:
    #    'aggregation_type', 'boundary_type', 'choice_type',
    #    'communication_type', 'information_type', 'payoff_type',
    #    'position_type', 'scope_type'
    categories = ['reg_const', 'rule_norm_strategy']
    for c in categories:
        y_rules = train[c].values.tolist() + test[c].values.tolist()
        clf_rules.fit(X_rules, y_rules)
        wow_unlabeled_IS[c] = clf_rules.predict(X_wow)
        minecraft_unlabeled_IS[c] = clf_rules.predict(X_minecraft)
        reddit_unlabeled_IS[c] = clf_rules.predict(X_reddit)
        coded_unlabeled_IS[c] = clf_rules.predict(X_coded)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    wow_labeled = pd.concat([wow, wow_unlabeled_IS])
    minecraft_labeled = pd.concat([minecraft, minecraft_unlabeled_IS])
    reddit_labeled = pd.concat([reddit, reddit_unlabeled_IS])
    coded_labeled = pd.concat([coded, coded_unlabeled_IS])

    wow_labeled.to_csv("wow_" + domain, index=False)
    minecraft_labeled.to_csv("minecraft_" + domain, index=False)
    reddit_labeled.to_csv("reddit_" + domain, index=False)
    coded_labeled.to_csv("coded_" + domain, index=False)

    return wow_labeled, minecraft_labeled, reddit_labeled, coded_labeled

In [20]:
# NOTE(review): domain_option is left at its default (None), so despite the
# `reddit_` prefix these frames are not restricted to the reddit domain.
reddit_train_df, reddit_test_df = domain_selection(training_df, testing_df)
reddit_X_rules = gen_X(reddit_train_df, reddit_test_df)
get_scores(reddit_train_df, reddit_test_df, reddit_X_rules, IS_scores) 

Unnamed: 0,Type,Accuracy,Precision,Recall,F1 Score
0,IS,0.913421,0.911502,0.913423,0.911844
0,reg_const,0.905415,0.897396,0.905416,0.897961
1,rule_norm_strategy,0.861966,0.848866,0.861964,0.851678
