ISSUE: My dataset doesn't have language included, and has entries in non-English langauges. The training/testing data is only in English.

In [1]:
import re
import numpy as np
import pandas as pd
from gensim import utils
from markdown import markdown
from bs4 import BeautifulSoup
from sklearn.svm import LinearSVC
from sklearn import preprocessing
from sklearn.model_selection import KFold
import gensim.parsing.preprocessing as gsp
from sklearn.feature_extraction import text
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, cross_val_predict
import scipy as sp
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
import random
random.seed(42)

In [2]:
TRAINING_TYPOLOGY_PATH = "training_typology.csv"
TESTING_TYPOLOGY_PATH = "testing_typology.csv"

training_df = pd.read_csv(TRAINING_TYPOLOGY_PATH)
testing_df = pd.read_csv(TESTING_TYPOLOGY_PATH)

In [3]:
training_df = training_df[training_df.domain == 'reddit']
testing_df = testing_df[testing_df.domain == 'reddit']

In [4]:
training_df.head()

Unnamed: 0,UID,domain,communityID,lineID,text,lang,ref,timestamp,change_type,codify_stage,...,rule_norm_strategy,position_type,boundary_type,aggregation_type,payoff_type,information_type,communication_type,choice_type,scope_type,textID
794,reddit_misanthropy_2.103_NGOS_20200608_20180719,reddit,misanthropy,2,English only.,en,https://www.reddit.com/r/misanthropy/,20180719,unchanged,typol_redd_diff_20WQ,...,norm,0,0,0,0,0,0,0,0,reddit_misanthropy_2.103_20180719
795,reddit_misanthropy_2.103_zaras_20200607_20180719,reddit,misanthropy,2,English only.,en,https://www.reddit.com/r/misanthropy/,20180719,unchanged,typol_redd_diff_20WQ,...,norm,0,0,0,0,0,0,0,0,reddit_misanthropy_2.103_20180719
796,reddit_misanthropy_2.103_PASCALISJ_20200603_20...,reddit,misanthropy,2,English only.,en,https://www.reddit.com/r/misanthropy/,20180719,unchanged,typol_redd_diff_20WQ,...,norm,0,0,0,0,0,1,1,0,reddit_misanthropy_2.103_20180719
797,reddit_misanthropy_2.103_DHANOAB_20200607_2018...,reddit,misanthropy,2,English only.,en,https://www.reddit.com/r/misanthropy/,20180719,unchanged,typol_redd_diff_20WQ,...,norm,0,0,0,0,0,0,0,0,reddit_misanthropy_2.103_20180719
798,reddit_misanthropy_3.92_NGOS_20200608_20180719,reddit,misanthropy,3,All humans deserve criticism.. We don't play ...,en,https://www.reddit.com/r/misanthropy/,20180719,unchanged,typol_redd_diff_20WQ,...,strategy,0,0,0,0,0,0,0,0,reddit_misanthropy_3.92_20180719


In [5]:
set(training_df.reg_const)

{'constitutive', 'none', 'regulatory'}

In [6]:
set(training_df.lang)

{'en'}

## Data preprocessing
- remove useless characters, whitespace, stopwords  
- lowercasing 
- stemming 
- apply one hot encoding on rule_norm_strategy and reg_const

In [7]:
def strip_html_markdown(s):
    if type(s) not in [int, float] and s is not None:
        return (
            " ".join(
                re.split(
                    "[ _<>,.!|:#*\n\[\]\?]+",
                    " ".join(
                        BeautifulSoup(markdown(s), "html.parser").findAll(text=True)
                    ),
                )
            )
            .lower()
            .strip()
        )


def whitespace_removal(df):
    df.rule_norm_strategy = df.rule_norm_strategy.apply(lambda x: x.strip())
    df.reg_const = df.reg_const.apply(lambda x: x.strip())
    df.domain = df.domain.apply(lambda x: x.strip())
    return df


filters = [
    gsp.strip_tags,
    gsp.strip_punctuation,
    gsp.strip_multiple_whitespaces,
    gsp.strip_numeric,
    gsp.remove_stopwords,
    gsp.strip_short,
    gsp.stem_text,
]


def clean_text(s):
    if type(s) not in [int, float] and s is not None:
        s = s.lower()
        s = utils.to_unicode(s)
        for f in filters:
            s = f(s)
        return s


def randomShuffle(training_df, testing_df):
    return (
        training_df.sample(frac=1).reset_index(drop=True),
        testing_df.sample(frac=1).reset_index(drop=True),
    )

In [8]:
col_list = [
    "text",
    "communityID",
    "IS",
    "reg_const",
    "rule_norm_strategy"
]

training_df, testing_df = (
    whitespace_removal(training_df),
    whitespace_removal(testing_df),
)

training_df, testing_df = randomShuffle(training_df[col_list], testing_df[col_list])

In [9]:
len(training_df), len(testing_df)  # 75%/25%

(3975, 1339)

In [10]:
training_df.columns

Index(['text', 'communityID', 'IS', 'reg_const', 'rule_norm_strategy'], dtype='object')

In [11]:
training_df[0:5]

Unnamed: 0,text,communityID,IS,reg_const,rule_norm_strategy
0,This also applies to YouTube channels of low-e...,usmc,1,constitutive,norm
1,Please refer to the following before posting.,vainglorygame,1,regulatory,norm
2,"If you post something in Arabic, or any langua...",izlam,1,regulatory,norm
3,Posts should be intended to invite and drive d...,sexover30,1,regulatory,norm
4,No racism or hate speech.. Racism and hate sp...,entertainment,1,regulatory,norm


## Data transformation
- generate corpus 
- transform the corpus to a normalized tf-idf representation

In [12]:
def corpusGen(df):
    return (
        df.text.apply(strip_html_markdown)
        .apply(lambda x: clean_text(x))
        .astype(str)
        .tolist()
    )


training_df['clean_text'] = corpusGen(training_df)
testing_df['clean_text'] = corpusGen(testing_df)
IS_corpus = training_df['clean_text'].tolist() + testing_df['clean_text'].tolist()
IS_source = training_df['text'].tolist() + testing_df['text'].tolist()

In [13]:
training_df[0:5]

Unnamed: 0,text,communityID,IS,reg_const,rule_norm_strategy,clean_text
0,This also applies to YouTube channels of low-e...,usmc,1,constitutive,norm,appli youtub channel low effort content
1,Please refer to the following before posting.,vainglorygame,1,regulatory,norm,refer follow post
2,"If you post something in Arabic, or any langua...",izlam,1,regulatory,norm,post arab languag translat english
3,Posts should be intended to invite and drive d...,sexover30,1,regulatory,norm,post intend invit drive discuss
4,No racism or hate speech.. Racism and hate sp...,entertainment,1,regulatory,norm,racism hate speech racism hate speech allow


In [14]:
vec_word = TfidfVectorizer()
X_vec_word = vec_word.fit_transform(IS_corpus)
vec_char = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)
X_vec_char = vec_char.fit_transform(IS_source)
X_IS = sp.sparse.hstack([X_vec_word, X_vec_char])

In [15]:
X_IS.shape

(5314, 18807)

In [16]:
def get_precision_recall_f1(l, category, accuracy):
    precision_recall_df = pd.DataFrame(
        l, columns=["Precision", "Recall", "F1 Score", "Support"]
    )
    precision_recall_df.drop("Support", axis=1, inplace=True)
    precision_recall_df.insert(0, "Type", category)
    precision_recall_df.insert(1, "Accuracy", accuracy)
    return precision_recall_df

## IS Detector
- train clf_IS
- predict IS column on texts from wow, minecraft and reddit

In [17]:
l_IS = []
acc_IS = []

kfold = KFold(n_splits=10, shuffle = True, random_state=42)
y_IS = training_df.IS.values.tolist() + testing_df.IS.values.tolist()
clf_IS = OneVsRestClassifier(LinearSVC())

accuracy = cross_val_score(clf_IS, X_IS, y_IS, cv=kfold).mean()
y_IS_pred = cross_val_predict(clf_IS, X_IS, y_IS, cv=kfold)

acc_IS.append(accuracy)
l_IS.append(precision_recall_fscore_support(y_IS, y_IS_pred, average='weighted'))
IS_scores = get_precision_recall_f1(l_IS, "IS", acc_IS)
IS_scores

Unnamed: 0,Type,Accuracy,Precision,Recall,F1 Score
0,IS,0.908173,0.906395,0.908167,0.906693


## Rule Typology Detector
- filter out texts with IS = 0
- train on institutional texts 

In [18]:
def domain_selection(train, test):
    
    IS_train_df = train[(train["IS"] == 1)]
    IS_test_df = test[(test['IS'] == 1)]
    
    IS_train_df = IS_train_df[(IS_train_df['reg_const'] != 'none') & (IS_train_df['rule_norm_strategy'] != 'none')]
    IS_test_df = IS_test_df[(IS_test_df['reg_const'] != 'none') & (IS_test_df['rule_norm_strategy'] != 'none')]

    return IS_train_df, IS_test_df
        
    
def gen_X(train,test):
    
    rules_corpus = train['clean_text'].tolist() + test['clean_text'].tolist()
    rules_source = train['text'].tolist() + test['text'].tolist()
    
    word_rules = vec_word.fit_transform(rules_corpus)
    char_rules = vec_char.fit_transform(rules_source)
    X_rules = sp.sparse.hstack([word_rules, char_rules])

    return X_rules


clf_rules = OneVsRestClassifier(LinearSVC())

def get_scores(train, test, X_rules, IS_detector_scores):    
    categories = ['reg_const','rule_norm_strategy']
    l_rules = []
    acc_rules = []
    for c in categories:
        y_rules = train[c].values.tolist() + test[c].values.tolist()

        accuracy = cross_val_score(clf_rules, X_rules, y_rules, cv=kfold).mean()
        y_pred = cross_val_predict(clf_rules, X_rules, y_rules, cv=kfold)
        
        acc_rules.append(accuracy)
        l_rules.append(precision_recall_fscore_support(y_rules, y_pred, average='weighted'))
        
    precision_recall_df = get_precision_recall_f1(l_rules, categories, acc_rules)
    
    return pd.concat([IS_detector_scores, precision_recall_df])

In [19]:
# def label_data(train, test, X_rules, domain):
#     categories = ['reg_const','rule_norm_strategy']
#     for c in categories:
#         y_rules = train[c].values.tolist() + test[c].values.tolist()
#         clf_rules.fit(X_rules, y_rules)
#         wow_unlabeled_IS[c] = clf_rules.predict(X_wow)
#         minecraft_unlabeled_IS[c] = clf_rules.predict(X_minecraft)
#         reddit_unlabeled_IS[c] = clf_rules.predict(X_reddit)
#         coded_unlabeled_IS[c] = clf_rules.predict(X_coded)
        
#     wow_labeled = wow.append(wow_unlabeled_IS)
#     minecraft_labeled = minecraft.append(minecraft_unlabeled_IS)
#     reddit_labeled = reddit.append(reddit_unlabeled_IS)
#     coded_labeled = coded.append(coded_unlabeled_IS)
        
#     wow_labeled.to_csv("wow_" + domain, index=False)
#     minecraft_labeled.to_csv("minecraft_" + domain, index=False)
#     reddit_labeled.to_csv("reddit_" + domain, index=False)
#     coded_labeled.to_csv("coded_" + domain, index=False)
    
#     return wow_labeled, minecraft_labeled, reddit_labeled, coded_labeled

In [20]:
reddit_train_df, reddit_test_df = domain_selection(training_df, testing_df)
reddit_X_rules = gen_X(reddit_train_df, reddit_test_df)
get_scores(reddit_train_df, reddit_test_df, reddit_X_rules, IS_scores) 

Unnamed: 0,Type,Accuracy,Precision,Recall,F1 Score
0,IS,0.908173,0.906395,0.908167,0.906693
0,reg_const,0.902036,0.892103,0.90203,0.893551
1,rule_norm_strategy,0.894264,0.881063,0.894262,0.881858


## My Data Preprocessing

In [29]:
my_data = pd.read_csv("step3_rules.csv")

#split sentences
my_data['text'] = my_data.text.str.split(".")
my_data = my_data.explode('text')
my_data = my_data[my_data.text != '']

#get rid of unnecessary stuff
my_data.reset_index(inplace=True)
my_data.drop(columns=['lineID', 'domain', 'ref', 'source', 'index'], inplace=True)
my_data.dropna(subset='text', inplace=True)

In [30]:
my_data

Unnamed: 0,change_type,before/after,timestamp,timestamp_rule,communityID,ruleID,text
0,unchanged,unchanged,April 23,20200801,weeklydiscoveries,weeklydiscoveries_0,Making others feel unsafe or disrespected will...
1,unchanged,unchanged,April 23,20200801,weeklydiscoveries,weeklydiscoveries_1,There are many other music subreddits for song...
2,unchanged,unchanged,April 23,20200801,weeklydiscoveries,weeklydiscoveries_2,Post format: Artist - Title [Genre] Additional...
3,unchanged,unchanged,April 23,20200801,weeklydiscoveries,weeklydiscoveries_3,"If a music video contains flashing lights, hea..."
4,unchanged,unchanged,April 23,20170501,elbowsafespace,elbowsafespace_0,No harassment of any users for any reason
...,...,...,...,...,...,...,...
741541,unchanged,unchanged,April 23,20210101,improvemyfiverrgig,improvemyfiverrgig_0,This subreddit is for requesting and giving ad...
741542,unchanged,unchanged,April 23,20180901,sempiternafloreat,sempiternafloreat_0,All posts should at least be loosely related t...
741543,unchanged,unchanged,April 23,20180901,sempiternafloreat,sempiternafloreat_1,"NSFW posts should be tagged as such, don't wan..."
741544,unchanged,unchanged,April 23,20180901,sempiternafloreat,sempiternafloreat_2,Try to keep comments and or posts somewhat civ...


In [31]:
my_data['clean_text'] = corpusGen(my_data)
word_my_data = vec_word.transform(my_data.clean_text)
char_my_data = vec_char.transform(my_data.text)
X_my_data = sp.sparse.hstack([word_my_data, char_my_data]) 

## My data labelling

In [32]:
categories = ['reg_const','rule_norm_strategy']
for c in categories:
    y_rules = reddit_train_df[c].values.tolist() + reddit_test_df[c].values.tolist()
    clf_rules.fit(reddit_X_rules, y_rules)
    my_data[c + "_labels"] = clf_rules.predict(X_my_data)

In [33]:
my_data

Unnamed: 0,change_type,before/after,timestamp,timestamp_rule,communityID,ruleID,text,clean_text,reg_const_labels,rule_norm_strategy_labels
0,unchanged,unchanged,April 23,20200801,weeklydiscoveries,weeklydiscoveries_0,Making others feel unsafe or disrespected will...,make feel unsaf disrespect toler,regulatory,norm
1,unchanged,unchanged,April 23,20200801,weeklydiscoveries,weeklydiscoveries_1,There are many other music subreddits for song...,music subreddit song wai,regulatory,norm
2,unchanged,unchanged,April 23,20200801,weeklydiscoveries,weeklydiscoveries_2,Post format: Artist - Title [Genre] Additional...,post format artist titl genr addit inform option,regulatory,norm
3,unchanged,unchanged,April 23,20200801,weeklydiscoveries,weeklydiscoveries_3,"If a music video contains flashing lights, hea...",music video contain flash light heavi eyestrai...,regulatory,norm
4,unchanged,unchanged,April 23,20170501,elbowsafespace,elbowsafespace_0,No harassment of any users for any reason,harass user reason,regulatory,norm
...,...,...,...,...,...,...,...,...,...,...
741541,unchanged,unchanged,April 23,20210101,improvemyfiverrgig,improvemyfiverrgig_0,This subreddit is for requesting and giving ad...,subreddit request give advic improv fiverr gig...,regulatory,norm
741542,unchanged,unchanged,April 23,20180901,sempiternafloreat,sempiternafloreat_0,All posts should at least be loosely related t...,post loos relat mg maidston school,regulatory,norm
741543,unchanged,unchanged,April 23,20180901,sempiternafloreat,sempiternafloreat_1,"NSFW posts should be tagged as such, don't wan...",nsfw post tag want scar yr life,regulatory,norm
741544,unchanged,unchanged,April 23,20180901,sempiternafloreat,sempiternafloreat_2,Try to keep comments and or posts somewhat civ...,try comment post somewhat civil staff call rac...,regulatory,norm


In [34]:
my_data_regulative = my_data[my_data.reg_const_labels == 'regulatory']
my_data_regulative.reset_index(inplace=True)
for i in range(10):
    print(my_data_regulative.text[i])

Making others feel unsafe or disrespected will not be tolerated
There are many other music subreddits for songs you found other ways
Post format: Artist - Title [Genre] Additional Information (Last one is optional)
If a music video contains flashing lights, heavy eyestrain, or other potentially seizure inducing visuals, it has to be flaired
No harassment of any users for any reason
 Moderator discretion of whether or not it was meant as a joke, but just use common sense please
 
Off Topic posts can be removed at moderator discretion, this is an elbow rights Reddit
 Similar posts like knees will be allowed
Things that are not related to Kermitcraft in any way will be deleted from the subreddit


In [38]:
my_data_const = my_data[my_data.reg_const_labels == 'constitutive']
my_data_const.reset_index(inplace=True)
for i in range(20):
    print(my_data_const.text[i])

 It isn't about cars
They aren't funny, so don't make them seem like they are
 This rule also applies to bloated images, or memes that are only 'funny' because the liquify tool was used on them
 Any video uploaded less than a month ago counts as a spoiler
spamming is not allowed, it prevents users from enjoying the full experience of this glorious subreddit
There are some rules
There is only 1 rule: Hate Infinite Warfare
You are not allowed to post anything that isn't connected to either MIA or the Italian language and culture
This is an SFW subreddit
 This sub is for the brothers by the brothers
The creators of this class subreddit
 This also includes low-effort reposts
 This includes both posts and comments
 Thanks 
There's no real point to meme posts here,  they aren't really conductive in this scenario
 This includes 'Izfan/Isfan MUST see this', 'If you dont upvote this you're
This is a Shipping Lanes subreddit
This is a place for humans of ALL ages
 That includes minors
 nobody wa

In [37]:
my_data.to_csv("rule_descriptions_classified.csv", index=False)