In [1]:
import re
import numpy as np
import pandas as pd
from gensim import utils
from markdown import markdown
from bs4 import BeautifulSoup
from sklearn.svm import LinearSVC
from sklearn import preprocessing
from sklearn.model_selection import KFold
import gensim.parsing.preprocessing as gsp
from sklearn.feature_extraction import text
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, cross_val_predict
import scipy as sp
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
import random
random.seed(42)

In [28]:
TRAINING_TYPOLOGY_PATH = "training_typology.csv"
TESTING_TYPOLOGY_PATH = "testing_typology.csv"

training_df = pd.read_csv(TRAINING_TYPOLOGY_PATH)
testing_df = pd.read_csv(TESTING_TYPOLOGY_PATH)
my_data = pd.read_csv("step3_rules.csv")

In [29]:
training_df = training_df[training_df.domain == 'reddit']
testing_df = testing_df[testing_df.domain == 'reddit']

In [30]:
training_df.head()

Unnamed: 0,UID,domain,communityID,lineID,text,lang,ref,timestamp,change_type,codify_stage,...,rule_norm_strategy,position_type,boundary_type,aggregation_type,payoff_type,information_type,communication_type,choice_type,scope_type,textID
794,reddit_misanthropy_2.103_NGOS_20200608_20180719,reddit,misanthropy,2,English only.,en,https://www.reddit.com/r/misanthropy/,20180719,unchanged,typol_redd_diff_20WQ,...,norm,0,0,0,0,0,0,0,0,reddit_misanthropy_2.103_20180719
795,reddit_misanthropy_2.103_zaras_20200607_20180719,reddit,misanthropy,2,English only.,en,https://www.reddit.com/r/misanthropy/,20180719,unchanged,typol_redd_diff_20WQ,...,norm,0,0,0,0,0,0,0,0,reddit_misanthropy_2.103_20180719
796,reddit_misanthropy_2.103_PASCALISJ_20200603_20...,reddit,misanthropy,2,English only.,en,https://www.reddit.com/r/misanthropy/,20180719,unchanged,typol_redd_diff_20WQ,...,norm,0,0,0,0,0,1,1,0,reddit_misanthropy_2.103_20180719
797,reddit_misanthropy_2.103_DHANOAB_20200607_2018...,reddit,misanthropy,2,English only.,en,https://www.reddit.com/r/misanthropy/,20180719,unchanged,typol_redd_diff_20WQ,...,norm,0,0,0,0,0,0,0,0,reddit_misanthropy_2.103_20180719
798,reddit_misanthropy_3.92_NGOS_20200608_20180719,reddit,misanthropy,3,All humans deserve criticism.. We don't play ...,en,https://www.reddit.com/r/misanthropy/,20180719,unchanged,typol_redd_diff_20WQ,...,strategy,0,0,0,0,0,0,0,0,reddit_misanthropy_3.92_20180719


In [32]:
training_df.reset_index(inplace=True)
for i in range(20):
    print(training_df.text[i])

English only.
English only.
English only.
English only.
All humans deserve criticism..  We don't play favorites, this means be objective.
All humans deserve criticism..  We don't play favorites, this means be objective.
All humans deserve criticism..  We don't play favorites, this means be objective.
It is agreed upon that many things are shit and that more importantly, all humans deserve criticism.
It is agreed upon that many things are shit and that more importantly, all humans deserve criticism.
It is agreed upon that many things are shit and that more importantly, all humans deserve criticism.
Your posts should reflect that.
Your posts should reflect that.
Your posts should reflect that.
1) No doxxing, harassment, or bigotry of any kind..
1) No doxxing, harassment, or bigotry of any kind..
1) No doxxing, harassment, or bigotry of any kind..
1) No doxxing, harassment, or bigotry of any kind..
2) Don't participate in or create drama..
2) Don't participate in or create drama..
2) Don'

In [9]:
set(training_df.reg_const)

{'constitutive', 'none', 'regulatory'}

## Data preprocessing
- remove useless characters, whitespace, stopwords  
- lowercasing 
- stemming 
- apply one hot encoding on rule_norm_strategy and reg_const

In [10]:
def strip_html_markdown(s):
    if type(s) not in [int, float] and s is not None:
        return (
            " ".join(
                re.split(
                    "[ _<>,.!|:#*\n\[\]\?]+",
                    " ".join(
                        BeautifulSoup(markdown(s), "html.parser").findAll(text=True)
                    ),
                )
            )
            .lower()
            .strip()
        )


def whitespace_removal(df):
    df.rule_norm_strategy = df.rule_norm_strategy.apply(lambda x: x.strip())
    df.reg_const = df.reg_const.apply(lambda x: x.strip())
    df.domain = df.domain.apply(lambda x: x.strip())
    return df


filters = [
    gsp.strip_tags,
    gsp.strip_punctuation,
    gsp.strip_multiple_whitespaces,
    gsp.strip_numeric,
    gsp.remove_stopwords,
    gsp.strip_short,
    gsp.stem_text,
]


def clean_text(s):
    if type(s) not in [int, float] and s is not None:
        s = s.lower()
        s = utils.to_unicode(s)
        for f in filters:
            s = f(s)
        return s


def randomShuffle(training_df, testing_df):
    return (
        training_df.sample(frac=1).reset_index(drop=True),
        testing_df.sample(frac=1).reset_index(drop=True),
    )

In [33]:
col_list = [
    "text",
    "communityID",
    "IS",
    "reg_const",
    "rule_norm_strategy"
]

training_df, testing_df = (
    whitespace_removal(training_df),
    whitespace_removal(testing_df),
)

training_df, testing_df = randomShuffle(training_df[col_list], testing_df[col_list])

In [12]:
len(training_df), len(testing_df)  # 75%/25%

(3975, 1339)

In [13]:
training_df.columns

Index(['text', 'communityID', 'IS', 'reg_const', 'rule_norm_strategy'], dtype='object')

In [14]:
training_df[0:5]

Unnamed: 0,text,communityID,IS,reg_const,rule_norm_strategy
0,No slut-shaming / kink-shaming..,sexover30,1,regulatory,norm
1,No low-effort image posts.,math,1,regulatory,norm
2,No memes and no compilations of multiple items...,nostalgia,1,regulatory,norm
3,[Please read and follow the Reddit content pol...,delphimurders,1,regulatory,norm
4,Languages.,taiwan,0,none,none


In [34]:
for i in range(20):
    print(training_df.text[i])

English only.
English only.
English only.
English only.
All humans deserve criticism..  We don't play favorites, this means be objective.
All humans deserve criticism..  We don't play favorites, this means be objective.
All humans deserve criticism..  We don't play favorites, this means be objective.
It is agreed upon that many things are shit and that more importantly, all humans deserve criticism.
It is agreed upon that many things are shit and that more importantly, all humans deserve criticism.
It is agreed upon that many things are shit and that more importantly, all humans deserve criticism.
Your posts should reflect that.
Your posts should reflect that.
Your posts should reflect that.
1) No doxxing, harassment, or bigotry of any kind..
1) No doxxing, harassment, or bigotry of any kind..
1) No doxxing, harassment, or bigotry of any kind..
1) No doxxing, harassment, or bigotry of any kind..
2) Don't participate in or create drama..
2) Don't participate in or create drama..
2) Don'

## Data transformation
- generate corpus 
- transform the corpus to a normalized tf-idf representation

In [17]:
def corpusGen(df):
    return (
        df.text.apply(strip_html_markdown)
        .apply(lambda x: clean_text(x))
        .astype(str)
        .tolist()
    )


training_df['clean_text'] = corpusGen(training_df)
testing_df['clean_text'] = corpusGen(testing_df)
IS_corpus = training_df['clean_text'].tolist() + testing_df['clean_text'].tolist()
IS_source = training_df['text'].tolist() + testing_df['text'].tolist()

In [18]:
training_df[0:5]

Unnamed: 0,text,communityID,IS,reg_const,rule_norm_strategy,clean_text
0,No slut-shaming / kink-shaming..,sexover30,1,regulatory,norm,slut shame kink shame
1,No low-effort image posts.,math,1,regulatory,norm,low effort imag post
2,No memes and no compilations of multiple items...,nostalgia,1,regulatory,norm,meme compil multipl item starter pack
3,[Please read and follow the Reddit content pol...,delphimurders,1,regulatory,norm,read follow reddit content polici
4,Languages.,taiwan,0,none,none,languag


In [19]:
vec_word = TfidfVectorizer()
X_vec_word = vec_word.fit_transform(IS_corpus)
vec_char = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)
X_vec_char = vec_char.fit_transform(IS_source)
X_IS = sp.sparse.hstack([X_vec_word, X_vec_char])

In [20]:
X_IS.shape

(5314, 18807)

In [21]:
def get_precision_recall_f1(l, category, accuracy):
    precision_recall_df = pd.DataFrame(
        l, columns=["Precision", "Recall", "F1 Score", "Support"]
    )
    precision_recall_df.drop("Support", axis=1, inplace=True)
    precision_recall_df.insert(0, "Type", category)
    precision_recall_df.insert(1, "Accuracy", accuracy)
    return precision_recall_df

## IS Detector
- train clf_IS
- predict IS column on texts from wow, minecraft and reddit

In [22]:
l_IS = []
acc_IS = []

kfold = KFold(n_splits=10, shuffle = True, random_state=42)
y_IS = training_df.IS.values.tolist() + testing_df.IS.values.tolist()
clf_IS = OneVsRestClassifier(LinearSVC())

accuracy = cross_val_score(clf_IS, X_IS, y_IS, cv=kfold).mean()
y_IS_pred = cross_val_predict(clf_IS, X_IS, y_IS, cv=kfold)

acc_IS.append(accuracy)
l_IS.append(precision_recall_fscore_support(y_IS, y_IS_pred, average='weighted'))
IS_scores = get_precision_recall_f1(l_IS, "IS", acc_IS)
IS_scores

Unnamed: 0,Type,Accuracy,Precision,Recall,F1 Score
0,IS,0.909667,0.907874,0.909673,0.907982


## Rule Typology Detector
- filter out texts with IS = 0
- train on institutional texts 

In [23]:
def domain_selection(train, test):
    
    IS_train_df = train[(train["IS"] == 1)]
    IS_test_df = test[(test['IS'] == 1)]
    
    IS_train_df = IS_train_df[(IS_train_df['reg_const'] != 'none') & (IS_train_df['rule_norm_strategy'] != 'none')]
    IS_test_df = IS_test_df[(IS_test_df['reg_const'] != 'none') & (IS_test_df['rule_norm_strategy'] != 'none')]

    return IS_train_df, IS_test_df
        
    
def gen_X(train,test):
    
    rules_corpus = train['clean_text'].tolist() + test['clean_text'].tolist()
    rules_source = train['text'].tolist() + test['text'].tolist()
    
    word_rules = vec_word.fit_transform(rules_corpus)
    char_rules = vec_char.fit_transform(rules_source)
    X_rules = sp.sparse.hstack([word_rules, char_rules])

    return X_rules


clf_rules = OneVsRestClassifier(LinearSVC())

def get_scores(train, test, X_rules, IS_detector_scores):    
    categories = ['reg_const','rule_norm_strategy']
    l_rules = []
    acc_rules = []
    for c in categories:
        y_rules = train[c].values.tolist() + test[c].values.tolist()

        accuracy = cross_val_score(clf_rules, X_rules, y_rules, cv=kfold).mean()
        y_pred = cross_val_predict(clf_rules, X_rules, y_rules, cv=kfold)
        
        acc_rules.append(accuracy)
        l_rules.append(precision_recall_fscore_support(y_rules, y_pred, average='weighted'))
        
    precision_recall_df = get_precision_recall_f1(l_rules, categories, acc_rules)
    
    return pd.concat([IS_detector_scores, precision_recall_df])

In [21]:
# def label_data(train, test, X_rules, domain):
#     categories = ['reg_const','rule_norm_strategy']
#     for c in categories:
#         y_rules = train[c].values.tolist() + test[c].values.tolist()
#         clf_rules.fit(X_rules, y_rules)
#         wow_unlabeled_IS[c] = clf_rules.predict(X_wow)
#         minecraft_unlabeled_IS[c] = clf_rules.predict(X_minecraft)
#         reddit_unlabeled_IS[c] = clf_rules.predict(X_reddit)
#         coded_unlabeled_IS[c] = clf_rules.predict(X_coded)
        
#     wow_labeled = wow.append(wow_unlabeled_IS)
#     minecraft_labeled = minecraft.append(minecraft_unlabeled_IS)
#     reddit_labeled = reddit.append(reddit_unlabeled_IS)
#     coded_labeled = coded.append(coded_unlabeled_IS)
        
#     wow_labeled.to_csv("wow_" + domain, index=False)
#     minecraft_labeled.to_csv("minecraft_" + domain, index=False)
#     reddit_labeled.to_csv("reddit_" + domain, index=False)
#     coded_labeled.to_csv("coded_" + domain, index=False)
    
#     return wow_labeled, minecraft_labeled, reddit_labeled, coded_labeled

In [24]:
reddit_train_df, reddit_test_df = domain_selection(training_df, testing_df)
reddit_X_rules = gen_X(reddit_train_df, reddit_test_df)
get_scores(reddit_train_df, reddit_test_df, reddit_X_rules, IS_scores) 

Unnamed: 0,Type,Accuracy,Precision,Recall,F1 Score
0,IS,0.909667,0.907874,0.909673,0.907982
0,reg_const,0.90303,0.893184,0.903032,0.894284
1,rule_norm_strategy,0.893009,0.879063,0.893009,0.880374


## My Data Preprocessing

In [25]:
my_data.drop(columns=['lineID', 'domain', 'ref', 'source'], inplace=True)
my_data['clean_text'] = corpusGen(my_data)
my_data.dropna(subset='text', inplace=True)

In [26]:
my_data

Unnamed: 0,change_type,before/after,timestamp,timestamp_rule,communityID,ruleID,text,clean_text
0,unchanged,unchanged,April 23,20200801,weeklydiscoveries,weeklydiscoveries_0,Making others feel unsafe or disrespected will...,make feel unsaf disrespect toler
1,unchanged,unchanged,April 23,20200801,weeklydiscoveries,weeklydiscoveries_1,There are many other music subreddits for song...,music subreddit song wai
2,unchanged,unchanged,April 23,20200801,weeklydiscoveries,weeklydiscoveries_2,Post format: Artist - Title [Genre] Additional...,post format artist titl genr addit inform option
3,unchanged,unchanged,April 23,20200801,weeklydiscoveries,weeklydiscoveries_3,"If a music video contains flashing lights, hea...",music video contain flash light heavi eyestrai...
4,unchanged,unchanged,April 23,20170501,elbowsafespace,elbowsafespace_0,No harassment of any users for any reason. Mod...,harass user reason moder discret meant joke us...
...,...,...,...,...,...,...,...,...
358021,unchanged,unchanged,April 23,20210101,improvemyfiverrgig,improvemyfiverrgig_0,This subreddit is for requesting and giving ad...,subreddit request give advic improv fiverr gig...
358022,unchanged,unchanged,April 23,20180901,sempiternafloreat,sempiternafloreat_0,All posts should at least be loosely related t...,post loos relat mg maidston school
358023,unchanged,unchanged,April 23,20180901,sempiternafloreat,sempiternafloreat_1,"NSFW posts should be tagged as such, don't wan...",nsfw post tag want scar yr life
358024,unchanged,unchanged,April 23,20180901,sempiternafloreat,sempiternafloreat_2,Try to keep comments and or posts somewhat civ...,try comment post somewhat civil staff call rac...


In [27]:
for i in range(20):
    print(my_data.clean_text[i])

make feel unsaf disrespect toler
music subreddit song wai
post format artist titl genr addit inform option
music video contain flash light heavi eyestrain potenti seizur induc visual flair
harass user reason moder discret meant joke us common sens
topic post remov moder discret elbow right reddit isn car similar post like knee allow


KeyError: 6

In [None]:
word_my_data = vec_word.transform(my_data.clean_text)
char_my_data = vec_char.transform(my_data.text)
X_my_data = sp.sparse.hstack([word_my_data, char_my_data]) 

## My data labelling

In [44]:
categories = ['reg_const','rule_norm_strategy']
for c in categories:
    y_rules = reddit_train_df[c].values.tolist() + reddit_test_df[c].values.tolist()
    clf_rules.fit(reddit_X_rules, y_rules)
    my_data[c + "_labels"] = clf_rules.predict(X_my_data)

In [46]:
my_data

Unnamed: 0,change_type,before/after,timestamp,timestamp_rule,communityID,ruleID,text,clean_text,reg_const_labels,rule_norm_strategy_labels
0,unchanged,unchanged,April 23,20200801,weeklydiscoveries,weeklydiscoveries_0,Making others feel unsafe or disrespected will...,make feel unsaf disrespect toler,regulatory,norm
1,unchanged,unchanged,April 23,20200801,weeklydiscoveries,weeklydiscoveries_1,There are many other music subreddits for song...,music subreddit song wai,regulatory,norm
2,unchanged,unchanged,April 23,20200801,weeklydiscoveries,weeklydiscoveries_2,Post format: Artist - Title [Genre] Additional...,post format artist titl genr addit inform option,regulatory,norm
3,unchanged,unchanged,April 23,20200801,weeklydiscoveries,weeklydiscoveries_3,"If a music video contains flashing lights, hea...",music video contain flash light heavi eyestrai...,regulatory,norm
4,unchanged,unchanged,April 23,20170501,elbowsafespace,elbowsafespace_0,No harassment of any users for any reason. Mod...,harass user reason moder discret meant joke us...,regulatory,norm
...,...,...,...,...,...,...,...,...,...,...
358021,unchanged,unchanged,April 23,20210101,improvemyfiverrgig,improvemyfiverrgig_0,This subreddit is for requesting and giving ad...,subreddit request give advic improv fiverr gig...,regulatory,norm
358022,unchanged,unchanged,April 23,20180901,sempiternafloreat,sempiternafloreat_0,All posts should at least be loosely related t...,post loos relat mg maidston school,regulatory,norm
358023,unchanged,unchanged,April 23,20180901,sempiternafloreat,sempiternafloreat_1,"NSFW posts should be tagged as such, don't wan...",nsfw post tag want scar yr life,regulatory,norm
358024,unchanged,unchanged,April 23,20180901,sempiternafloreat,sempiternafloreat_2,Try to keep comments and or posts somewhat civ...,try comment post somewhat civil staff call rac...,regulatory,norm


In [52]:
my_data_regulative = my_data[my_data.reg_const_labels == 'regulatory']
my_data_regulative.reset_index(inplace=True)
for i in range(10):
    print(my_data_regulative.text[i])

Making others feel unsafe or disrespected will not be tolerated.
There are many other music subreddits for songs you found other ways.
Post format: Artist - Title [Genre] Additional Information (Last one is optional).
If a music video contains flashing lights, heavy eyestrain, or other potentially seizure inducing visuals, it has to be flaired.
No harassment of any users for any reason. Moderator discretion of whether or not it was meant as a joke, but just use common sense please. 
Off Topic posts can be removed at moderator discretion, this is an elbow rights Reddit. It isn't about cars. Similar posts like knees will be allowed
Things that are not related to Kermitcraft in any way will be deleted from the subreddit.
If found self promoting or advertising, then your post will be deleted.
No memes here
Swearing in posts/comments can result in a mute or a ban from the subreddit if repeated constantly.


In [50]:
my_data_const = my_data[my_data.reg_const_labels == 'constitutive']
my_data_const.reset_index(inplace=True)
for i in range(10):
    print(my_data_const.text[i])

They aren't funny, so don't make them seem like they are. This rule also applies to bloated images, or memes that are only 'funny' because the liquify tool was used on them.
Some people may not have seen a video of mine. Any video uploaded less than a month ago counts as a spoiler. Also, if something is NSFW for whatever reason, make sure it applies to this sub AND is marked NSFW.
spamming is not allowed, it prevents users from enjoying the full experience of this glorious subreddit
There are some rules. 
There is only 1 rule: Hate Infinite Warfare
You are not allowed to post anything that isn't connected to either MIA or the Italian language and culture.
This is an SFW subreddit.
Having the freedom of posting fucked up shit is protected by the bro code of not telling anyone that this sub exists. people are literally giving fapdates. keep this bish a secret. This sub is for the brothers by the brothers. 
The creators of this class subreddit
There's no real point to meme posts here,  th

In [53]:
my_data.to_csv("rule_descriptions_classified.csv")