**Import statements**

In [27]:
import pandas as pd
import datetime
import numpy as np
import nltk
import re
import os
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

### Loading the dataset

The dataset was obtained from a script which uses two Reddit APIs to obtain posts and then pull information about each of them.

In [2]:
df = pd.read_csv(os.path.join("../reddit_scraper/reddit_posts.csv"))

### Cleaning up rows

Remove from dataset all rows without a body, and rows with tags outside the verdicts (mod posts). Also, "Not enough info" posts will be removed as well, as they're considered invalid from the submission point of view.

In [3]:
nan = np.nan
clean_df = df.query("body not in ['[deleted]', '[removed]'] & verdict not in ['TL;DR', 'UPDATE', @nan, 'Talk ENDED', 'Open Forum', 'Mods Needed!', 'META', 'Not enough info']")

In [4]:
is_edited = []

for i in clean_df.edited.values:
    if i == 'False':
        column_value = False
    else:
        column_value = True
    is_edited.append(column_value)

In [5]:
clean_df.insert(8, 'is_edited', is_edited)

In [6]:
clean_df = clean_df.drop(columns=["edited"])

In [7]:
clean_df.verdict.value_counts()

Not the A-hole     23014
Asshole             4397
No A-holes here     1753
Everyone Sucks      1242
Name: verdict, dtype: int64

In [8]:
X = clean_df.body.values

In [9]:
y = clean_df.verdict.values

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
(y_train == "Not the A-hole").sum()

18406

In [12]:
(y_train == "Asshole").sum()

3485

In [13]:
(y_train == "Everyone Sucks").sum()

1006

In [14]:
(y_train == "No A-holes here").sum()

1427

**Tokenize**

In [None]:
def tokenize(text):
    lemmatizer = WordNetLemmatizer()
    
    urls = "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    detected_urls = re.findall(urls, text)
    
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")
        
    words = word_tokenize(text)
    stop_words = stopwords.words("english")
    words = [x for x in words if x not in stop_words]
    
    clean_tokens = []
    for word in words:
        clean_token = lemmatizer.lemmatize(word).strip().lower()
        clean_tokens.append(clean_token)
        
    return clean_tokens

**Model**

In [30]:
pipeline = Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer(use_idf = True)),
                ('clf', DecisionTreeClassifier())
    ])

parameters = {
        "clf__criterion": ["gini", "entropy"],
        "clf__min_samples_split": [2, 10, 20]
    }

model = GridSearchCV(pipeline, parameters, verbose = 10)

In [31]:
model.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5; 1/6] START clf__criterion=gini, clf__min_samples_split=2...............
[CV 1/5; 1/6] END clf__criterion=gini, clf__min_samples_split=2; total time= 2.8min
[CV 2/5; 1/6] START clf__criterion=gini, clf__min_samples_split=2...............


KeyboardInterrupt: 

In [32]:
y_pred = model.predict(X_test)

NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
model.best_params_

In [86]:
print(classification_report(y_test, y_pred))

                 precision    recall  f1-score   support

        Asshole       0.18      0.21      0.19       898
 Everyone Sucks       0.05      0.06      0.05       243
No A-holes here       0.07      0.08      0.08       352
 Not the A-hole       0.76      0.72      0.74      4589

       accuracy                           0.58      6082
      macro avg       0.26      0.27      0.26      6082
   weighted avg       0.61      0.58      0.59      6082



In [102]:
print('The accuracy for mode model is:', accuracy_score(y_test, y_pred))
print('The f1 score for the model model is:', f1_score(y_test, y_pred, average='weighted'))

The accuracy for mode model is: 0.32387706855791965
The f1 score for the model model is: 0.3227273293109279


In [103]:
test="I (29F) work in a tech company that delivers supermarket goods. I want to become Minister of Health in the future, so I'm working on a Data Science Nanodegree to learn more stuff, as well as working on my current job to get more experience. I have a project to deliver before April ends, and I've been struggling to finish it as I didn't pay much attention in my machine learning classes, so I asked a friend for help interpreting my results, and he absolutely refused to help me. I know I asked him at almost 11PM but that's what friends do, right? So Reddit, AITA?"

In [104]:
model.predict([test])

array(['Everyone Sucks'], dtype=object)

In [105]:
#asshole test
test1= "MIL and SFIL's first language is Arabic and they both prefer it to English, though their english is very good. During the entire time my parents and I have known them, they have always had side conversations in Arabic and none of us were ever bothered by it. My parents raised me to be inclusive and accepting. It is to the point that MIL doesn't even talk to my kids much because she doesn't like English, but as the white person who married into a POC family, I really try to be understanding and stay in my lane.Recently my parents attended a soccer game for my brother. He plays on a highly competitive travel team, as does SFIL's son. That day he was set to play against SFIL's son's team. Now my family is crazy about sports and SFIL used to be a professional soccer (football for them) player in his home country. The game was intense and SFIL was coaching his son from the sidelines in Arabic. My father was upset and felt that it was an unfair advantage, especially when SFIL used to be a professional, and asked him to speak English. When SFIL refused, my dad went to the ref who said SFIL was fine.After the game MIL complained to multiple people that my parents were being racist. My mom was hurt by this and I confronted MIL. MIL said she doesn't care about the context, her husband can speak Arabic whenever he wants to. I pointed out that it was about giving his son an unfair advantage over the other boys and MIL said that doesn't matter and in a condescending tone said whenever he wants to. SFIL pointed out that they used to mispronounce MIL's name, which to be fair she never bothered to correct them. When he corrected them they did stop, but occasionally slip up.I said that I believe with all my heart that my parents aren't racist and I will stand by them. I said I respect Arabic 100%, but he was giving his son an advantage and his son is known to play a bit dirty, so there is that. My husband is on my side, but I feel they are pretty annoyed with me."

In [106]:
model.predict([test1])

array(['No A-holes here'], dtype=object)

In [107]:
#nta
test2 = "I know there’s one of these posted fairly often, but this is a double whammy.I was on a 10 hour flight from US to Europe. I am 6’5” and have damaged knees from soccer, so leg room and the ability to get up and walk frequently is a must.I booked an aisle seat in economy plus for the above reasons. Normally I try to get bulkhead seats or business class if reasonable but this was a last minute flight for a death in the family.The other two passengers in our row of 3 were a woman and her toddler. The toddler was screaming bloody murder in the waiting area and continued to once seated in the middle seat, leaning away from mom and against me while doing so. I have very sensitive hearing due to ruptured eardrums, so I put in noise canceling earphones but those can only do so much when the source of the sound is less than a foot away. A flight attendant took notice and asked me for my drink order prior to takeoff, which I guess shows how visibly uncomfortable I was.Even in economy plus there’s the chance of the person in front of you reclining their seat, which of course happened after we reaching cruising altitude and my knees were starting to burn.I went to walk and when I came back the woman asked me if I would switch to the window seat. I said no and explained I’d be getting up repeatedly throughout the flight and didn’t want to have to climb over them. The flight was overnight so I’d be waking them up as well.The same flight attendant caught me on my next walk, said there was a space in business class and asked if I’d like to move up. We were close enough to my seat that the woman heard and asked if they could be upgraded instead. The FA just shrugged but I said I’d take it. It was an aisle seat and the middle was empty, so the two of them could have taken it too.I couldn’t refuse the chance to get a bit of sleep. I had a ~3 hour train ride and then tram after landing, I’d need to stay awake after 2 hours on the train and then immediately go to a funeral because it was late morning when I arrived at my mom’s house.While in line for customs the woman was a few people behind me, she called me a prick and kept raising her voice to make comments about me to her son like “that mean lady wouldn’t let you have the nice seats”. I don’t know why it was offered to me first, but AITA?Edit: if you think I’ve posted this for validation or whatever else, I invite you to get yourself yelled at and called names in a customs hall in a foreign country and see how well you feel afterwards."

In [108]:
model.predict([test2])

array(['Asshole'], dtype=object)

In [109]:
#esh
test3="My husband(29M) and I(29M) are in the process of getting a divorce. I’ve been seeing a friend(29M) of mine romantically for a little while and last night he stayed the night at my house which use to be my husband and I’s house.I have 3 daughters, 4, 5 and 6 who all know better than to come in my room before I’ve woken up. My husband and I have made that clear to them since they could walk and get out of bed by themselves. I sleep naked and I obviously don’t want them walking in. If they really need something or there is an emergency, they still know they have to knock before coming in.I usually lock my bedroom door and last night I forgot to. My ex-husband comes into my house because he still had a key and he was coming by to pick up the girls and take them to his house for the week. He came much earlier than expected because he wanted all of us to have breakfast together which would be fine on any morning except today.When my daughters wake up they go down stairs and watch TV on the couch until I wake up and make them breakfast. When my ex walks into the house he asks them where I am and they tell him I’m still sleeping. He walks into my bedroom and sees me naked with the other guy. He goes complete ballistic that I should lock the door and that I have zero respect for my kids by having someone stay the night. Btw, this guy has been my friend since I was a kid and my daughters love him like family and they know I’m seeing him.He tells me the girls could have walked in and saw us. Not once has my daughters ever walked in without knocking except for when they were first learning to knock first.Also, he’s literally arguing with me while I’m naked in bed, he doesn’t even have the decency to let me get dressed first before having this conversation.So aita?"

In [110]:
model.predict([test3])

array(['No A-holes here'], dtype=object)

In [111]:
test4="I am crying, AITA?"

In [112]:
model.predict([test4])

array(['No A-holes here'], dtype=object)