# Affective Polarisation Classifier

## Load Required Packages

In [3]:
# import own functions written in moralisation classifier notebook (NB II) saved to .py
from finalproject_functions import reddit_preprocessing, remove_bad_rows, MyTokenizer

# import other required packages:
import pandas as pd
import regex
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download("stopwords")
nltk.download("punkt")
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    GridSearchCV,
)
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate
import emoji
import warnings
import gensim
import gensim.downloader as api
import embeddingvectorizer
from embeddingvectorizer import EmbeddingCountVectorizer, EmbeddingTfidfVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load Datasets Containing Reddit Comments

In [4]:
# load labelled & unlabelled comments:
labelled_comments = pd.read_excel("labs_labelled_comments.xlsx")
unlabelled_comments = pd.read_csv("unlabelled_comments.csv", delimiter = ",")

## Data Preprocessing

In [5]:
# store regular expressions in list to remove desired characters
regex_list = [
      r"&[^;]+;", #remove html character escapes (&amp etc.)
      r"</?\w[^>]*>", #remove html tags 
      r"https?://[\w\.]+\b|www\.[\w\.]+\b", # remove links to websites
      r"\s(www.\S+)" # remove links to websites
      r"\*"]

In [6]:
# store original length in object n
n = len(labelled_comments)

### Apply pre-written preprocessing functions

In [7]:
# remove unwanted regular expressions stored in regex_list
labelled_comments["comment_pr"] = labelled_comments["comment"].apply(lambda x: reddit_preprocessing(x, regex_list))

In [8]:
# remove duplicates and NAs
labelled_comments = remove_bad_rows(labelled_comments, "comment")

In [9]:
print(f"{n-len(labelled_comments)} comments were removed after data cleaning")

2 comments were removed after data cleaning


## Remove Social Identity-Related Terms from Stopword List:
As social identity mechanisms are thought to underly affective polarisation (e.g., Iyengar et al., 2012), in coding the comments, I paid attention to words indicative of social identity dynamics, namely "us", "we", "them", "they". Therefore, these terms are removed from the stopword list (see van Atteveld et al., 2022).

In [11]:
# store SI-related words in word_list
word_list = ["us", "we", "they", "them"]

# create filtered stopwords list:
stopwords_filtered = [word for word in word_list if word not in stopwords.words('english')] 


#Aproach: see https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python)

## Split the Data

In [12]:
# Train Test Split using the preprocessed comments column and the overall morality label. 
# X_test_f and y_test_f are set aside to test the final model.
X_train, X_test_f, y_train, y_test_f = train_test_split(
    labelled_comments["comment_pr"],
    labelled_comments["AP_label"],
    test_size=0.2,
    random_state=99)

# Split the training data again, this time with test size = .25 to achieve a final split of 
# 60 training data; 20 validation data (this is where baseline is tested on); 20 final testing data (best model testing)
X_train_sec, X_val, y_train_sec, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=99)

## Classifier configurations:

#### Inspect balance of labelled data to determine best adjustment of hyperparameters:

In [13]:
labelled_comments.groupby(["AP_label"]).count() 

Unnamed: 0_level_0,Unnamed: 0,post_id,comment_id,AP_inciv,AP_group,comment,comment_pr
AP_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,784,784,784,784,784,784,784
1,213,213,213,213,213,213,213


As the classes are unbalanced (under-representations of comments labelled as containing indicators of affective polarisation) the classifiers are adjusted (setting parameter class_weight to "balanced" by default)

#### Define Tokenizer:

In [14]:
# assign the previously saved class "MyTokenizer()" to the object "mytokenizer" to use in the configurations
mytokenizer=MyTokenizer()

#### Store Configurations

In [15]:
# Store configurations in "configurations" object
configurations_AP=[('NB_CountV', CountVectorizer(tokenizer = mytokenizer.tokenize,
                                              stop_words = stopwords_filtered), ComplementNB()),
('NB_Tfidf', TfidfVectorizer(tokenizer = mytokenizer.tokenize,
                             stop_words = stopwords_filtered), ComplementNB()), 
('LR_CountV', CountVectorizer(tokenizer = mytokenizer.tokenize,
                              stop_words = stopwords_filtered), LogisticRegression(solver='liblinear',
                                                                                           class_weight = 'balanced')),
('LR_Tfidf', TfidfVectorizer(tokenizer = mytokenizer.tokenize,
                             stop_words = stopwords_filtered), LogisticRegression(solver='liblinear', 
                                                                                          class_weight = 'balanced')),
('SVM_CountV', CountVectorizer(tokenizer = mytokenizer.tokenize,
                                              stop_words = stopwords_filtered), SVC(gamma = 'scale', 
                                                                                            class_weight = 'balanced')),
('SVM_Tfidf', TfidfVectorizer(tokenizer = mytokenizer.tokenize,
                             stop_words = stopwords_filtered),SVC(gamma = 'scale', 
                                                                          class_weight = 'balanced')), 
('RF_CountV', CountVectorizer(tokenizer = mytokenizer.tokenize,
                                              stop_words = stopwords_filtered), RandomForestClassifier(class_weight = 'balanced')), 
('RF_Tfidf', TfidfVectorizer(tokenizer = mytokenizer.tokenize,
                             stop_words = stopwords_filtered) , RandomForestClassifier(class_weight = 'balanced'))
]

#### Classification& Report 

In [16]:
# Classification function 
def classification(x):
  for name, vectorizer, classifier in x:
      trans_X_train_sec = vectorizer.fit_transform(X_train_sec)
      trans_X_val = vectorizer.transform(X_val)
      classifier.fit(trans_X_train_sec, y_train_sec)
      pred_y_sm = classifier.predict(trans_X_val)
      print(f"Classification Report for {name}:\n")
      print(classification_report(y_val, pred_y_sm))
      print("\n")

In [17]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
classification(configurations_AP)



Classification Report for NB_CountV:

              precision    recall  f1-score   support

           0       0.78      0.90      0.84       152
           1       0.38      0.19      0.25        48

    accuracy                           0.73       200
   macro avg       0.58      0.54      0.54       200
weighted avg       0.68      0.73      0.69       200







Classification Report for NB_Tfidf:

              precision    recall  f1-score   support

           0       0.77      0.98      0.86       152
           1       0.50      0.06      0.11        48

    accuracy                           0.76       200
   macro avg       0.63      0.52      0.49       200
weighted avg       0.70      0.76      0.68       200







Classification Report for LR_CountV:

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       152
           1       0.48      0.42      0.44        48

    accuracy                           0.75       200
   macro avg       0.65      0.64      0.64       200
weighted avg       0.74      0.75      0.74       200







Classification Report for LR_Tfidf:

              precision    recall  f1-score   support

           0       0.83      0.84      0.83       152
           1       0.47      0.46      0.46        48

    accuracy                           0.74       200
   macro avg       0.65      0.65      0.65       200
weighted avg       0.74      0.74      0.74       200







Classification Report for SVM_CountV:

              precision    recall  f1-score   support

           0       0.81      0.93      0.86       152
           1       0.56      0.29      0.38        48

    accuracy                           0.78       200
   macro avg       0.68      0.61      0.62       200
weighted avg       0.75      0.78      0.75       200







Classification Report for SVM_Tfidf:

              precision    recall  f1-score   support

           0       0.77      0.99      0.87       152
           1       0.67      0.08      0.15        48

    accuracy                           0.77       200
   macro avg       0.72      0.54      0.51       200
weighted avg       0.75      0.77      0.69       200







Classification Report for RF_CountV:

              precision    recall  f1-score   support

           0       0.76      1.00      0.86       152
           1       0.00      0.00      0.00        48

    accuracy                           0.76       200
   macro avg       0.38      0.50      0.43       200
weighted avg       0.58      0.76      0.66       200





  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report for RF_Tfidf:

              precision    recall  f1-score   support

           0       0.76      1.00      0.86       152
           1       0.00      0.00      0.00        48

    accuracy                           0.76       200
   macro avg       0.38      0.50      0.43       200
weighted avg       0.58      0.76      0.66       200





  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Gridsearch on Best Performing Models: 

The following models were selected to perform a gridsearch, as their f1-score was the highest (and closest to 0.5). Because both precision and recall are deemed central for the classification of affective polarisation indicators in online language, LR_CountV seems to be the better model overall, however, given different hyperparameters this may change.
- LR_Count
- LR_Tfidf

#### Gridsearch for LR_Count:

In [18]:
('LR_CountV', CountVectorizer(tokenizer = mytokenizer.tokenize,
                              stop_words = stopwords_filtered), LogisticRegression(solver='liblinear',
                                                                                           class_weight = 'balanced')),


# Define pipeline with the model, defining the vectorizer and classifier 
LR_CountV_pipe = Pipeline(
    steps=[("vectorizer", CountVectorizer(tokenizer = mytokenizer.tokenize,
                              stop_words = stopwords_filtered)),
        ("classifier", LogisticRegression(solver='liblinear', class_weight = 'balanced')),
            ]
           )
# Specify ranges of values for hyperparameters to test for the model supplied to the pipeline
LR_CountV_grid = {
    "vectorizer__ngram_range": [(1, 1), (1, 2)],
    "vectorizer__max_df": [0.5, 0.75, 1.0],
    "vectorizer__min_df": [0, 5, 10],
    "classifier__C":[0.01, 0.1, 1, 10, 100], 
    "classifier__penalty":["l1", "l2"]
}

# perform gridsearch
search_LR_CountV = GridSearchCV(
    estimator = LR_CountV_pipe, n_jobs=-1, param_grid=LR_CountV_grid, scoring="f1", cv=10)
search_LR_CountV.fit(X_train, y_train)



In [19]:
print(f"Best parameters: {search_LR_CountV.best_params_}")
print(f"Best score: {round(search_LR_CountV.best_score_,3)}")

Best parameters: {'classifier__C': 0.01, 'classifier__penalty': 'l2', 'vectorizer__max_df': 0.5, 'vectorizer__min_df': 0, 'vectorizer__ngram_range': (1, 1)}
Best score: 0.443


In [20]:
LR_CountV_model = search_LR_CountV.best_estimator_

#### Gridsearch for LR_Tfidf

In [21]:
LR_Tfidf_GS_pipe = Pipeline(
    steps=[("vectorizer", TfidfVectorizer(tokenizer = mytokenizer.tokenize,
                                          stop_words = stopwords_filtered)),
        ("classifier", LogisticRegression(solver='liblinear',
                                          class_weight = 'balanced')),
            ]
           )
LR_Tfidf_grid = {
    "vectorizer__ngram_range": [(1, 1), (1, 2)],
    "vectorizer__max_df": [0.5, 0.75, 1.0],
    "vectorizer__min_df": [0, 5, 10],
    "classifier__C":[0.01, 0.1, 1, 10, 100], 
    "classifier__penalty":["l1", "l2"]
}
search_LR_Tfidf = GridSearchCV(
    estimator = LR_Tfidf_GS_pipe, n_jobs=-1, param_grid=LR_Tfidf_grid, scoring="f1", cv=10)
search_LR_Tfidf.fit(X_train, y_train)



In [22]:
print(f"Best parameters: {search_LR_Tfidf.best_params_}")
print(f"Best score: {round(search_LR_Tfidf.best_score_,3)}")

Best parameters: {'classifier__C': 0.01, 'classifier__penalty': 'l2', 'vectorizer__max_df': 0.5, 'vectorizer__min_df': 0, 'vectorizer__ngram_range': (1, 1)}
Best score: 0.457


In [23]:
# Save best model with best hyperparameter settings
LR_Tfidf_model = search_LR_Tfidf.best_estimator_

Based on the rounding, it seems that the LR_TFidf is the better classifier, but both perform approximately the same. Therefore, Word Embeddings will be added to both, to see if this makes a difference 

## Adding Word Embeddings to Best Performing Models

In [24]:
# Download pre-trained word embeddings on Wikipedia Corpus
wv = api.load('glove-wiki-gigaword-300')
wv_model = dict(zip(wv.index_to_key, wv.vectors))



Adding best parameters into the models

In [25]:
# Store the pre-trained word embeddings and Count or Tfidf vectorizers in the configurations list to get the 
# classification report in the next step
configurations_AP_emb=[ 
('LR_CountV', embeddingvectorizer.EmbeddingCountVectorizer(wv_model, operator='mean'),
 LogisticRegression(solver='liblinear',
                    class_weight = 'balanced')),
('LR_Tfidf', embeddingvectorizer.EmbeddingTfidfVectorizer(wv_model, operator='mean'),
 LogisticRegression(solver='liblinear', class_weight = 'balanced')),
('SVM_CountV', embeddingvectorizer.EmbeddingCountVectorizer(wv_model, operator='mean'),
 SVC(gamma = 'scale', class_weight = 'balanced')),
('SVM_Tfidf', embeddingvectorizer.EmbeddingTfidfVectorizer(wv_model, operator='mean'),
 SVC(gamma = 'scale', class_weight = 'balanced')), 
('RF_CountV', embeddingvectorizer.EmbeddingCountVectorizer(wv_model, operator='mean'),
 RandomForestClassifier(class_weight = 'balanced')), 
('RF_Tfidf', embeddingvectorizer.EmbeddingTfidfVectorizer(wv_model, operator='mean'),
 RandomForestClassifier(class_weight = 'balanced'))
]

In [26]:
classification (configurations_AP_emb)

Classification Report for LR_CountV:

              precision    recall  f1-score   support

           0       0.84      0.72      0.78       152
           1       0.39      0.58      0.47        48

    accuracy                           0.69       200
   macro avg       0.62      0.65      0.62       200
weighted avg       0.74      0.69      0.70       200



Classification Report for LR_Tfidf:

              precision    recall  f1-score   support

           0       0.83      0.75      0.79       152
           1       0.40      0.52      0.45        48

    accuracy                           0.69       200
   macro avg       0.61      0.64      0.62       200
weighted avg       0.73      0.69      0.71       200



Classification Report for SVM_CountV:

              precision    recall  f1-score   support

           0       0.86      0.72      0.78       152
           1       0.41      0.62      0.50        48

    accuracy                           0.69       200
   macro a

Using the word embeddings, the best models identified are SVM_Tfidf, achieving f1=0.52. With BOW and gridsearch, the best model is LR_Tfidf with f1=0.46. Therefore, the final baseline model selected is SVM using word embeddings and a count vectorizer. 


## Fitting the embedding model

In [37]:
# Store the best model as identified in the previous step in the pipe
em_pipe = Pipeline([('SVM_Tfidf', embeddingvectorizer.EmbeddingTfidfVectorizer(wv_model, operator='mean')),
                    ('svm_svc', 
                     SVC(gamma = 'scale', class_weight = 'balanced'))
])

# fit the model to the training data
em_pipe.fit(X_train_sec, y_train_sec)

# predict the labels of the final set aside testing data
y_pred = em_pipe.predict(X_test_f)

# get model performance metrics
print(metrics.classification_report(y_test_f, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.76      0.82       164
           1       0.35      0.58      0.44        36

    accuracy                           0.73       200
   macro avg       0.62      0.67      0.63       200
weighted avg       0.80      0.73      0.75       200



## Clean Unlabelled Data

This step was added after comparing all models (including BERT). Before applying the best model to label the comments, this data needs to be preprocessed:

In [39]:
# apply preprocessing functions removing regular expressions stored in the regex list
unlabelled_comments["comment"] = unlabelled_comments["comment"].apply(lambda x: reddit_preprocessing(x, regex_list))

In [40]:
# apply function to remove duplicate comments and missing values.
unlabelled_comments = remove_bad_rows(unlabelled_comments, "comment")
print(f"{n_u-len(unlabelled_comments)} titles were removed")

0 titles were removed


## Final Model Prediction:

In [41]:
# use model stored as "em_pipe" to predict the labels in the unlabelled comments dataset
unlabelled_comments["AP_labels"] = em_pipe.predict(unlabelled_comments["comment"])
unlabelled_comments.head()

Unnamed: 0.1,Unnamed: 0,post_id,comment_id,comment,AP_labels
0,0,flgxp,c1gtf8n,I don't advocate the death penalty. But this ...,0
1,1,flgxp,c1gtjcm,Did anybody else see [this video](/david-neiwe...,0
2,2,flgxp,c1gtlbj,I've been following this case for a while. I'm...,0
3,4,flgxp,c1gtnqs,Good.,0
4,5,flgxp,c1gtnuc,Fry that bitch!,0


In [42]:
# Save model
import pickle
with open("Polarisation_Model.pkl", mode="wb") as f:
    pickle.dump(em_pipe, f) 

In [43]:
# Save unlabelled posts to .csv
unlabelled_comments.to_csv("model_labelled_comments.csv")