In [14]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
import ast
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from statistics import mode

In [2]:
annotated_texts_df = pd.read_csv("data/annotated_dataset/annotated_texts_repr.csv", sep=",", encoding="utf-8")

In [3]:
annotated_texts_df.head(1)

Unnamed: 0,id,text,pop_sum,manichean,peoplecentrism,antielitism,emotional,polarization,tfidf,doc_embedding,doc_embedding_pos,linguistic_profile
0,ParlaMint-IT_2013-08-01-LEG17-Senato-sed-86.u153,"PETROCELLI . Signor Presidente, senatrici e se...",4,1,1,1,1,1,"[0.5361957907801886, 0.049413195954373046, 0.0...","[0.009776607354980394, 0.04375904489842546, -0...","[0.0025272382080579183, 0.002842237250819832, ...","[47.0, 1831.0, 38.95744680851064, 4.6773997569..."


In [4]:
x_docembedding = annotated_texts_df["doc_embedding"]
x_docembedding = np.array([ast.literal_eval(item) for item in x_docembedding])

y_manichean = annotated_texts_df["manichean"]
y_peoplecentrism = annotated_texts_df["peoplecentrism"]
y_antielitism = annotated_texts_df["antielitism"]
y_emotional = annotated_texts_df["emotional"]

y_pop_sum = annotated_texts_df["pop_sum"]

In [16]:
clf_lgbm = LGBMClassifier(random_state=8, verbose=-1)

clf_rf = RandomForestClassifier(n_estimators=150, 
                             criterion='gini', 
                             max_depth=None, 
                             min_samples_split=2, 
                             min_samples_leaf=1, 
                             min_weight_fraction_leaf=0.0, 
                             max_features='sqrt', 
                             random_state=0, 
                             n_jobs=-1)

clf_ada_rf = AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=100), n_estimators=100, random_state=0)

clf_svc = SVC(gamma='auto', C=0.1, kernel='rbf', random_state=42)

clf_bag = BaggingClassifier(estimator=None, n_estimators=100, random_state=0)

In [19]:
model_predictions = []
predictions = []
models = [clf_lgbm,clf_rf,clf_ada_rf,clf_svc,clf_bag]
features = [y_manichean, y_peoplecentrism, y_antielitism, y_emotional]

for feature in features:
    for model in models:
        print(model)
        predictions.append(cross_val_predict(model, x_docembedding, feature, cv=5))
    model_predictions.append(predictions)
    predictions = []
        
model_predictions

LGBMClassifier(random_state=8, verbose=-1)
RandomForestClassifier(n_estimators=150, n_jobs=-1, random_state=0)
AdaBoostClassifier(estimator=RandomForestClassifier(), n_estimators=100,
                   random_state=0)
SVC(C=0.1, gamma='auto', random_state=42)
BaggingClassifier(n_estimators=100, random_state=0)
LGBMClassifier(random_state=8, verbose=-1)
RandomForestClassifier(n_estimators=150, n_jobs=-1, random_state=0)
AdaBoostClassifier(estimator=RandomForestClassifier(), n_estimators=100,
                   random_state=0)
SVC(C=0.1, gamma='auto', random_state=42)
BaggingClassifier(n_estimators=100, random_state=0)
LGBMClassifier(random_state=8, verbose=-1)
RandomForestClassifier(n_estimators=150, n_jobs=-1, random_state=0)
AdaBoostClassifier(estimator=RandomForestClassifier(), n_estimators=100,
                   random_state=0)
SVC(C=0.1, gamma='auto', random_state=42)
BaggingClassifier(n_estimators=100, random_state=0)
LGBMClassifier(random_state=8, verbose=-1)
RandomForestClassi

[[array([1, 0, 1, ..., 0, 0, 1], dtype=int64),
  array([1, 0, 1, ..., 0, 0, 1], dtype=int64),
  array([1, 0, 1, ..., 0, 0, 1], dtype=int64),
  array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([1, 0, 1, ..., 0, 0, 1], dtype=int64)],
 [array([1, 0, 1, ..., 1, 0, 1], dtype=int64),
  array([1, 0, 1, ..., 0, 0, 1], dtype=int64),
  array([1, 0, 1, ..., 0, 0, 1], dtype=int64),
  array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([1, 0, 1, ..., 0, 0, 1], dtype=int64)],
 [array([1, 0, 1, ..., 0, 0, 1], dtype=int64),
  array([1, 0, 1, ..., 0, 0, 1], dtype=int64),
  array([1, 0, 1, ..., 0, 0, 1], dtype=int64),
  array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([1, 0, 1, ..., 0, 0, 1], dtype=int64)],
 [array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
  array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
  array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
  array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
  array([1, 1, 1, ..., 1, 1, 1], dtype=int64)]]

In [20]:
mode_list = []
for lista in model_predictions:
    mode_l = [mode(values) for values in zip(*lista)]
    mode_list.append(mode_l)

pop_sum_predictions = [sum(values) for values in zip(*mode_list)]

print(classification_report(y_pop_sum, pop_sum_predictions))

              precision    recall  f1-score   support

           0       0.72      0.69      0.71      3995
           1       0.36      0.57      0.44      2371
           2       0.18      0.15      0.16      1213
           3       0.19      0.12      0.14       991
           4       0.71      0.50      0.59      2270

    accuracy                           0.51     10840
   macro avg       0.43      0.41      0.41     10840
weighted avg       0.53      0.51      0.51     10840



In [29]:
df = pd.DataFrame(model_predictions, columns=["lgbm", "rf", "ada_rf", "svc", "bag"])
df.to_csv("data/classifications/five_models_single_pol.csv", index=False)

In [59]:
len(model_predictions[0])

5

In [61]:
import csv

with open('data/classifications/manichean_preds.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(model_predictions[0])
    
with open('data/classifications/peoplecentrism_preds.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(model_predictions[1])
    
with open('data/classifications/antielitism_preds.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(model_predictions[2])
    
with open('data/classifications/emotional_preds.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(model_predictions[3])

In [12]:
features = [y_manichean, y_peoplecentrism, y_antielitism, y_emotional]
predictions = []
for feature in features:
    print(cross_val_score(LGBMClassifier(random_state=8, verbose=-1), x_docembedding, feature, cv=5).mean())
    predict = cross_val_predict(LGBMClassifier(random_state=8, verbose=-1), x_docembedding, feature, cv=5)
    print(classification_report(predict, feature))
    

0.8170664206642065
              precision    recall  f1-score   support

           0       0.90      0.84      0.87      8040
           1       0.62      0.74      0.68      2800

    accuracy                           0.82     10840
   macro avg       0.76      0.79      0.77     10840
weighted avg       0.83      0.82      0.82     10840

0.8039667896678967
              precision    recall  f1-score   support

           0       0.91      0.83      0.87      8371
           1       0.55      0.71      0.62      2469

    accuracy                           0.80     10840
   macro avg       0.73      0.77      0.75     10840
weighted avg       0.83      0.80      0.81     10840

0.8063653136531366
              precision    recall  f1-score   support

           0       0.88      0.83      0.85      7310
           1       0.68      0.75      0.72      3530

    accuracy                           0.81     10840
   macro avg       0.78      0.79      0.79     10840
weighted avg     