In [18]:
import pandas as pd
from lightgbm import LGBMClassifier
import numpy as np
import ast
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier

In [19]:
annotated_texts_df = pd.read_csv("data/annotated_dataset/annotated_texts_repr.csv", sep=",", encoding="utf-8")

x_tfidf = annotated_texts_df["tfidf"]
x_tfidf = np.array([ast.literal_eval(item) for item in x_tfidf])

x_docembedding = annotated_texts_df["doc_embedding"]
x_docembedding = np.array([ast.literal_eval(item) for item in x_docembedding])

x_docembedding_pos = annotated_texts_df["doc_embedding_pos"]
x_docembedding_pos = np.array([ast.literal_eval(item) for item in x_docembedding_pos])

x_linguistic_profile = annotated_texts_df["linguistic_profile"]
x_linguistic_profile = np.array([ast.literal_eval(item) for item in x_linguistic_profile])

y_manichean = annotated_texts_df["manichean"]
y_peoplecentrism = annotated_texts_df["peoplecentrism"]
y_antielitism = annotated_texts_df["antielitism"]
y_emotional = annotated_texts_df["emotional"]

y_pop_sum = annotated_texts_df["pop_sum"]

In [20]:
clf_lgbm = LGBMClassifier(random_state=8, verbose=-1)

### Multi feature classification

In [21]:
features = [y_manichean, y_peoplecentrism, y_antielitism, y_emotional]
predictions = []

for feature in features:
    predictions.append(cross_val_predict(clf_lgbm, x_docembedding, feature, cv=5))

predictions

[array([1, 0, 1, ..., 0, 0, 1], dtype=int64),
 array([1, 0, 1, ..., 1, 0, 1], dtype=int64),
 array([1, 0, 1, ..., 0, 0, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64)]

In [22]:
pop_sum_predictions = [sum(values) for values in zip(*predictions)]

In [23]:
pop_sum_coarse = [0 if x < 2 else 1 for x in pop_sum_predictions]

In [24]:
y_pop_sum_coarse = [0 if x < 2 else 1 for x in y_pop_sum]

In [25]:
limit = 2
pop_sum_coarse = [0 if x < limit else 1 for x in pop_sum_predictions]
y_pop_sum_coarse = [0 if x < limit else 1 for x in y_pop_sum]
print(classification_report(pop_sum_coarse, y_pop_sum_coarse))

              precision    recall  f1-score   support

           0       0.86      0.80      0.83      6882
           1       0.69      0.78      0.73      3958

    accuracy                           0.79     10840
   macro avg       0.78      0.79      0.78     10840
weighted avg       0.80      0.79      0.79     10840



### Single feature classification

In [26]:
limit = 2
y_pop_sum_coarse = [0 if x < limit else 1 for x in y_pop_sum]

In [27]:
predictions = cross_val_predict(clf_lgbm, x_docembedding, y_pop_sum_coarse, cv=5)
print(classification_report(y_pop_sum_coarse, predictions))

              precision    recall  f1-score   support

           0       0.81      0.84      0.82      6366
           1       0.76      0.71      0.73      4474

    accuracy                           0.79     10840
   macro avg       0.78      0.78      0.78     10840
weighted avg       0.79      0.79      0.79     10840



## Ricerca della migliore combinazione modello - rappresentazione (tfidf - linearsvc)

### lightgbm

In [28]:
reprs = [x_tfidf, x_docembedding, x_docembedding_pos, x_linguistic_profile]
for rep in reprs:
    predictions = cross_val_predict(clf_lgbm, rep, y_pop_sum_coarse, cv=5)
    print(classification_report(y_pop_sum_coarse, predictions))

              precision    recall  f1-score   support

           0       0.81      0.86      0.83      6366
           1       0.78      0.71      0.74      4474

    accuracy                           0.80     10840
   macro avg       0.79      0.78      0.79     10840
weighted avg       0.80      0.80      0.80     10840

              precision    recall  f1-score   support

           0       0.81      0.84      0.82      6366
           1       0.76      0.71      0.73      4474

    accuracy                           0.79     10840
   macro avg       0.78      0.78      0.78     10840
weighted avg       0.79      0.79      0.79     10840

              precision    recall  f1-score   support

           0       0.80      0.84      0.82      6366
           1       0.76      0.70      0.73      4474

    accuracy                           0.78     10840
   macro avg       0.78      0.77      0.77     10840
weighted avg       0.78      0.78      0.78     10840

              preci

### rf

In [29]:
clf_rf = RandomForestClassifier(n_estimators=150, 
                             criterion='gini', 
                             max_depth=None, 
                             min_samples_split=2, 
                             min_samples_leaf=1, 
                             min_weight_fraction_leaf=0.0, 
                             max_features='sqrt', 
                             random_state=0, 
                             n_jobs=-1)

In [30]:
reprs = [x_tfidf, x_docembedding, x_docembedding_pos, x_linguistic_profile]
for rep in reprs:
    predictions = cross_val_predict(clf_rf, rep, y_pop_sum_coarse, cv=5)
    print(classification_report(y_pop_sum_coarse, predictions))

              precision    recall  f1-score   support

           0       0.72      0.93      0.81      6366
           1       0.84      0.49      0.62      4474

    accuracy                           0.75     10840
   macro avg       0.78      0.71      0.71     10840
weighted avg       0.77      0.75      0.73     10840

              precision    recall  f1-score   support

           0       0.79      0.86      0.82      6366
           1       0.77      0.67      0.71      4474

    accuracy                           0.78     10840
   macro avg       0.78      0.76      0.77     10840
weighted avg       0.78      0.78      0.78     10840

              precision    recall  f1-score   support

           0       0.79      0.86      0.82      6366
           1       0.77      0.67      0.71      4474

    accuracy                           0.78     10840
   macro avg       0.78      0.76      0.77     10840
weighted avg       0.78      0.78      0.78     10840

              preci

### svc

In [33]:
clf_linear_svc = LinearSVC(C=1.0, random_state=42)

In [34]:
reprs = [x_tfidf, x_docembedding, x_docembedding_pos, x_linguistic_profile]
for rep in reprs:
    predictions = cross_val_predict(clf_linear_svc, rep, y_pop_sum_coarse, cv=5)
    print(classification_report(y_pop_sum_coarse, predictions))



              precision    recall  f1-score   support

           0       0.84      0.86      0.85      6366
           1       0.80      0.76      0.78      4474

    accuracy                           0.82     10840
   macro avg       0.82      0.81      0.81     10840
weighted avg       0.82      0.82      0.82     10840





              precision    recall  f1-score   support

           0       0.79      0.85      0.82      6366
           1       0.76      0.68      0.72      4474

    accuracy                           0.78     10840
   macro avg       0.77      0.76      0.77     10840
weighted avg       0.78      0.78      0.78     10840





              precision    recall  f1-score   support

           0       0.80      0.85      0.83      6366
           1       0.77      0.70      0.73      4474

    accuracy                           0.79     10840
   macro avg       0.78      0.78      0.78     10840
weighted avg       0.79      0.79      0.79     10840





              precision    recall  f1-score   support

           0       0.67      0.93      0.78      6366
           1       0.78      0.33      0.47      4474

    accuracy                           0.69     10840
   macro avg       0.72      0.63      0.62     10840
weighted avg       0.71      0.69      0.65     10840





### adaboost

In [36]:
clf_ada_rf = AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=100), n_estimators=100, random_state=0)

In [37]:
reprs = [x_tfidf, x_docembedding, x_docembedding_pos, x_linguistic_profile]
for rep in reprs:
    predictions = cross_val_predict(clf_ada_rf, rep, y_pop_sum_coarse, cv=5)
    print(classification_report(y_pop_sum_coarse, predictions))

              precision    recall  f1-score   support

           0       0.72      0.93      0.81      6366
           1       0.83      0.48      0.61      4474

    accuracy                           0.75     10840
   macro avg       0.78      0.71      0.71     10840
weighted avg       0.77      0.75      0.73     10840

              precision    recall  f1-score   support

           0       0.78      0.86      0.82      6366
           1       0.76      0.66      0.71      4474

    accuracy                           0.78     10840
   macro avg       0.77      0.76      0.76     10840
weighted avg       0.78      0.78      0.77     10840

              precision    recall  f1-score   support

           0       0.79      0.86      0.82      6366
           1       0.77      0.67      0.71      4474

    accuracy                           0.78     10840
   macro avg       0.78      0.76      0.77     10840
weighted avg       0.78      0.78      0.78     10840

              preci