In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils import shuffle, resample

## Load data 

In [2]:
news = pd.read_csv("/home/jupyter-ozkan_ma/data/CSV/news_preprocessed_with_addtionalLabel.csv", index_col=0)

In [3]:
# Get the same train and test data
def split_df_in_train_test(df):
    df = df.reset_index()
    split_point = int(np.round(df.shape[0]) * 0.8)
    df_train = df.loc[:split_point-1,:]
    df_test = df.loc[split_point:,:]
    return df_train, df_test

In [4]:
left_FE = resample(shuffle(news[(news["Label"]=="Left") & (news["Length"]<512)], random_state=42), \
         random_state=42, n_samples=15000)
leanLeft_FE = resample(shuffle(news[(news["Label"]=="Lean Left") & (news["Length"]<512)], random_state=42), \
         random_state=42, n_samples=15000)
center_FE = resample(shuffle(news[(news["Label"]=="Center") & (news["Length"]<512)], random_state=42), \
         random_state=42, n_samples=15000)
leanRight_FE = resample(shuffle(news[(news["Label"]=="Lean Right") & (news["Length"]<512)], random_state=42), \
         random_state=42, n_samples=15000)
right_FE = resample(shuffle(news[(news["Label"]=="Right") & (news["Length"]<512)], random_state=42), \
         random_state=42, n_samples=15000)

In [5]:
train = pd.concat([split_df_in_train_test(left_FE)[0], \
    split_df_in_train_test(leanLeft_FE)[0], \
    split_df_in_train_test(center_FE)[0], \
    split_df_in_train_test(leanRight_FE)[0], \
    split_df_in_train_test(right_FE)[0]])

In [6]:
test =  pd.concat([split_df_in_train_test(left_FE)[1], \
    split_df_in_train_test(leanLeft_FE)[1], \
    split_df_in_train_test(center_FE)[1], \
    split_df_in_train_test(leanRight_FE)[1], \
    split_df_in_train_test(right_FE)[1]])

In [7]:
X_train, y_train = train["pre_content_str"], train["Label"]
X_test, y_test = test["pre_content_str"], test["Label"]

## Generate TFIDF vector

### Bigram:

In [13]:
bigram_vec = TfidfVectorizer(stop_words="english", max_features=30000, ngram_range=(1, 2))

X_train_bi = bigram_vec.fit_transform(X_train.apply(lambda x: np.str_(x)))
X_test_bi = bigram_vec.transform(X_test.apply(lambda x: np.str_(x)))

### Trigram

In [14]:
trigram_vec = TfidfVectorizer(stop_words="english", max_features=30000, ngram_range=(1, 3))

X_train_tri = trigram_vec.fit_transform(X_train.apply(lambda x: np.str_(x)))
X_test_tri = trigram_vec.transform(X_test.apply(lambda x: np.str_(x)))

## Generate LabelEncoder

In [15]:
label_enc = LabelEncoder()
y_train_enc = label_enc.fit_transform(y_train)
y_test_enc = label_enc.fit_transform(y_test)

In [16]:
label_enc.inverse_transform([0, 1, 2, 3, 4]) 

array(['Center', 'Lean Left', 'Lean Right', 'Left', 'Right'], dtype=object)

In [17]:
label = [0, 1, 2, 3, 4]
target_label = ["Center", "Lean Left", "Lean Right", "Left", "Right"]

## Apply classifiers

In [18]:
def run_classifier(clf, X_train, X_test, y_train, y_test, label, target_label):
    
    print("Training of the classifier: {} \n".format(clf))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print("\n")

    print("Accuracy of the classifier:     ")
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)

    print("\n")

    print("Confusion Matrix of the classifier: \n")
    con_mat = confusion_matrix(y_test, y_pred, labels=label)
    print(con_mat)

    print("\n")

    print("Classification Report of the classifier: \n")
    report = classification_report(y_test, y_pred, target_names=target_label)
    print(report)

In [24]:
dt = DecisionTreeClassifier(random_state=42)
svc = LinearSVC()
lr = LogisticRegression(multi_class="multinomial", solver="saga")
nb = BernoulliNB()

### ...using with bigrams

#### Decision Tree

In [20]:
run_classifier(dt, X_train_bi, X_test_bi, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best') 



Accuracy of the classifier:     
0.6266


Confusion Matrix of the classifier: 

[[1860  297  273  259  311]
 [ 284 1855  262  265  334]
 [ 265  269 2032  159  275]
 [ 278  262  161 1997  302]
 [ 317  346  345  337 1655]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

      Center       0.62      0.62      0.62      3000
   Lean Left       0.61      0.62      0.62      3000
  Lean Right       0.66      0.68      0.67      3000
        Left       0.66      0.67      0.66      3000
       Right       0.58     

#### Naive Bayes

In [21]:
run_classifier(nb, X_train_bi, X_test_bi, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True) 



Accuracy of the classifier:     
0.5291333333333333


Confusion Matrix of the classifier: 

[[ 935  335 1147  454  129]
 [ 177 1317  666  620  220]
 [ 190  189 2215  221  185]
 [ 120  209  415 2069  187]
 [ 145  270  668  516 1401]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

      Center       0.60      0.31      0.41      3000
   Lean Left       0.57      0.44      0.50      3000
  Lean Right       0.43      0.74      0.55      3000
        Left       0.53      0.69      0.60      3000
       Right       0.66      0.47      0.55      3000

    accuracy                           0.53     15000
   macro avg       0.56      0.53      0.52     15000
weighted avg       0.56      0.53      0.52     15000



#### Support Vector Machine

In [22]:
run_classifier(svc, X_train_bi, X_test_bi, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0) 



Accuracy of the classifier:     
0.7019333333333333


Confusion Matrix of the classifier: 

[[1938  248  374  221  219]
 [ 231 2105  253  210  201]
 [ 267  236 2146  113  238]
 [ 202  164  103 2336  195]
 [ 237  248  239  272 2004]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

      Center       0.67      0.65      0.66      3000
   Lean Left       0.70      0.70      0.70      3000
  Lean Right       0.69      0.72      0.70      3000
        Left       0.74      0.78      0.76      3000
       Right       0.70      0.67      0.68      3000

    accuracy                           0.70     15000
   macro avg       0.70      0.70      0.70     15000
weighted avg       0.70   

#### Logistic Regression

In [25]:
run_classifier(lr, X_train_bi, X_test_bi, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False) 



Accuracy of the classifier:     
0.6770666666666667


Confusion Matrix of the classifier: 

[[1777  309  392  281  241]
 [ 233 1996  285  263  223]
 [ 281  234 2120  109  256]
 [ 218  170   87 2290  235]
 [ 248  247  253  279 1973]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

      Center       0.64      0.59      0.62      3000
   Lean Left       0.68      0.67      0.67      3000
  Lean Right       0.68      0.71      0.69      3000
        Left       0.71      0.76      0.74      3000
       Right       0.67      0.66      0.67      3000

    accuracy                    

### ...using trigrams

#### Decision Tree

In [26]:
run_classifier(dt, X_train_tri, X_test_tri, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best') 



Accuracy of the classifier:     
0.624


Confusion Matrix of the classifier: 

[[1857  298  295  257  293]
 [ 289 1816  273  292  330]
 [ 251  250 2051  157  291]
 [ 275  284  181 1989  271]
 [ 305  353  350  345 1647]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

      Center       0.62      0.62      0.62      3000
   Lean Left       0.61      0.61      0.61      3000
  Lean Right       0.65      0.68      0.67      3000
        Left       0.65      0.66      0.66      3000
       Right       0.58      

#### Naive Bayes

In [27]:
run_classifier(nb, X_train_tri, X_test_tri, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True) 



Accuracy of the classifier:     
0.5288666666666667


Confusion Matrix of the classifier: 

[[ 870  323 1227  460  120]
 [ 151 1348  680  627  194]
 [ 157  199 2258  210  176]
 [  99  194  449 2075  183]
 [ 118  261  733  506 1382]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

      Center       0.62      0.29      0.40      3000
   Lean Left       0.58      0.45      0.51      3000
  Lean Right       0.42      0.75      0.54      3000
        Left       0.54      0.69      0.60      3000
       Right       0.67      0.46      0.55      3000

    accuracy                           0.53     15000
   macro avg       0.57      0.53      0.52     15000
weighted avg       0.57      0.53      0.52     15000



#### Support Vector Machine

In [29]:
run_classifier(svc, X_train_tri, X_test_tri, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0) 



Accuracy of the classifier:     
0.7022


Confusion Matrix of the classifier: 

[[1947  235  368  215  235]
 [ 233 2093  255  213  206]
 [ 272  228 2141  115  244]
 [ 198  171   98 2337  196]
 [ 243  233  252  257 2015]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

      Center       0.67      0.65      0.66      3000
   Lean Left       0.71      0.70      0.70      3000
  Lean Right       0.69      0.71      0.70      3000
        Left       0.74      0.78      0.76      3000
       Right       0.70      0.67      0.68      3000

    accuracy                           0.70     15000
   macro avg       0.70      0.70      0.70     15000
weighted avg       0.70      0.70     

#### Logistic Regression

In [30]:
run_classifier(lr, X_train_tri, X_test_tri, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False) 



Accuracy of the classifier:     
0.6796666666666666


Confusion Matrix of the classifier: 

[[1788  305  391  273  243]
 [ 242 2003  284  262  209]
 [ 279  236 2123  111  251]
 [ 220  167   86 2298  229]
 [ 255  234  253  275 1983]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

      Center       0.64      0.60      0.62      3000
   Lean Left       0.68      0.67      0.67      3000
  Lean Right       0.68      0.71      0.69      3000
        Left       0.71      0.77      0.74      3000
       Right       0.68      0.66      0.67      3000

    accuracy                    