In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils import shuffle, resample

## Load data

In [2]:
news = pd.read_csv("/home/jupyter-ozkan_ma/data/CSV/news_preprocessed_with_addtionalLabel.csv", index_col=0)

In [3]:
# Get the same train and test data
def split_df_in_train_test(df):
    df = df.reset_index()
    split_point = int(np.round(df.shape[0]) * 0.8)
    df_train = df.loc[:split_point-1,:]
    df_test = df.loc[split_point:,:]
    return df_train, df_test

In [4]:
left_AbSt_01 = resample(shuffle(news[(news["Label_AbStudy_01"]=="Left") & (news["Length"]<512)], random_state=42), \
         random_state=42, n_samples=25000)
center_AbSt_01 = resample(shuffle(news[(news["Label_AbStudy_01"]=="Center") & (news["Length"]<512)], random_state=42), \
         random_state=42, n_samples=25000)
right_AbSt_01 = resample(shuffle(news[(news["Label_AbStudy_01"]=="Right") & (news["Length"]<512)], random_state=42), \
         random_state=42, n_samples=25000)

In [5]:
train = pd.concat([split_df_in_train_test(left_AbSt_01)[0], \
    split_df_in_train_test(center_AbSt_01)[0], \
    split_df_in_train_test(right_AbSt_01)[0]])

In [6]:
test = pd.concat([split_df_in_train_test(left_AbSt_01)[1], \
    split_df_in_train_test(center_AbSt_01)[1], \
    split_df_in_train_test(right_AbSt_01)[1]])

In [7]:
X_train, y_train = train["pre_content_str"], train["Label_AbStudy_01"]
X_test, y_test = test["pre_content_str"], test["Label_AbStudy_01"]

## Generate trigram tfidf vector

In [8]:
trigram_vec = TfidfVectorizer(stop_words="english", max_features=30000, ngram_range=(1, 3))

X_train_tri = trigram_vec.fit_transform(X_train.apply(lambda x: np.str_(x)))
X_test_tri = trigram_vec.transform(X_test.apply(lambda x: np.str_(x)))

## Generate Label Encoder

In [9]:
label_enc = LabelEncoder()
y_train_enc = label_enc.fit_transform(y_train)
y_test_enc = label_enc.fit_transform(y_test)

In [12]:
label_enc.inverse_transform([0, 1, 2]) 

array(['Center', 'Left', 'Right'], dtype=object)

In [13]:
label = [0, 1, 2]
target_label = ["Center", "Left", "Right"]

## Define function to run and evaluate classifier

In [14]:
def run_classifier(clf, X_train, X_test, y_train, y_test, label, target_label):
    
    print("Training of the classifier: {} \n".format(clf))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print("\n")

    print("Accuracy of the classifier:     ")
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)

    print("\n")

    print("Confusion Matrix of the classifier: \n")
    con_mat = confusion_matrix(y_test, y_pred, labels=label)
    print(con_mat)

    print("\n")

    print("Classification Report of the classifier: \n")
    report = classification_report(y_test, y_pred, target_names=target_label)
    print(report)

## Train a SVM

In [15]:
svc = LinearSVC()

In [16]:
run_classifier(svc, X_train_tri, X_test_tri, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0) 



Accuracy of the classifier:     
0.7419333333333333


Confusion Matrix of the classifier: 

[[3758  629  613]
 [ 552 3752  696]
 [ 706  675 3619]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

      Center       0.75      0.75      0.75      5000
        Left       0.74      0.75      0.75      5000
       Right       0.73      0.72      0.73      5000

    accuracy                           0.74     15000
   macro avg       0.74      0.74      0.74     15000
weighted avg       0.74      0.74      0.74     15000

