In [1]:
import os
parentdir = "/Users/sude_umac/PycharmProjects/NLP2/Propaganda_dataset "
train_file= "propaganda_train.tsv"
test_file= "propaganda_val.tsv"
train_path=os.path.join(parentdir,train_file)
test_path= os.path.join(parentdir,test_file) 

In [2]:
import pandas as pd

train_df = pd.read_csv(train_path,delimiter="\t",quotechar='|')
train_df.columns = ['label', 'sentence']

# Load the testing data
test_df = pd.read_csv(test_path,delimiter="\t",quotechar='|')
test_df.columns = ['label', 'sentence']


In [10]:
# it uses the propaganda_val.tsv file as testing data, and it converts the labels into multi-class format instead of binary format. The LabelEncoder is used to convert the labels into multi-class format, and the propaganda_val.tsv file is loaded into a DataFrame, preprocessed, and used as testing data. The classifier is trained on the training data and evaluated on both the validation data and the testing data.


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


# Preprocess the data
train_df['sentence'] = train_df['sentence'].str.lower().str.replace('[^\w\s]', '')
test_df['sentence'] = test_df['sentence'].str.lower().str.replace('[^\w\s]', '')

def extract_snippet(sentence):
    start_tag = '<bos>'
    end_tag = '<eos>'
    start = sentence.lower().find(start_tag) + len(start_tag)
    end = sentence.lower().find(end_tag)
    snippet = sentence[start:end].strip()  # strip() is used to remove leading and trailing whitespace
    return snippet

# Extract snippets from sentences
train_df['snippet'] = train_df['sentence'].apply(extract_snippet)
test_df['snippet'] = test_df['sentence'].apply(extract_snippet)

# Initialize the LabelEncoder
le = LabelEncoder()

# Fit the LabelEncoder to the labels in the 'label' column and transform the labels
train_df['multi_class_label'] = le.fit_transform(train_df['label'])
test_df['multi_class_label'] = le.transform(test_df['label'])

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_df['snippet'], train_df['multi_class_label'], test_size=0.2, random_state=42)

# Create a bag-of-words representation
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

# Train a MultinomialNB classifier
classifier = MultinomialNB()
classifier.fit(X_train_vec, y_train)

# Predict on validation set
y_val_pred = classifier.predict(X_val_vec)

# Evaluate the classifier
print("Validation Accuracy: ", accuracy_score(y_val, y_val_pred))

# Create a bag-of-words representation for the test data
X_test_vec = vectorizer.transform(test_df['snippet'])

# Use the trained classifier to predict on the test set
y_test_pred = classifier.predict(X_test_vec)

# Evaluate the classifier on the test set
print("Test Accuracy: ", accuracy_score(test_df['multi_class_label'], y_test_pred)) #this is the actual test data which tthe model has never seen during training or validation, is used to evaluate the final model's performace 

Validation Accuracy:  0.556640625
Test Accuracy:  0.559375


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Use TF-IDF instead of CountVectorizer
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)
X_test_vec = vectorizer.transform(test_df['snippet'])

# Use SVC instead of MultinomialNB
classifier = SVC()

# Define the parameter grid for SVC
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf', 'poly', 'sigmoid']}

# Use GridSearchCV to find the best parameters
grid = GridSearchCV(classifier, param_grid, refit=True, verbose=2)
grid.fit(X_train_vec, y_train)

# Print the best parameters
print(grid.best_params_)

# Predict on validation set
y_val_pred = grid.predict(X_val_vec)

# Evaluate the classifier
print("Validation Accuracy: ", accuracy_score(y_val, y_val_pred))

# Use the trained classifier to predict on the test set
y_test_pred = grid.predict(X_test_vec)

# Evaluate the classifier on the test set
print("Test Accuracy: ", accuracy_score(test_df['multi_class_label'], y_test_pred))

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.3s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=   0.2s
[CV] END .....................C=0.1, gamma=1, k

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Create a pipeline
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Create parameter grid
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],  # unigrams, bigrams, trigrams
    'clf__alpha': [0.1, 1, 10, 100],  # different alpha values for MultinomialNB
}

# Create an instance of GridSearchCV
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best score and parameters
print("Best score: ", grid_search.best_score_)
print("Best parameters: ", grid_search.best_params_)

# Predict on validation set
y_val_pred = grid_search.predict(X_val)

# Evaluate the classifier
print("Validation Accuracy: ", accuracy_score(y_val, y_val_pred))

# Predict on test set
y_test_pred = grid_search.predict(test_df['snippet'])

# Evaluate the classifier on the test set
print("Test Accuracy: ", accuracy_score(test_df['multi_class_label'], y_test_pred))

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best score:  0.550296380225416
Best parameters:  {'clf__alpha': 0.1, 'vect__ngram_range': (1, 2)}
Validation Accuracy:  0.5546875
Test Accuracy:  0.5765625
