In [1]:
import functools
import string

import nltk
import numpy as np
import pandas as pd
from nltk import PorterStemmer as Stemmer
from nltk.corpus import stopwords
# nltk.download('stopwords')
from nltk.tokenize import word_tokenize
# nltk.download('punkt')
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [2]:
train_path = 'SentimentalLIAR/train_final.csv'
test_path = 'SentimentalLIAR/test_final.csv'

# Columns kept from each CSV: the target, the raw statement text and the
# numeric anger/fear/joy/disgust/sad columns.
SELECTED_COLUMNS = ['label', 'statement', 'anger', 'fear', 'joy', 'disgust', 'sad']

# Explicit collapse of the six label values handled here into a binary target.
# An exact-match mapping replaces the previous chain of substring replacements,
# which only worked because of the order of the calls and the upper-case
# replacement values.
BINARY_LABELS = {
    'barely-true': 'FALSE',
    'pants-fire': 'FALSE',
    'false': 'FALSE',
    'half-true': 'TRUE',
    'mostly-true': 'TRUE',
    'true': 'TRUE',
}


def load_liar_split(csv_path):
    """Load one dataset split and binarise its ``label`` column.

    Parameters
    ----------
    csv_path : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        A new frame holding ``SELECTED_COLUMNS`` (in that order) whose
        ``label`` values have been mapped through ``BINARY_LABELS``. Labels
        that are not keys of the mapping are left unchanged.

    Raises
    ------
    KeyError
        If any of ``SELECTED_COLUMNS`` is missing from the file.
    """
    selected = pd.read_csv(csv_path)[SELECTED_COLUMNS]
    # ``assign`` returns a fresh frame, so no write ever happens on a slice of
    # the frame read from disk (avoids SettingWithCopyWarning).
    return selected.assign(label=selected['label'].replace(BINARY_LABELS))


train_df = load_liar_split(train_path)
test_df = load_liar_split(test_path)

train_df

Unnamed: 0,label,statement,anger,fear,joy,disgust,sad
0,FALSE,Says the Annies List political group supports ...,0.121137,0.008926,0.026096,0.263479,0.531887
1,TRUE,When did the decline of coal start? It started...,0.095352,0.124566,0.191357,0.016999,0.102045
2,TRUE,"Hillary Clinton agrees with John McCain ""by vo...",0.039559,0.024162,0.500384,0.454228,0.052453
3,FALSE,Health care reform legislation is likely to ma...,0.004804,0.194674,0.375055,0.022509,0.383403
4,TRUE,The economic turnaround started at the end of ...,0.044237,0.215996,0.222402,0.045672,0.274343
...,...,...,...,...,...,...,...
10231,TRUE,There are a larger number of shark attacks in ...,0.354176,0.290966,0.020772,0.176219,0.216899
10232,TRUE,Democrats have now become the party of the [At...,0.189920,0.077568,0.050404,0.196092,0.410885
10233,TRUE,Says an alternative to Social Security that op...,0.119778,0.206043,0.152902,0.095003,0.220693
10234,FALSE,On lifting the U.S. Cuban embargo and allowing...,0.113820,0.066778,0.312947,0.242768,0.084354


In [3]:
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@functools.lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, built on first use only."""
    return frozenset(stopwords.words('english'))


@functools.lru_cache(maxsize=None)
def _shared_stemmer():
    """Return a single stemmer instance reused across all analyzer calls."""
    return Stemmer()


def custom_analyzer(text):
    """Turn a raw statement into a list of stemmed, stop-word-free tokens.

    Steps, in order: lower-case the text, delete every character found in
    ``string.punctuation``, tokenise with ``word_tokenize``, drop English
    stop words, then stem each remaining token.

    Parameters
    ----------
    text : str
        The raw document text.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order.
    """
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)
    # The stop-word collection is fetched once per call (and built only once
    # overall) instead of being rebuilt for every token; the frozenset also
    # gives constant-time membership checks.
    stop_words = _english_stopwords()
    stem = _shared_stemmer().stem
    return [stem(token) for token in word_tokenize(cleaned) if token not in stop_words]

# TF-IDF weighting over the token lists produced by custom_analyzer.
tfidf_vectorizer = TfidfVectorizer(analyzer=custom_analyzer)

# Only the 'statement' column is vectorised; every other input column is
# passed through unchanged.
column_transformer = ColumnTransformer(
    transformers=[('tfidf1', tfidf_vectorizer, 'statement')],
    remainder='passthrough',
)

# Feature extraction followed by a default-configured support vector classifier.
classifier = Pipeline(
    steps=[
        ('vectorizer', column_transformer),
        ('classifier', SVC()),
    ]
)

In [4]:
# Input columns fed to the pipeline: the raw text plus the five numeric columns.
feature_columns = ['statement', 'anger', 'fear', 'joy', 'disgust', 'sad']

# Fit on the training split.
X = train_df[feature_columns]
Y = train_df['label']
classifier.fit(X, Y)

# Predict on the test split and print per-class precision/recall/F1.
y_pred = classifier.predict(test_df[feature_columns])
print(classification_report(test_df['label'], y_pred))

              precision    recall  f1-score   support

       FALSE       0.59      0.39      0.47       553
        TRUE       0.63      0.79      0.70       714

    accuracy                           0.62      1267
   macro avg       0.61      0.59      0.58      1267
weighted avg       0.61      0.62      0.60      1267

