In [37]:
import numpy as np
import pandas as pd 
import string
import re
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from numpy import asarray
from numpy import save
from numpy import load
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package punkt to /Users/neeraj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [42]:
# Load the train/test tables plus the precomputed training features and labels
train_data = pd.read_csv('../data/fnc-1/final_train.csv')
test_data = pd.read_csv('../data/fnc-1/final_test.csv')
X_train = np.load('../data/fnc-1/x_train.npy')
# NOTE(review): allow_pickle=True unpickles arbitrary objects on load; only
# use it with label files that come from a trusted source.
# The labels are cast to int right away to prevent an unknown type error.
Y_train = np.load('../data/fnc-1/y_train.npy', allow_pickle=True).astype('int')

In [13]:
# Clean the datasets: strip markup, punctuation and other special characters
def clean_data(text):
    """Normalise raw article text before tokenisation.

    Steps, in order:
      1. strip leading/trailing whitespace;
      2. drop bracketed spans that are followed by a punctuation mark and a
         space (e.g. ``"[1]. "``), doubled ``<br />`` tags, hyphens and
         slashes;
      3. drop every remaining character that is not an ASCII letter, a digit
         or whitespace (this covers all of ``string.punctuation``);
      4. lowercase the result.

    Digits are kept. Characters are deleted, not replaced by a space, so
    ``"Mail-In"`` becomes ``"mailin"``.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    text = text.strip()
    # Markup has to go first: both patterns rely on characters ('[', ']',
    # '<', '/', '>') that the generic filter below deletes, so running them
    # after it would make them unable to match anything.
    text = re.sub(r'\[[^]]*\][.;:!\'?,"()\[\]] ', '', text)
    text = re.sub(r'(<br\s*/><br\s*/>)|(\-)|(\/)', '', text)
    # The range must be A-Z: the former A-z also spans '[', '\', ']', '^',
    # '_' and '`', which let those characters through. With the correct
    # range this single pass removes every punctuation character, so no
    # separate punctuation-stripping step is needed.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.lower()

# Remove stopwords 
# NOTE(review): this needs the NLTK 'stopwords' corpus; the import cell only
# downloads 'punkt' - confirm the corpus is installed on a fresh machine.
stop_words = set(stopwords.words("english"))


def remove_stopwords(text):
    """Drop English stop words and join what is left with single spaces.

    `text` is iterated element by element, so it is expected to be a
    sequence of tokens (preprocess passes the output of word_tokenize);
    a plain string would be filtered character by character.
    """
    kept_tokens = filter(lambda token: token not in stop_words, text)
    return " ".join(kept_tokens)

# Stemming to reduce words to their word stem for train data using 
# Porter Stemming or Lancaster Stemming algorithms.
def perform_stemming(text):
    """Stem every whitespace-separated word of `text` with the Porter stemmer.

    Returns the stemmed words joined by single spaces. LancasterStemmer is
    imported as an alternative but is not used here.
    """
    stemmer = PorterStemmer()
    return " ".join(map(stemmer.stem, text.split()))

# Lemmatization to reduce inflectional forms to a common base form
def perform_lemmatization(text):
    """Lemmatize every whitespace-separated word of `text` with WordNet.

    Returns the lemmas joined by single spaces.
    """
    # NOTE(review): WordNetLemmatizer presumably needs the NLTK 'wordnet'
    # corpus, which the import cell does not download - confirm.
    wordnet = WordNetLemmatizer()
    return " ".join(wordnet.lemmatize(token) for token in text.split())

In [28]:
# Create the term-document feature matrix (tf vectors plus tf-idf cosine similarity) for the heading and body columns
def create_term_document_matrix(df_type, tf, tfidf):
    """Build one feature vector per row of `df_type`.

    For every row the 'articleHeading' and 'articleBody' strings are
    transformed with both vectorizers. The feature vector is the heading tf
    vector, followed by the body tf vector, followed by the cosine
    similarity between the heading and body tf-idf vectors.

    Args:
        df_type: frame with 'articleHeading' and 'articleBody' columns.
        tf: fitted vectorizer used for the term-frequency features.
        tfidf: fitted vectorizer used only for the similarity feature.

    Returns:
        A numpy array built from the list of per-row feature vectors.
    """
    def _dense(vectorizer, document):
        # Vectorizers take an iterable of documents and return a sparse matrix.
        return vectorizer.transform([document]).toarray()

    rows = []
    for _, record in df_type.iterrows():
        heading = record['articleHeading']
        body = record['articleBody']

        # Term-frequency rows, forced to shape (1, n_terms)
        heading_tf = _dense(tf, heading).reshape(1, -1)
        body_tf = _dense(tf, body).reshape(1, -1)

        # tf-idf rows feed the similarity only; they are not part of the
        # feature vector itself.
        heading_tfidf = _dense(tfidf, heading)
        body_tfidf = _dense(tfidf, body)
        similarity = cosine_similarity(heading_tfidf, body_tfidf).reshape(1, -1)

        # Column-wise concatenation, then flatten the single row to 1-D
        rows.append(np.squeeze(np.hstack([heading_tf, body_tf, similarity])))

    return np.array(rows)

In [25]:
# Fetch the combined unique strings from the headings and bodies
def fetch_final_strings_combined(df_type):
    """Collect the distinct heading and body strings, in first-seen order.

    All 'articleHeading' values are visited first, then all 'articleBody'
    values; a value already seen (in either column) is not added again.

    Args:
        df_type: mapping-like object (e.g. a DataFrame) whose
            'articleHeading' and 'articleBody' entries are iterables of
            hashable values.

    Returns:
        A list of the unique values in order of first appearance.
    """
    # A dict keeps insertion order and has O(1) membership checks, which
    # avoids rescanning a growing list for every value (quadratic on large
    # frames). Updating with keys that already exist leaves their original
    # position untouched.
    unique_strings = dict.fromkeys(df_type['articleHeading'])
    unique_strings.update(dict.fromkeys(df_type['articleBody']))
    return list(unique_strings)

In [29]:
# Corpus of unique heading/body strings taken from the training table
train_vocabulary = fetch_final_strings_combined(train_data)

# Two vectorizers over the same corpus, each capped at 2500 features:
# `tf` has idf weighting switched off, `tfidf` has it switched on.
tf = TfidfVectorizer(use_idf=False, max_features=2500)
tfidf = TfidfVectorizer(use_idf=True, max_features=2500)

# Learn the vocabulary (and, for `tfidf`, the idf weights) from the training set
count_train_tfvectorizer = tf.fit(train_vocabulary)
count_train_tfidfvectorizer = tfidf.fit(train_vocabulary)

In [30]:
def preprocess(text):
    """Run the full text pipeline on one raw string.

    Order: clean_data -> word_tokenize -> remove_stopwords ->
    perform_stemming -> perform_lemmatization. Returns the final string.
    """
    tokens = word_tokenize(clean_data(text))
    without_stopwords = remove_stopwords(tokens)
    stemmed = perform_stemming(without_stopwords)
    return perform_lemmatization(stemmed)

In [71]:
# Logistic Regression classifier
# 'saga' is a stochastic solver, so a fixed random_state is needed for the
# fitted coefficients (and therefore the predictions) to be reproducible
# across kernel restarts.
# NOTE(review): newer scikit-learn releases deprecate `multi_class`; confirm
# against the installed version before removing or keeping it.
logisticRegression = LogisticRegression(solver='saga', multi_class='multinomial', random_state=42)
logisticRegression.fit(X_train, Y_train)

def main():
    """Prompt for a heading and a body and classify the pair.

    Both inputs go through preprocess, are placed in a one-row frame with
    'articleHeading' and 'articleBody' columns, turned into features with
    create_term_document_matrix and passed to the fitted classifier.

    Returns:
        The result of logisticRegression.predict for that single row.
    """
    print('Enter the article heading: ')
    raw_heading = input('\n')
    print('Enter the article body')
    raw_body = input('\n')

    sample = pd.DataFrame({
        'articleHeading': [preprocess(raw_heading)],
        'articleBody': [preprocess(raw_body)],
    })
    features = create_term_document_matrix(sample, tf, tfidf)
    return logisticRegression.predict(features)


In [98]:
final = main()
print('\n\n')
# Map the predicted class id to its stance name; any id other than 0, 1 or 2
# is reported as 'Unrelated'.
print({0: 'Agree', 1: 'Disagree', 2: 'Discuss'}.get(final[0], 'Unrelated'))

Enter the article heading: 

Does the Obama Foundation Own 82% of Mail-In Ballot Printers?
Enter the article body

Hailing it as a major step in changing humanity’s prevailing forms of meat consumption, Singapore on Friday announced governmental approval of the first-ever lab-grilled chicken. “After years of research and testing, consumers will now be able purchase delicious lab-grilled meat,” said Singapore Food Agency spokesperson Kuan Sim, adding that the approval was the first step to replacing chicken grilled in restaurants and the home with lab-grilled chicken that was just as succulent and tasty as th



Unrelated


In [91]:
# The row label was missing (empty `[]` is a syntax error). Row 404 is the
# one inspected by the neighbouring body/stance cells and matches the
# recorded output.
test_data.loc[404, 'articleHeading']

'No gender segregation on Saudia'

In [93]:
# Body text of test row 404
test_data.loc[404, 'articleBody']

"Saudi Arabian Airlines on Sunday dismissed claims made by some local media outlets that the national flag carrier is planning to segregate men and women on its flights.\nDenying that such arrangements are being considered, Abdullah Al-Ajhar, the airline’s spokesman, termed the reports as “false” and “misleading.”\nSpeaking to Arab News, Al-Ajhar asserted that there are no plans to separate passengers based on their gender.\nA few days ago, international media, quoting a local daily, reported that Saudia is planning to segregate according to gender, following complaints of uncomfortable journeys by male relatives of female passengers.\nAccording to the report, some passengers complained that females sitting next to non-related male passengers felt uncomfortable.\nIn the report, the international news organization quoted Abdul Rahman Al-Fahd, airline’s vice president for marketing, as saying that measures would be taken to solve this problem.\nHowever, the official concerned denied havi

In [94]:
# Gold stance label of test row 404
test_data.loc[404, 'articleStance']

'agree'