In [18]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from numpy import asarray
from numpy import save
from numpy import load
from tqdm import tqdm

In [64]:
# Import the datasets for train and test 
train_data = pd.read_csv('../data/fnc-1/preprocess_train.csv')
test_data = pd.read_csv('../data/fnc-1/preprocess_test.csv')

In [65]:
# Create term document matrix fo the columns, tf-idf
def create_term_document_matrix(df_type, tf, tfidf):
    final_array = list()
    for i, val in df_type.iterrows():
        normalised_articleHeading = val['articleHeading']
        normalised_articleBody = val['articleBody']
        # Transform article heading to document-term matrix for tf
        term_document_matrix_heading_tf = tf.transform([normalised_articleHeading])
        # Return a ndarray such that the new shape should be compatible with the original shape
        term_document_matrix_heading_tf = term_document_matrix_heading_tf.toarray().reshape(1, -1)
        # Transform article body to document-term matrix for tf
        term_document_matrix_body_tf = tf.transform([normalised_articleBody])
        # Return a ndarray such that the new shape should be compatible with the original shape
        term_document_matrix_body_tf = term_document_matrix_body_tf.toarray().reshape(1, -1)
        # Transform article heading to document-term matrix for tf-idf 
        term_document_matrix_heading_tfidf = tfidf.transform([normalised_articleHeading])
        # Return the ndarray for the tf-idf of article headings
        term_document_matrix_heading_tfidf =  term_document_matrix_heading_tfidf.toarray()
        # Transform article body to document-term matrix for tf-idf 
        term_document_matrix_body_tfidf = tfidf.transform([normalised_articleBody])
        # Return the ndarray for the tf-idf of article body
        term_document_matrix_body_tfidf =  term_document_matrix_body_tfidf.toarray()
        # Get the cosine similarity 
        term_document_matrix_cosine_similarity = cosine_similarity(term_document_matrix_heading_tfidf, term_document_matrix_body_tfidf)
        # Transform into the original shape 
        term_document_matrix_cosine_similarity = term_document_matrix_cosine_similarity.reshape(1, -1)
        # Get the final featured vectors 
        featured_vectors = np.squeeze(np.c_[term_document_matrix_heading_tf, term_document_matrix_body_tf, term_document_matrix_cosine_similarity])
#         featured_vectors = np.squeeze(np.c_[term_document_matrix_heading_tfidf, term_document_matrix_body_tfidf, term_document_matrix_cosine_similarity])
        # Append the featured vectors to the final data array 
        final_array.append(featured_vectors)
    # Convert the final array into numpy array 
    final_array = np.array(final_array)
    return final_array 

In [66]:
# # Manually calculating the TF-IDF 
# def calculate_term_frequency(words, bow):
#     tf = dict()
#     bowCount = len(bow)
#     for word, count in wordDict.items():
#         tf[word] = count / float(bowCount)
#     return tf

# # Calculating the idf values 
# def calculate_inverse_document_frequency(documents):
#     n = len(documents)
#     idf = dict.fromkeys(documents[0].keys(), 0)
#     for document in documents:
#         for word, val in document.items():
#             if val > 0:
#                 idf[word] += 1    
#     for word, val in idf.items():
#         idf[word] = math.log(n / float(val))    
#     return idf 

# # Calculating the tf-idf values 
# def calculateTfidf(tf, idf):
#     tfidf = dict()
#     for word, val in tf.items():
#         tfidf[word] = val * idf[word]
#     return tfidf

# bagOfWordsA = documentA.split(' ')
# bagOfWordsB = documentB.split(' ')
# uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))
# numOfWordsA = dict.fromkeys(uniqueWords, 0)
# for word in bagOfWordsA:
#     numOfWordsA[word] += 1

# tfA = calculate_term_frequency(numOfWordsA, bagOfWordsA)
# tfB = calculate_term_frequency(numOfWordsB, bagOfWordsB)
# idfs = calculate_inverse_document_frequency([numOfWordsA, numOfWordsB])
# tfidfA = calculateTfidf(tfA, idfs)
# tfidfB = calculateTfidf(tfB, idfs)

In [67]:
test_data.head()

Unnamed: 0,bodyId,articleHeading,articleBody,articleStance
0,1,appl instal safe instor protect gold watch edit,alsisi deni isra report state offer extend gaz...,unrelated
1,1,elsisi deni claim hell give sinai land palesti...,alsisi deni isra report state offer extend gaz...,agree
2,1,appl keep gold watch edit special instor safe,alsisi deni isra report state offer extend gaz...,unrelated
3,1,appl store keep gold edit appl watch custom safe,alsisi deni isra report state offer extend gaz...,unrelated
4,1,south korean woman hair eaten robot vacuum cle...,alsisi deni isra report state offer extend gaz...,unrelated


In [68]:
# Convert Stances(categorical) into quantitative values for train data where 
# 0 -> agree
# 1 -> disagree
# 2 -> discuss
# 3 -> unrelated
for i, val in enumerate(train_data['articleStance']):
    if val == "agree":
        train_data['articleStance'][i] = 0
    elif val == "disagree":
        train_data['articleStance'][i] = 1
    elif val == "discuss":
        train_data['articleStance'][i] = 2
    else:
        train_data['articleStance'][i] = 3
        
train_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,bodyId,articleHeading,articleBody,articleStance
0,0,soldier shot parliament lock gunfir erupt war ...,small meteorit crash wood area nicaragua capit...,3
1,0,tourist dub spider man spider burrow skin day,small meteorit crash wood area nicaragua capit...,3
2,0,luke somer kill fail rescu attempt yemen,small meteorit crash wood area nicaragua capit...,3
3,0,break soldier shot war memori ottawa,small meteorit crash wood area nicaragua capit...,3
4,0,giant 8ft 9in catfish weigh 19 stone caught it...,small meteorit crash wood area nicaragua capit...,3


In [69]:
# Convert Stances(categorical) into quantitative values for test data where 
# 0 -> agree
# 1 -> disagree
# 2 -> discuss
# 3 -> unrelated
for i, val in enumerate(test_data['articleStance']):
    if val == "agree":
        test_data['articleStance'][i] = 0
    elif val == "disagree":
        test_data['articleStance'][i] = 1
    elif val == "discuss":
        test_data['articleStance'][i] = 2
    else:
        test_data['articleStance'][i] = 3
        
test_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,bodyId,articleHeading,articleBody,articleStance
0,1,appl instal safe instor protect gold watch edit,alsisi deni isra report state offer extend gaz...,3
1,1,elsisi deni claim hell give sinai land palesti...,alsisi deni isra report state offer extend gaz...,0
2,1,appl keep gold watch edit special instor safe,alsisi deni isra report state offer extend gaz...,3
3,1,appl store keep gold edit appl watch custom safe,alsisi deni isra report state offer extend gaz...,3
4,1,south korean woman hair eaten robot vacuum cle...,alsisi deni isra report state offer extend gaz...,3


In [8]:
# Fetch the combined unqiue strings in headings and body
def fetch_final_strings_combined(df_type):
    final_strings_combined = list()
    # Loop over each column and append the values 
    for i, val in enumerate(df_type['articleHeading']):
        if val not in final_strings_combined:
            final_strings_combined.append(val)
    for i, val in enumerate(df_type['articleBody']):
        if val not in final_strings_combined:
            final_strings_combined.append(val)
    # Return the final combined array of unique strings
    return final_strings_combined

In [9]:
# Get the common vocabulary of strings for train data
train_vocabulary = fetch_final_strings_combined(train_data)
# Learn vocabulary training set.
tf = TfidfVectorizer(max_features = 2500, use_idf = False)
count_train_tfvectorizer = tf.fit(train_vocabulary)
# Learn vocabulary and idf from training set.
tfidf = TfidfVectorizer(max_features = 2500, use_idf = True)
count_train_tfidfvectorizer = tfidf.fit(train_vocabulary)
# Get the final term document matrix for X_train 
X_train = create_term_document_matrix(train_data, count_train_tfvectorizer, count_train_tfidfvectorizer)

In [10]:
# Check the shape of the X_train array
X_train.shape

(49972, 5001)

In [11]:
# Get the common vocabulary of strings for test data
# test_vocabulary = fetch_final_strings_combined(test_data)
# Learn vocabulary training set.
# count_test_tfvectorizer = TfidfVectorizer(max_features = 2500, use_idf = False).fit(test_vocabulary)
# Learn vocabulary and idf from training set.
# count_test_tfidfvectorizer = TfidfVectorizer(max_features = 2500, use_idf = True).fit(test_vocabulary)
# Get the final term document matrix for X_train 
X_test = create_term_document_matrix(test_data, tf, tfidf)

In [12]:
X_test.shape

(25413, 5001)

In [12]:
# Save the X_train numpy array 
save('../data/fnc-1/x_train.npy', X_train)
# Save the X_test numpy array 
save('../data/fnc-1/x_test.npy', X_test)

In [13]:
# Loading the numpy arrays
# X_train = load('../data/fnc-1/x_train.npy')
# X_test = load('../data/fnc-1/x_test.npy')

In [14]:
# Get the Y_train value array and save it
Y_train = train_data['articleStance'].values
save('../data/fnc-1/y_train.npy', Y_train, allow_pickle=True)
type(Y_train)

numpy.ndarray

In [15]:
# Get the Y_test value array and save it 
Y_test = test_data['articleStance'].values 
save('../data/fnc-1/y_test.npy', Y_test, allow_pickle=True)
type(Y_test)

numpy.ndarray

In [16]:
from collections import Counter
Counter(train_data['articleStance'])

Counter({3: 36545, 0: 3678, 2: 8909, 1: 840})

In [17]:
Counter(test_data['articleStance'])

Counter({3: 18349, 0: 1903, 2: 4464, 1: 697})

In [40]:
# xx['vector'] = np.array(X_train, dtype=object)

In [70]:
new_df = train_data

In [74]:
new_df['vector'] = [0] * len(new_df)

In [75]:
for i in range(len(train_data)):
    new_df['vector'][i] = np.array(X_train[i], dtype="object")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [81]:
new_disagree_df = new_df[new_df.articleStance == 1]
temp_df = pd.DataFrame().append([new_disagree_df] * 10, ignore_index=True)
temp_df

Unnamed: 0,bodyId,articleHeading,articleBody,articleStance,vector
0,78,batmobil stolen batman v superman dawn justic ...,rumour ridicul pretti amus hard believ given z...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,83,weather report caught write name snow readi go...,there readi go camera there realli realli read...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,124,peni spraypaint 25 million car prank video,might say matter rich spend 25 million car mak...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,176,whoa paul rudd one airport hero took homophob,hunki mensch took violent bulli look like paul...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,218,pope franci turn made pet heaven comment,leader cathol church assur pet lover across wo...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...
8395,2515,stori cathol priest die see god woman come bac...,et si dieu tait une femm cest ce quaffirm le p...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8396,2515,ope hoax priest never exist claim die saw fema...,et si dieu tait une femm cest ce quaffirm le p...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8397,2519,batmobil stolen batman v superman dawn justic ...,friday rumor crop one new batmobil vehicl stol...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8398,2521,nasa confirm earth experi 6 day total dark dec...,hoax stori circul far wide twitter facebook ma...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [96]:
for i, row in temp_df.iterrows():
    if i < 3:
        vectors = np.array(row['vector'])
        print(vectors)
        nonzero_index = vectors.nonzero()[0]
        print(nonzero_index)
        random_vec = np.random.randint(-2, 2, len(nonzero_index))
        print(random_vec)
        vectors[nonzero_index] += random_vec
        print(vectors)
        # Taking the absolute valyes to remove the negative ones
        vectors = np.abs(vectors) 
        print(vectors)
        row['vector'] = vectors.tolist()
        print('\n\n\n')
        print(len(vectors.tolist()))
    else:
        break
new_df = pd.concat([new_df, temp_df])


[0.0 0.0 0.0 ... 0.0 0.0 0.5622893760998064]
[ 282  283  643 1255 1284 1995 2051 2125 2170 2541 2584 2591 2622 2637
 2718 2758 2782 2783 2800 2801 2807 2829 2866 2963 2968 3004 3007 3056
 3081 3126 3143 3196 3271 3304 3326 3375 3413 3422 3442 3462 3472 3475
 3494 3501 3514 3518 3562 3576 3586 3632 3645 3755 3805 3853 3950 3969
 3979 3996 4059 4084 4085 4086 4095 4193 4226 4229 4240 4244 4315 4355
 4395 4423 4429 4431 4444 4470 4491 4495 4501 4510 4551 4561 4577 4595
 4613 4625 4631 4638 4670 4745 4746 4773 4848 4855 4869 4872 4879 4926
 4931 5000]
[-2 -2  1 -1 -2 -1 -1 -2 -2  1  0 -1  0 -1  0  1  1  1  0  1 -1 -2 -1 -2
  1 -1  1  1 -1 -2  0  1  1 -1 -2 -1  1 -2  0  0  1  0  0  1 -2 -1 -2 -1
 -2 -2 -1  0  1  1 -2 -2 -2 -2  1  0  1 -2  0  0 -1 -1  1  0  1 -2  0 -1
  0  0 -2 -2 -1 -2  1 -2  1  1  1 -1 -1 -2 -2  0  0  1 -2 -1  0 -1 -1  0
 -2  0  0 -1]
[0.0 0.0 0.0 ... 0.0 0.0 -0.43771062390019355]
[0.0 0.0 0.0 ... 0.0 0.0 0.43771062390019355]




5001
[0.0 0.0 0.0 ... 0.0 0.0 0.32518383187

In [87]:
new_df

Unnamed: 0,bodyId,articleHeading,articleBody,articleStance,vector
0,0,soldier shot parliament lock gunfir erupt war ...,small meteorit crash wood area nicaragua capit...,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,0,tourist dub spider man spider burrow skin day,small meteorit crash wood area nicaragua capit...,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,0,luke somer kill fail rescu attempt yemen,small meteorit crash wood area nicaragua capit...,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,0,break soldier shot war memori ottawa,small meteorit crash wood area nicaragua capit...,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,0,giant 8ft 9in catfish weigh 19 stone caught it...,small meteorit crash wood area nicaragua capit...,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...
8395,2515,stori cathol priest die see god woman come bac...,et si dieu tait une femm cest ce quaffirm le p...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8396,2515,ope hoax priest never exist claim die saw fema...,et si dieu tait une femm cest ce quaffirm le p...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8397,2519,batmobil stolen batman v superman dawn justic ...,friday rumor crop one new batmobil vehicl stol...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8398,2521,nasa confirm earth experi 6 day total dark dec...,hoax stori circul far wide twitter facebook ma...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
