# From JSON to VECTORS

WARNING:
This notebook has lots of dependencies:

pandas
numpy
nltk (plus loaded libraries from within nltk)
sklearn
scipy

In this notebook:

1. We connect data from different sources into 1 DataFrame.


In [1]:
import pandas as pd

## We connect data from different sources into 1 DataFrame.

In [5]:
CNN_df = pd.read_json("DATA/cnn_west_virginia.json")

In [8]:
FN_df = pd.read_json("DATA/foxnews_west_virginia.json")

In [7]:
NYT_df = pd.read_json("DATA/nyt_west_virginia.json")

In [23]:
#Regex for text cleaning
import re

#NLP library
import nltk

#Helper for creating regex 
import string

# Lemmatisation is the algorithmic process of determining the lemma of a word based on its intended meaning.
# Lemmatisation depends on correctly identifying the intended part of speech and meaning
#of a word in a sentence, as well as within the larger context surrounding that sentence
from nltk.stem import WordNetLemmatizer
wnl=WordNetLemmatizer()

#pattern.en module contains a fast part-of-speech tagger for English (CLiPS)
#from pattern.en import tag

#WordNet is a lexical database for the English language.[1] It groups English words into sets of synonyms called synsets,#
#provides short definitions and usage examples, and records a number of relations among these synonym sets or their members. 
from nltk.corpus import wordnet as wn

import numpy as np

import pandas as pd
from pandas import DataFrame, Series


from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


from sklearn.metrics.pairwise import cosine_similarity

In [26]:
# To delete stop words from the text
from nltk.corpus import stopwords
stopword_list=stopwords.words("english")

#Add additional stop words
stopword_list.extend(['www','mail','edu','athttps'])

#For tokenizing
from nltk.tokenize import sent_tokenize, word_tokenize

#remove special characters, this is recommended
remove_characters=re.compile('[^a-zA-Z ]')

In [47]:
def remove_special_characters(text):
#    text = text.decode('utf-8')
    text=text.strip()
    filtered_sentence=re.sub(remove_characters, r' ', text)
    return filtered_sentence

In [37]:
def lemmatize_text(text):
#pos_tagged_text is lower case and has WordNet tags, ready to lemmatize    
    pos_tagged_text = pos_tag_text(text)
    lemmatized_tokens = [wnl.lemmatize(word, pos_tag) if pos_tag
                         else word #if word has a tag lemmatize it and add to the list, otherwise just add the word                    
                         for word, pos_tag in pos_tagged_text]
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text

In [38]:
# Annotate text tokens with POS tags
def pos_tag_text(text):
#Converts Penn Treebank POS tags to WordNet tags    
    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None
    #Use pattern library tagging functions (Penn Treebank syntax)
    tagged_text = tag(text)# Result: list of tuples for each sentence
    #In order to use lemmatizer we need to change POS tags to WordNet tags and make all words lowercase
    tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in
                         tagged_text]
    return tagged_lower_text

In [39]:
#This function removes stopwords
def remove_stopwords(text):
    tokens=tokenize_text(text)
    filtered_tokens=[token for token in tokens if token not in stopword_list]
    filtered_text=" ".join(filtered_tokens)
    return filtered_text

In [40]:
#This fucntion tokenize words in a sentence
def tokenize_text(text):
    text = text.decode('utf-8')
    tokens=nltk.word_tokenize(text)
    tokens=[token.strip() for token in tokens]
    return tokens

In [41]:
def normalize_abstract(abstracts):
    normalized_abstracts=[]
    for abstract in abstracts:
        normalized_abstract=[]
        #First clean data from any special characters
        text=remove_special_characters(abstract)
        #Split abstract into sentences
        sentences=sent_tokenize(text)
        for text in sentences:
            text=lemmatize_text(text)
            text=remove_stopwords(text)
            normalized_abstract.append(text)
        normalized_abstract_string=" ".join(normalized_abstract)
        normalized_abstracts.append(normalized_abstract_string)
    return normalized_abstracts

In [42]:
def build_feature_matrix(abstracts, feature_type='frequency',
                         ngram_range=(1, 1), min_df=0.00, max_df=1.0):

    feature_type = feature_type.lower().strip()  
    
    if feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=min_df,
                                     max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, 
                                     ngram_range=ngram_range,use_idf=True)
    else:
        raise Exception("Wrong feature type entered. Possible values:'frequency', 'tfidf'")

    feature_matrix = vectorizer.fit_transform(abstracts).astype(float)

    
    return vectorizer, feature_matrix

In [43]:
articles=CNN_df.articles_text.tolist()

In [48]:
# Step 1: NORMALIZE YOUR DATA
normalized_articles=normalize_abstract(articles)


NameError: name 'tag' is not defined

In [49]:
# Step 2: EXTRACT FEATURES
tfidf_vectorizer, tfidf_matrix=build_feature_matrix(articles, feature_type="tfidf")

In [50]:
#Get the names of the features in the features matrix, so you are aware of what is happening
feature_names=tfidf_vectorizer.get_feature_names()

#Calculate the adjacency matrix
adj_matrix=cosine_similarity(tfidf_matrix, tfidf_matrix)

In [27]:
for i in range(len(CNN_df)):
    CNN_df["articles_text"][i] = ' '.join(CNN_df["articles_text"][i])

In [59]:
adj_matrix[0]

array([1.        , 0.27486731, 0.34715059, 0.        , 0.24461659,
       0.37151796, 0.30174092, 0.31100895, 0.36484546, 0.2595982 ,
       0.30856416, 0.35022805, 0.30135703, 0.27504421, 0.        ,
       0.29009635, 0.30668694, 0.45320878, 0.40032133, 0.21575573,
       0.33199354, 0.        , 0.        , 0.32897428, 0.36737842,
       0.3855259 , 0.5241428 , 0.19727548, 0.        , 0.30863162,
       0.        , 0.3263225 , 0.30820685, 0.17670712, 0.23724675,
       0.3455495 , 0.37473999, 0.25740417, 0.34771167, 0.        ,
       0.25751855, 0.32028396, 0.32244308, 0.33171465, 0.31548316,
       0.38655153, 0.41494959, 0.29730837, 0.36985265, 0.35020284,
       0.31838694, 0.2620655 , 0.26134971, 0.31023315, 0.        ,
       0.35956499, 0.29296872, 0.37052622, 0.28860029, 0.30090425,
       0.        , 0.39314261, 0.30765935, 0.28937816, 0.29965078,
       0.        , 0.24816642, 0.29452681, 0.10313065, 0.34946721,
       0.32523878, 0.3819188 , 0.3565418 , 0.1460513 , 0.41996

In [54]:
#In order to analyze rows of the matrix we need to change it to dense
from scipy.sparse import csr_matrix
matrix_dense=tfidf_matrix.todense()

In [55]:
i=0
for i in range(len(articles)):
    matrix_dense_row=matrix_dense[i]
    A = np.squeeze(np.asarray(matrix_dense_row))
    important_features=[(d, x) for d, x in enumerate(A) if x > 0.10]
    feature_tuples=zip(*important_features)
    feature_lists=map(list,feature_tuples )
    only_tools["feature_list"][i]=feature_lists[0]
    only_tools["feature_scores"][i]=feature_lists[1]
    i=i+1

TypeError: 'map' object is not subscriptable