# In [7]:
import json 
from string import punctuation
import collections
import numpy as np
from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
# !!! MAKE SURE TO USE SVC.decision_function(X), NOT SVC.predict(X) !!!
# (this makes ``continuous-valued'' predictions)
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn import metrics

def read_vector_file(fname):
    """
    Reads and returns a vector from a file.
    
    Parameters
    --------------------
        fname  -- string, filename
        
    Returns
    --------------------
        labels -- numpy array parsed from the file by np.genfromtxt
                    for a file with one value per line this has shape (n,),
                    including the n == 1 case
    """
    # np.genfromtxt collapses a file holding a single value into a 0-d array,
    # which cannot be indexed or passed to len(); promote it to 1-d so the
    # result is always at least a vector.
    return np.atleast_1d(np.genfromtxt(fname))
def extract_words(input_string):
    """
    Tokenizes input_string: every punctuation mark becomes a token of its own
    and the remaining text is split on whitespace.
    
    Parameters
    --------------------
        input_string -- string of characters
    
    Returns
    --------------------
        words        -- list of lowercase tokens, in order of appearance
    """
    
    # Surround each punctuation mark with spaces in a single pass so that the
    # whitespace split below isolates it from neighbouring characters.
    padding = str.maketrans({mark: ' ' + mark + ' ' for mark in punctuation})
    spaced = input_string.translate(padding)
    
    return spaced.lower().split()

def extract_dictionary(article_list):
    """
    Builds a dictionary of the unique words found in a collection of texts.
    
    Parameters
    --------------------
        article_list -- iterable of strings, one per text
    
    Returns
    --------------------
        word_list -- dictionary, (key, value) pairs are (word, index)
                     indices count up from 0 in order of first appearance;
                     tokens of two characters or fewer are left out
    """
    
    vocabulary = {}

    for text in article_list:
        for token in extract_words(text):
            if len(token) <= 2:
                continue
            # setdefault leaves known tokens untouched; for a new token the
            # current size of the dictionary is the next unused index.
            vocabulary.setdefault(token, len(vocabulary))

    return vocabulary

def extract_feature_vectors(article_list, word_list):
    """
    Produces a bag-of-words representation of the texts in article_list based
    on the dictionary word_list.
    
    Parameters
    --------------------
        article_list   -- list of strings, one per text
        word_list      -- dictionary, (key, value) pairs are (word, index)
    
    Returns
    --------------------
        feature_matrix -- numpy array of shape (n,d)
                          (0,1) array indicating word presence in a text
                            n is the number of texts in article_list
                            d is the number of entries in word_list
    """
    
    num_articles = len(article_list)
    num_words = len(word_list)
    feature_matrix = np.zeros((num_articles, num_words))
    
    for row, text in enumerate(article_list):
        for word in extract_words(text):
            # Tokens of two characters or fewer are never used as features.
            if len(word) <= 2:
                continue
            # A word missing from the dictionary has no column; skip it
            # instead of raising KeyError, so texts that were not used to
            # build word_list can still be vectorized.
            column = word_list.get(word)
            if column is not None:
                feature_matrix[row, column] = 1
        
    return feature_matrix

### END FUNCTIONS ###
# Posts that have training labels (see y_train below).
with open("fifty_articles.json", encoding="utf-8") as read_file1:
    data = json.load(read_file1)
# this file has ALL the blog posts 
with open("blogposts (1).json", encoding="utf-8") as read_file2:
    data_all = json.load(read_file2)
    
## extract labels from text file 
y_train = read_vector_file('article_training_labels_50.txt')

## extract the article bodies used for training (they become X_train below)
articles = [post['body'] for post in data['response']['posts']]

## extract full article list (these become X_test below)
articles_all = [post['content'] for post in data_all]
articles_all_success_indices = []

# extract titles from json file 
titles = [post['title'] for post in data['response']['posts']]

# extract titles from json file with all the posts
titles_all = [post['title'] for post in data_all]
   
dictionary = extract_dictionary(articles) 
dictionary_all_titles = extract_dictionary(titles_all)
# Build the combined body + title vocabulary in ONE pass so that every word
# receives its own index. Each call to extract_dictionary numbers its words
# from 0, so merging two separately built dictionaries would hand the same
# index to unrelated words and make them share a feature column.
dictionary_all = extract_dictionary(articles_all + titles_all)


### Show Review, Album Review, Concert Review, Interview, UCLA Radio News, UCLA Radio Sports, UCLA Radio Comedy, Film Review, 


# Both matrices use the same vocabulary so their columns line up.
X_train = extract_feature_vectors(articles, dictionary_all)
X_test = extract_feature_vectors(articles_all, dictionary_all)

# Train on the labelled posts and predict a label for every post.
classifier = OneVsRestClassifier(LinearSVC(random_state=0))
y_test = classifier.fit(X_train, y_train).predict(X_test)

#for i in range(len(y_test)):
#    if y_test[i] == 3:
#        print(titles_all[i])


# I want to add a new category for data_all[article]['content'] but replace content with 'topic' 
# The predicted label is used as an index into this list.
category_list = ["Invalid Tag","Show Review", "Music Review", "Interview", "Sports", "News","Entertainment"] 


# y_test has one prediction per entry of data_all, in the same order.
for post, label in zip(data_all, y_test):
    post['topic'] = category_list[int(label)]
    
print(y_train)
print(data_all[4].keys())
#with open('dump2.json', 'w') as f:  # writing JSON object
#    json.dump(data_all, f)


# Output of the cell above:
# [1. 5. 3. 6. 1. 1. 2. 1. 3. 3. 1. 1. 3. 2. 3. 4. 4. 3. 1. 5. 1. 1. 1. 3.
#  1. 3. 2. 5. 1. 1. 1. 3. 3. 5. 3. 1. 2. 1. 2. 1. 1. 1. 1. 1. 1. 1. 1. 1.
#  1. 1.]
# dict_keys(['_id', 'id', 'title', 'content', 'platform', 'tags', 'date', '__v', 'topic'])
