In [269]:
import pandas as pd
import numpy as np
import re
import scipy

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

### STEPS ###
# 1. tokenize text
# 2. remove stopwords
# 3. discard if words not in wordnet
# 4. discard if words first hypernym not available
# 5. create feature vector of word hypernyms
# 6. assign tfidf weights to feature vector
# 7. calculate sentiment sumscores on words
# 8. multiply sumscore vector by tfidf weights vector
# 9. store tfidf feature vector
# 10. store tfidf*sumscore vector


In [232]:
news_df = pd.read_csv('news/MarketWatch-Combined_pn.csv')

### 1. TOKENIZE TEXT ###
### 2. REMOVE STOPWORDS ###
"""
This is where we remove punctuation and whitespace and also stop words
that have low informational content.
"""
# loading stopwords
stop = set(stopwords.words('english'))

def normalize_text(headline):
    # removing stopwords
    arr = [i for i in nltk.word_tokenize(headline.lower()) if i not in stop]
    
    # remove symbols and numbers
    str = " ".join(arr)
    str = re.sub('[^A-Za-z\s]+', '', str)
    str = re.sub("\s\s+", " ", str)
    
    return str

news_df['headline'] = news_df['headline'].apply(normalize_text)
# print(news_df['headline'])

### 3. DISCARD IF NOT IN WORDNET ###
### 4. DISCARED IF HYPERNYM NOT AVAILABLE ###
### 5. CREATE FEATURE VECTOR OF WORD HYPERNYMS ###

# replace words with hypernyms
def hypernym_replace(headline):
    arr = []
    for word in headline.split():
        # check if word exists in wordnet
        if wn.synsets(word) != []:
            # check if word has hypernyms
            if wn.synsets(word)[0].hypernyms() != []:
                arr.append(wn.synsets(word)[0].hypernyms()[0]._name)
    array = " ".join(arr)
    return array

news_df['headline'] = news_df['headline'].apply(hypernym_replace)
# print(news_df['headline'])

### 5. CREATE FEATURE VECTOR OF WORD HYPERNYMS ###
### 6. ASSIGN TFIDF WEIGHTS TO FEATURE VECTOR ###
vectorizer = TfidfVectorizer()
news_df['tfidf'] = vectorizer.fit_transform(news_df['headline'])
print(news_df['headline'][0])

device.n.01 activity.n.01 commercial_enterprise.n.01 investigation.n.02 magnitude.n.01 assets.n.01 organic_process.n.01 direction.n.01 prediction.n.02 pipe.n.04 activity.n.01 commercial_enterprise.n.01 investigation.n.02 magnitude.n.01 assets.n.01 organic_process.n.01 direction.n.01 prediction.n.02 metallic_element.n.01 activity.n.01 instrumentality.n.03 commercial_enterprise.n.01 investigation.n.02 magnitude.n.01 assets.n.01 organic_process.n.01 direction.n.01 prediction.n.02 beverage.n.01 lipid.n.01 dairy_product.n.01 lipid.n.01 activity.n.01 commercial_enterprise.n.01 direction.n.01 assets.n.01 magnitude.n.01 organic_process.n.01 possibility.n.02 prediction.n.02 device.n.01 activity.n.01 commercial_enterprise.n.01 investigation.n.02 magnitude.n.01 assets.n.01 organic_process.n.01 direction.n.01 prediction.n.02 way.n.06 inform.v.01 evaluation.n.02 letter.n.02 people.n.01 physical_phenomenon.n.01 representational_process.n.01 imaging.n.02 instrumentality.n.03 activity.n.01 prediction.

In [233]:
np.set_printoptions(threshold=np.inf)

### 7. CALCULATE SENTIMENT SUMSCORES ON WORDS ###
# create a one-hot encoding feature vector of word sentiments
# sentiment sum score --> sentiment_sum = pos_sentiment_score + neg_sentiment_score

def remove_periods(df):
    arr = []
    for word in df.split():
        the_real = word.split('.', 1)[0]
        arr.append(the_real)
    return " ".join(arr)

test_df = news_df['headline'].apply(remove_periods)
vectorizer = TfidfVectorizer()
news = vectorizer.fit_transform(test_df)

# tfidf * sentiment_sum_score
cx = scipy.sparse.coo_matrix(news)
for i,j,v in zip(cx.row, cx.col, cx.data):
    # find original hypernyms
    regex=re.compile("%s.*" % vectorizer.get_feature_names()[j])
    word = [m.group(0) for l in news_df['headline'][i].split() for m in [regex.search(l)] if m][0]
    
    # get sum sentiment score
    try:
        breakdown = swn.senti_synset(word)
        sum_sentiment = breakdown.pos_score() + breakdown.neg_score()
    except:
        sum_sentiment = 0
    tfidf_score = news[i,j]
    
    # update original csr_matrix
    news[i,j] = tfidf_score  * sum_sentiment

# this is the tfidf * sentiment_sum_score matrix
# print(news)

  (0, 545)	0.0
  (0, 30)	0.0401676632198
  (0, 368)	0.0
  (0, 1002)	0.034244474494
  (0, 1100)	0.0
  (0, 149)	0.0524444131482
  (0, 1287)	0.0
  (0, 558)	0.0
  (0, 1416)	0.0
  (0, 1361)	0.0
  (0, 1154)	0.0
  (0, 983)	0.0
  (0, 213)	0.0154912410027
  (0, 1079)	0.0
  (0, 492)	0.0
  (0, 1405)	0.0109684669532
  (0, 1919)	0.0
  (0, 965)	0.0
  (0, 677)	0.0
  (0, 1057)	0.0
  (0, 1333)	0.0
  (0, 1355)	0.0
  (0, 1541)	0.0
  (0, 922)	0.0
  (0, 1918)	0.0
  :	:
  (203, 869)	0.0
  (204, 30)	0.0111158849423
  (204, 965)	0.0
  (204, 131)	0.0
  (204, 1685)	0.0
  (204, 942)	0.0
  (204, 1153)	0.0
  (204, 6)	0.0
  (204, 1543)	0.0
  (204, 1059)	0.0
  (204, 601)	0.0
  (204, 1326)	0.0
  (204, 41)	0.0
  (204, 434)	0.216747767934
  (204, 819)	0.0
  (204, 1112)	0.0
  (204, 436)	0.0
  (204, 1446)	0.0
  (204, 633)	0.246252724152
  (204, 95)	0.0
  (204, 298)	0.0
  (204, 1041)	0.0
  (204, 96)	0.0
  (204, 464)	0.0
  (204, 891)	0.0


In [234]:
# drop all zero values from the csr matrix
news = news[news.getnnz(1)>0]

In [235]:
# drop all rows in original df if they don't exist in the tfidf*senti matrix

print(news_df.shape)
del_array = []

for index, row in news_df.iterrows():
    try:
        news[index]
    except:
        del_array.append(index)
        
news_df = news_df.drop(news_df.index[del_array])

print(news_df.shape)

(205, 4)
(119, 4)


In [267]:
# print(news_df.shape)
# print(news_df)
# print(news)

news_df['senti_tfidf'] = news
news_df['label'].isnull().sum()

news = news[pd.notnull(news_df['label'])]

(95, 1970)

In [278]:
# split data into test and train
from sklearn.model_selection import train_test_split
from sklearn import svm
import datetime as dt

X_train, X_test, y_train, y_test = train_test_split(news, news_df['label'], test_size=0.15, random_state=42)

# train our model
param_C = 5
param_gamma = 0.05
classifier = svm.SVC(C=param_C,gamma=param_gamma)

start_time = dt.datetime.now()
print('Start learning at {}'.format(str(start_time)))
classifier.fit(X_train, y_train)
end_time = dt.datetime.now()
print('Stop learning {}'.format(str(end_time)))
elapsed_time= end_time - start_time
print('Elapsed learning {}'.format(str(elapsed_time)))

# get some accuracy
expected = y_test
predicted = classifier.predict(X_test)

print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))

cm = metrics.confusion_matrix(expected, predicted)
print("Confusion matrix:\n%s" % cm)

print("Accuracy={}".format(metrics.accuracy_score(expected, predicted)))

Start learning at 2017-04-13 11:08:54.496675
Stop learning 2017-04-13 11:08:54.498748
Elapsed learning 0:00:00.002073
15
Classification report for classifier SVC(C=5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.05, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

          N       0.00      0.00      0.00         9
          P       0.40      1.00      0.57         6

avg / total       0.16      0.40      0.23        15


Confusion matrix:
[[0 9]
 [0 6]]
Accuracy=0.4


  'precision', 'predicted', average, warn_for)
