# 1.Introduction

This project is a subproject for the Kaggle competition `Two Sigma: Using News to Predict Stock Movements`.
Its task is to predict whether a given piece of information/news carries a positive or a negative connotation.

This model is intended to be used for predicting how a given piece of information from news data may affect a company's stock price.

# 2.Modules

## 2.1 NLTK modules for language processing 

In [155]:
import random
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import state_union, wordnet, stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer  # Stemming words e.g Writing -> Write
from nltk.tokenize import word_tokenize, sent_tokenize, PunktSentenceTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
# Models for training and testing the results
from sklearn.naive_bayes import GaussianNB

# `sklearn.cross_validation` was removed in scikit-learn 0.20; its contents
# live in `sklearn.model_selection` since 0.18. Fall back to the old module
# only on installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

## 2.2 Downloading NLTK submodules

In [190]:
# Fetch the NLTK resources into the local nltk_data directory.
# Each call returns a boolean; the notebook displays the value of the last one.
nltk.download('stopwords')
nltk.download('movie_reviews')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\JA\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\JA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# 3. Deep text preprocessing

## 3.1 Uploading dataset from file

In [195]:
# Read the tab-separated dataset and preview its first five rows.
df = pd.read_csv('./data/data.tsv', sep='\t')
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


## 3.2 SentimentAnalyst class doing preprocessing step

In [205]:
class SentimentAnalyst:
    """Turn a DataFrame of labelled texts into bag-of-words features.

    Parameters
    ----------
    df
        DataFrame holding the raw texts and their labels.
    feature_col
        Name of the column in ``df`` that contains the raw text.
    label_col
        Name of the column in ``df`` that contains the target labels.
    """

    def __init__(self, df, feature_col, label_col):
        self.data = df
        self.corpora = None  # cleaned texts, filled by preprocess_data()
        self.vectorizer = None  # fitted CountVectorizer, set by vectorize()
        self.feature_col = feature_col
        self.label_col = label_col

    def preprocess_data(self):
        """Remove unnecessary symbols, then lemmatize and stem every text.

        For each entry of the feature column: every non-letter character is
        replaced by a space, the text is lower-cased and split on whitespace,
        English stop words are dropped, and each remaining word is lemmatized
        and then stemmed.

        Returns
        -------
        list of str
            One cleaned, space-joined string per row. The same list is also
            stored in ``self.corpora``.
        """
        # `stopwords` is the NLTK corpus reader, not a collection of words;
        # the actual word list comes from `.words(...)`. A set gives O(1)
        # membership tests inside the loop.
        # NOTE(review): the English list presumably contains negations such as
        # "not", which matter for sentiment — consider keeping them.
        stop_words = set(stopwords.words('english'))
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()

        corpora = []
        # Iterate over the column's values rather than over range(len(...)),
        # so frames without a default 0..n-1 index work as well.
        for text in self.data[self.feature_col]:
            words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
            cleaned = [
                stemmer.stem(lemmatizer.lemmatize(word))
                for word in words
                if word not in stop_words
            ]
            corpora.append(' '.join(cleaned))

        self.corpora = corpora
        return corpora

    def vectorize(self, max_features):
        """Build bag-of-words count vectors from the preprocessed texts.

        Parameters
        ----------
        max_features
            Passed to ``CountVectorizer`` as ``max_features``.

        Returns
        -------
        tuple
            ``(X, Y)`` where ``X`` is the dense count matrix produced by the
            vectorizer and ``Y`` holds the values of the label column.
        """
        # Preprocess lazily so calling vectorize() first does not fail on
        # a missing corpus.
        if self.corpora is None:
            self.preprocess_data()

        # Keep the fitted vectorizer so unseen text can be transformed with
        # the same vocabulary later on.
        self.vectorizer = CountVectorizer(max_features=max_features)
        X = self.vectorizer.fit_transform(self.corpora).toarray()
        # Read the labels from the instance's own frame, not a global `df`.
        Y = self.data[self.label_col].values
        return X, Y

## 3.3 Analyst object

In [206]:
# Clean the 'Review' texts and encode them as count vectors limited to
# 600 features; Y receives the values of the 'Liked' column.
analyst = SentimentAnalyst(df, feature_col='Review', label_col='Liked')
analyst.preprocess_data()
X, Y = analyst.vectorize(max_features=600)

## 3.4 Splitting into train/test

In [216]:
# Hold out 15% of the samples as the test set; the fixed random_state makes
# the split reproducible between runs.
X_train, X_test, Y_train, Y_test  = train_test_split(X, Y, test_size = 0.15, random_state = 42)

# 4. Building Deep Learning model (Artificial Neural Network)

In [212]:
# NOTE(review): `y_pred` is not assigned in any cell visible here — the cell
# that fits the model and predicts on X_test appears to be missing; confirm
# before re-running the notebook top to bottom.
y_pred

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1], dtype=int64)

In [213]:
# Show the labels of the held-out test split for comparison with y_pred.
Y_test

array([1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0], dtype=int64)

In [214]:

# Confusion matrix of true labels (Y_test) against predictions (y_pred);
# per scikit-learn's convention rows are true classes, columns predicted ones.
cm = confusion_matrix(Y_test, y_pred)