Installations

In [None]:
# !pip install nltk
# !pip install pandas
# !pip install gensim
# !pip install contractions

Imports

In [1]:
import nltk
from nltk.util import ngrams
import pandas as pd
import gensim
import contractions
import pickle

Read & View Training Data

In [2]:
# Load the labelled news articles; the file is decoded as Windows-1252 (cp1252)
training_csv = "SA Training Data.csv"
df = pd.read_csv(training_csv, encoding='cp1252')

In [3]:
# Preview the first five rows of the training data
df.head(n=5)

Unnamed: 0,Stock,News,Sentiment
0,AMZN,Amazon.com Inc. (AMZN): Ken Fisher’s AI-Driven...,Optimistic
1,AMZN,Amazon launches drone delivery in Phoenix. Ama...,Optimistic
2,AMZN,Zacks Investment Ideas feature highlights: Ama...,Optimistic
3,TSLA,Tesla options draw 'euphoric' trading as Trump...,Optimistic
4,TSLA,Why Tesla Stock Keeps Going Up. Tesla (NASDAQ:...,Optimistic


Preprocess Training Data

In [4]:
# Initialize stop word list and lemmatizer
stop_list = nltk.corpus.stopwords.words('english')
lemmatizer = nltk.stem.WordNetLemmatizer()

# Membership is tested once per token, so use a set instead of scanning the
# whole stop word list each time (stop_list itself is kept unchanged)
stop_words = frozenset(stop_list)

# Sequence of sentiment labels, aligned by position with `corpus`
labels = []

# Preprocessed articles, where each article is a list of unigram and bigram tokens
corpus = []

# Only two columns are needed, so iterate over them directly rather than
# building a Series for every row with iterrows()
for label, text in zip(df['Sentiment'], df['News']):

    # Store the label into the list of labels
    labels.append(label)

    # Tokenize the text into words and lowercase them
    article = [w.lower() for w in nltk.word_tokenize(text)]

    # Drop stop words and every token that is not purely alphanumeric
    # (punctuation, but also hyphenated or apostrophe-bearing tokens)
    article = [w for w in article if w not in stop_words and w.isalnum()]

    # Expand contractions
    # NOTE(review): this runs per token after the isalnum() filter, so any
    # token containing an apostrophe has already been dropped and this step
    # likely has little effect. Expanding on the raw text before tokenizing
    # would be more effective, but the order must be changed in training and
    # prediction together so both produce the same features.
    article = [contractions.fix(w) for w in article]

    # Lemmatization
    article = [lemmatizer.lemmatize(w) for w in article]

    # Build the bigrams fully before extending, because ngrams() reads from
    # the same list that is being extended
    bigrams = [' '.join(pair) for pair in ngrams(article, 2)]
    article.extend(bigrams)

    # Store the preprocessed news article into the corpus
    corpus.append(article)

print('Finished reading news articles.')

Finished reading news articles.


Create a Dictionary & Vectorize News Articles

In [5]:
# Build the token -> integer id mapping over every preprocessed article
dictionary = gensim.corpora.Dictionary(corpus)

# Pair each article's feature set with its label. The feature set is a dict
# of {token id: 1}: doc2bow's per-token counts are discarded, so features
# only record presence, in the dict form NLTK's classifier takes as input.
labelled_training_data = [
    ({token_id: 1 for token_id, _count in dictionary.doc2bow(tokens)}, sentiment)
    for sentiment, tokens in zip(labels, corpus)
]

print('Finished preparing the training data.')

Finished preparing the training data.


Train Maximum Entropy Classifier Model

In [6]:
# Training configuration: iteration cap and the first algorithm in NLTK's list
numIterations = 100
algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]

# Fit the maximum entropy model on the labelled feature sets
entclassifier = nltk.MaxentClassifier.train(
    labelled_training_data,
    algorithm=algorithm,
    max_iter=numIterations,
)

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.09861        0.330
             2          -1.09026        1.000
             3          -1.08202        1.000
             4          -1.07388        1.000
             5          -1.06585        1.000
             6          -1.05792        1.000
             7          -1.05009        1.000
             8          -1.04236        1.000
             9          -1.03474        1.000
            10          -1.02721        1.000
            11          -1.01978        1.000
            12          -1.01244        1.000
            13          -1.00520        1.000
            14          -0.99806        1.000
            15          -0.99100        1.000
            16          -0.98404        1.000
            17          -0.97717        1.000
            18          -0.97038        1.000
            19          -0.96369        1.000
 

Save the Gensim Training Data Dictionary

In [7]:
# Persist the Gensim Dictionary: pickle it to bytes and write them to disk
with open('dictionary.pkl', 'wb') as out_file:
    out_file.write(pickle.dumps(dictionary))

Load the Gensim Training Data Dictionary

In [8]:
# Restore the Gensim Dictionary from disk.
# Unpickling can execute code embedded in the file, so only load a
# dictionary.pkl that this notebook (or another trusted source) produced.
with open('dictionary.pkl', 'rb') as in_file:
    dictionary_load = pickle.loads(in_file.read())

Save the Maximum Entropy Classifier Model

In [9]:
# Persist the trained classifier: pickle it to bytes and write them to disk
with open('maxent_sentiment_classifier.pkl', 'wb') as out_file:
    out_file.write(pickle.dumps(entclassifier))

Load the Classifier Model

In [10]:
# Restore the trained classifier from disk.
# Unpickling can execute code embedded in the file, so only load a model
# file that this notebook (or another trusted source) produced.
with open('maxent_sentiment_classifier.pkl', 'rb') as in_file:
    classifier_load = pickle.loads(in_file.read())

Predict Test News Articles

In [11]:
# Define a function to predict the sentiment of a new unseen article
def predict_sentiment(text, dictionary=None, classifier=None):
    """Predict the sentiment label of a single news article.

    The text is preprocessed with the same steps, in the same order, as the
    training articles (tokenize, lowercase, drop stop words and
    non-alphanumeric tokens, expand contractions, lemmatize, append bigrams),
    converted to a presence-only feature dict keyed by dictionary token id,
    and passed to the classifier.

    Args:
        text: Raw article text as a string.
        dictionary: Optional Gensim Dictionary used to map tokens to ids.
            Defaults to the notebook-level ``dictionary_load``.
        classifier: Optional trained classifier exposing
            ``classify(featureset)``. Defaults to the notebook-level
            ``classifier_load``.

    Returns:
        The label produced by ``classifier.classify`` for this article.
    """
    # Fall back to the objects loaded earlier in the notebook
    if dictionary is None:
        dictionary = dictionary_load
    if classifier is None:
        classifier = classifier_load

    # Stop words as a set: membership is tested once per token
    stop_words = frozenset(nltk.corpus.stopwords.words('english'))
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # Tokenize the text into words and lowercase them
    article = [w.lower() for w in nltk.word_tokenize(text)]

    # Drop stop words and every token that is not purely alphanumeric
    article = [w for w in article if w not in stop_words and w.isalnum()]

    # Expand contractions
    # NOTE(review): tokens containing an apostrophe were already removed by
    # the isalnum() filter, so this step likely has little effect. The order
    # is kept because it mirrors how the training articles were preprocessed;
    # change both places together if it is ever reordered.
    article = [contractions.fix(w) for w in article]

    # Lemmatization
    article = [lemmatizer.lemmatize(w) for w in article]

    # Build the bigrams fully before extending, because ngrams() reads from
    # the same list that is being extended
    bigrams = [' '.join(pair) for pair in ngrams(article, 2)]
    article.extend(bigrams)

    # Convert the article into a bag-of-words vector of (token id, count) pairs
    vector = dictionary.doc2bow(article)

    # Keep only token presence (counts are discarded), matching the feature
    # dicts the classifier was trained on
    article_as_dict = {token_id: 1 for token_id, _count in vector}

    # Predict the sentiment of the news article
    return classifier.classify(article_as_dict)

In [None]:
# Read one news article from standard input and show its predicted sentiment
text = input()
predicted_label = predict_sentiment(text)
print(predicted_label)