## Part A Text preprocessing

In [2]:
import json
import re
import nltk
import time
import math
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
content = []
lem = []
story_list = []
trainToken = []
testToken = []
wholeToken = []
start_time = time.time()

# define a pos tag function
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
#read the raw file and put every content in a list named 'content' after lowercasing
with open('signal-news1.jsonl','r') as f:
    for line in f.readlines():
        d = json.loads(line)
        content.append(d['content'].lower())

for i in range(len(content)):
    #a.1 Remove URLs e.g."http://www.", "https://www.",""
    content[i] = re.sub(r'https?://[^\s]+', '', content[i])
    #a.1 Remove all non-alphanumeric characters except spaces
    content[i] = re.sub(r'[^a-z0-9 ]+', '', content[i])
    #a.1 Remove words with only 1 character.
    content[i] = re.sub(r'\b[a-z]\b', '',content[i])
    #a.1 Remove numbers that are fully made of digits
    content[i] = re.sub(r'\b[0-9]+\b', '',content[i])
    #a.2 Convert to tokens
    #tokens.append(nltk.word_tokenize(content[i]))   
    tokens = nltk.word_tokenize(content[i])
    # tokens of first 16000 rows 
    if i < 16000:
        for j in range(len(tokens)):
            trainToken.append(tokens[j])
    # tokens of rest rows
    if i >= 16000:
        for j in range(len(tokens)):
            testToken.append(tokens[j])
    # tokens of whole rows
    for j in range(len(tokens)):
        wholeToken.append(tokens[j])
    #a.2 Lemmatization and pos tagging
    tag = nltk.pos_tag(tokens)
    wnl = nltk.WordNetLemmatizer()
    story = []
    story_dict = {}
    for j in tag:
        lemma = wnl.lemmatize(j[0],get_wordnet_pos(j[1]))
        lem.append(lemma)
        story.append(lemma)
    story_dict['content'] = story
    story_list.append(story_dict)

## Part B N-grams

In [3]:
#b.1 Compute N (number of tokens) and V (vocabulary size).
# number of tokens
N = len(lem)
print("N = ",N)
# vocabulary size
V = len(set(lem))
print("V = ",V)

#b.2 List the top 25 trigrams based on the number of occurrences on the entire corpus.
#list of trigrams
trigrams = list(nltk.trigrams(lem))
# the frequency of trigrams
freq_trigrams = nltk.FreqDist(trigrams)
#top 25 trigrams
print("top 25 trigrams :",freq_trigrams.most_common(25))

#b.3 freqency of positive & negtive words
#open positive words file
positive_list = []
with open('positive-words.txt','r') as positive_file:
    for line in positive_file:
        line=line.strip('\n')
        positive_list.append(line)
positive_set = set(positive_list)
#open negative words file
negative_list = []
with open('negative-words.txt','r') as negative_file:
    for line in negative_file:
        line=line.strip('\n')
        negative_list.append(line)
negative_set = set(negative_list)
# count positive & negative words in all 'content'
n_positive = 0
n_negative = 0
for word in lem:
    if word in positive_set:
        n_positive += 1
    if word in negative_set:
        n_negative += 1
print("the number of positive words:",n_positive)
print("the number of negative words:",n_negative)    

#b.4 positive story OR negative story
n_posiStory = 0
n_negStory = 0
for i in range(len(story_list)):
    n_positive = 0
    n_negative = 0
    for word in story_list[i]['content']:
        # count the positive words in one content
        if word in positive_set:
            n_positive += 1
        # count the negative words in one content
        if word in negative_set:
            n_negative += 1
    #count the number of positive story and negative story 
    if n_positive > n_negative:
        n_posiStory += 1
    if n_positive < n_negative: 
        n_negStory += 1
print("the number of news stories with more positive than negative words:",n_posiStory)
print("the number of news stories with more negative than positive words",n_negStory)     

N =  5690185
V =  128765
top 25 trigrams : [(('one', 'of', 'the'), 2428), (('on', 'share', 'of'), 2095), (('on', 'the', 'stock'), 1566), (('as', 'well', 'a'), 1415), (('in', 'research', 'report'), 1415), (('in', 'research', 'note'), 1373), (('be', 'able', 'to'), 1267), (('for', 'the', 'quarter'), 1221), (('the', 'united', 'state'), 1216), (('average', 'price', 'of'), 1193), (('research', 'report', 'on'), 1177), (('research', 'note', 'on'), 1138), (('the', 'end', 'of'), 1134), (('share', 'of', 'the'), 1133), (('in', 'report', 'on'), 1124), (('earnings', 'per', 'share'), 1119), (('cell', 'phone', 'plan'), 1073), (('phone', 'plan', 'detail'), 1070), (('accord', 'to', 'the'), 1046), (('buy', 'rating', 'to'), 1016), (('of', 'the', 'company'), 1002), (('appear', 'first', 'on'), 994), (('day', 'move', 'average'), 993), (('price', 'target', 'on'), 981), (('be', 'one', 'of'), 969)]
the number of positive words: 176067
the number of negative words: 142801
the number of news stories with more pos

## Part C Language models

In [4]:
#c.1 Compute language models for trigrams (first16000)
#trigrams in training set
train_trigrams = list(nltk.trigrams(trainToken))
#bigram in training set
train_bigrams = list(nltk.bigrams(trainToken))
example_list = ['is','this']
#produce a 10-word sentence beginning with'is' 'this'
for j in range(8):
    triwordlist = []
    #find all trigrams beginning with specified bigram
    for i in range(len(train_trigrams)):
        if train_bigrams[i][0] == example_list[j] and train_trigrams[i][1] == example_list[j+1]:
            triwordlist.append(train_trigrams[i])
    #freqency of trigrams beginning with specified bigram
    freq_exampleTrigrams = nltk.FreqDist(triwordlist)
    #find the highest freqency of trigrams beginning with specified bigram
    nextTrigrams = freq_exampleTrigrams.max()
    #add the next word in the sentence
    nextword = nextTrigrams[2]
    example_list.append(nextword)
print(example_list)


#c.2 Compute the perplexity by evaluating on the remaining rows of the corpus (rows 16,001+).
#frequency dictionary of trigrams in training set
train_trigrams_dict = dict(nltk.FreqDist(train_trigrams))
#frequency dictionary of bigrams in training set
train_bigrams_dict = dict(nltk.FreqDist(train_bigrams))
#trigrams in testing set
test_trigrams = list(nltk.trigrams(testToken))
#bigram in testing set
test_bigrams = list(nltk.bigrams(testToken))    
#bigram in whole corpus
whole_bigrams = list(nltk.bigrams(wholeToken)) 
#the size of whole corpus bigrams
V_bi = len(set(whole_bigrams))
#number of trigrams
N_tri = len(test_trigrams)
sum_logp = 0 
for i in range(len(test_trigrams)):
    if test_trigrams[i] in train_trigrams_dict:
        c_tri = train_trigrams_dict[test_trigrams[i]]
    else:
        c_tri = 0
    if test_trigrams[i][:2] in train_bigrams_dict:
        c_bi = train_bigrams_dict[test_trigrams[i][:2]]
    else:
        c_bi = 0
    # Laplace Smoothing add-one estimate
    p = (c_tri + 1) / (c_bi + V_bi)
    logp = math.log(p)
    sum_logp += logp
# compute the perplexity 
ppw = math.exp(-(sum_logp / (N_tri)))
print("ppw =",ppw)
# running time
end_time = time.time()    
print("time = ",end_time - start_time)

['is', 'this', 'the', 'company', 'has', 'market', 'capitalization', 'of', 'billion', 'and']
ppw = 661302.4545612346
time =  314.2469050884247
