In [1]:
import sqlite3
import numpy as np
import pandas as pd
import nltk

from utils import process_article, build_freqs

In [2]:
nltk.download('stopwords')
conn = sqlite3.connect('Database\\bbcdb.sqlite')
cur = conn.cursor()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mtayl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
cur.execute("SELECT content FROM Article WHERE category_id=1") # business category
all_business = cur.fetchmany(350)
all_business = [all_business[i][0] for i in range(len(all_business))]

cur.execute("SELECT content FROM Article WHERE category_id=2") # entertainment category
all_entertainment = cur.fetchmany(350)
all_entertainment = [all_entertainment[i][0] for i in range(len(all_entertainment))]

cur.execute("SELECT content FROM Article WHERE category_id=3") # politics category
all_politics = cur.fetchmany(350)
all_politics = [all_politics[i][0] for i in range(len(all_politics))]

cur.execute("SELECT content FROM Article WHERE category_id=4") # sport category
all_sport = cur.fetchmany(350)
all_sport = [all_sport[i][0] for i in range(len(all_sport))]

cur.execute("SELECT content FROM Article WHERE category_id=5") # tech category
all_tech = cur.fetchmany(350)
all_tech = [all_tech[i][0] for i in range(len(all_tech))]

conn.close()

In [4]:
# split the data into two pieces, one for training and one for testing (validation set) 
test_bus = all_business[300:]
train_bus = all_business[:300]

test_ent = all_entertainment[300:]
train_ent = all_entertainment[:300]

test_pol = all_politics[300:]
train_pol = all_politics[:300]

test_spr = all_sport[300:]
train_spr = all_sport[:300]

test_tch = all_tech[300:]
train_tch = all_tech[:300]
print(len(test_tch))
train_x = train_bus + train_ent + train_pol + train_spr + train_tch 
test_x = test_bus + test_ent + test_pol + test_spr + test_tch

train_y = np.append(np.ones((len(train_bus), 1)), 2*np.ones((len(train_ent), 1)), axis=0)
train_y = np.append(train_y, 3*np.ones((len(train_pol), 1)), axis=0)
train_y = np.append(train_y, 4*np.ones((len(train_spr), 1)), axis=0)
train_y = np.append(train_y, 5*np.ones((len(train_tch), 1)), axis=0)

test_y = np.append(np.ones((len(test_bus), 1)), 2*np.ones((len(test_ent), 1)), axis=0)
test_y = np.append(test_y, 3*np.ones((len(test_pol), 1)), axis=0)
test_y = np.append(test_y, 4*np.ones((len(test_spr), 1)), axis=0)
test_y = np.append(test_y, 5*np.ones((len(test_tch), 1)), axis=0)

33


In [5]:
# Print the shape train and test sets
print("train_y.shape = " + str(train_y.shape))
print("test_y.shape = " + str(test_y.shape))

train_y.shape = (1500, 1)
test_y.shape = (233, 1)


In [6]:
# create frequency dictionary
freqs = build_freqs(train_x, train_y)

# check the output
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 38963


In [7]:
# test the preprocessing function
print('This is an example of a business article: \n', train_x[0])
print('\nThis is an example of the processed version of the article: \n', process_article(train_x[0]))

This is an example of a business article: 
 Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to s

In [8]:
def sigmoid(z): 
    '''
    Input:
        z: is the input (can be a scalar or an array)
    Output:
        h: the sigmoid of z
    '''
    # Calculate the sigmoid of z
    h = 1/(1 + np.exp(-z))
    
    return h

In [9]:
def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Input:
        x: matrix of features which is (m, n+1)
        y: corresponding labels of the input matrix x, dimensions (m, 1)
        theta: weight vector of dimension (n+1, 1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the final cost
        theta: your final weight vector
    Hint: you might want to print the cost to make sure that it is going down.
    '''
    # Get 'm', the number of rows in matrix x
    m = x.shape[0]
    
    for i in range(0, num_iters):
        
        # Get z, the dot product of x and theta
        z = np.dot(x, theta)
        
        # Get the sigmoid of z
        h = sigmoid(z)
        
        # Calculate the cost function
        J = (-1./m) * (np.dot(y.T, np.log(h)) + np.dot((1-y).T, np.log(1-h)))

        # Update the weights theta
        theta = theta - (alpha/m)*(np.dot(x.T, (h-y)))
        
    J = float(J)
    return J, theta

In [10]:
def extract_features(article, freqs, num_labels):
    '''
    Input: 
        article: a list of words for one article
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
        num_labels: number of categories
    Output:
        x: a feature vector of dimension (1, num_labels+1)
    '''
    # process_article tokenizes, stems, and removes stopwords
    word_l = process_article(article)
    
    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, num_labels+1)) 
    
    #bias term is set to 1
    x[0,0] = 1
    
    # loop through each word in the list of words
    for word in word_l:
        
        # increment the word count for the business label 1
        x[0,1] += freqs.get((word, 1), 0)
        
        # increment the word count for the entertainment label 2
        x[0,2] += freqs.get((word, 2), 0)
        
        # increment the word count for the politics label 3
        x[0,3] += freqs.get((word, 3), 0)
        
        # increment the word count for the sport label 4
        x[0,4] += freqs.get((word, 4), 0)
        
        # increment the word count for the tech label 5
        x[0,5] += freqs.get((word, 5), 0)
        
    assert(x.shape == (1, num_labels+1))
    return x

In [11]:
def oneVsAll(X, y, num_labels, learning_rate, num_iter):
    all_theta = np.zeros((num_labels, num_labels+1))
    all_J = np.zeros((num_labels, 1))
    
    for c in range(1, num_labels+1):
        temp_J, temp_theta = gradientDescent(X, (Y==c), np.zeros((num_labels+1, 1)), learning_rate, num_iter)
        
        all_J[c-1, 0] = temp_J
        all_theta[c-1, :] = np.squeeze(temp_theta)
        
    return (all_J, all_theta)

In [12]:
# Collect the features 'x' and stack them into a matrix 'X'
X = np.zeros((len(train_x), 6))
for i in range(len(train_x)):
    X[i, :] = extract_features(train_x[i], freqs, 5)

# training labels corresponding to X
Y = train_y

# Apply gradient descent
J, theta = oneVsAll(X, Y, 5, 1e-9, 1500)
print(f"The costs after training is ", J)
print(f"The resulting matrix of weights is ", theta)

The costs after training is  [[0.15024914]
 [0.17839946]
 [0.1555125 ]
 [0.11904593]
 [0.13080761]]
The resulting matrix of weights is  [[-4.24621251e-08  5.34446703e-04 -2.44242869e-04 -1.05390548e-04
  -2.32662137e-04 -1.44344486e-04]
 [-1.89568030e-08 -1.56818655e-04  4.98474717e-04 -5.51410419e-05
  -2.09423430e-04 -1.34596644e-04]
 [-6.67037504e-08 -2.22835000e-04 -2.25295621e-04  4.50078565e-04
  -2.20152574e-04 -1.84292229e-04]
 [ 9.37159550e-09 -1.53409589e-04 -1.84699714e-04 -6.53871960e-05
   5.48781825e-04 -1.45497989e-04]
 [-8.50192179e-08 -2.20014681e-04 -2.43973098e-04 -1.71007979e-04
  -2.71888083e-04  4.89575345e-04]]


In [13]:
def predict_article(article, freqs, theta):
    '''
    Input: 
        tweet: a string
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
        theta: (5,6) vector of weights
    Output: 
        y_pred: the probability of a tweet being positive or negative
    '''
    
    # Extract the features of the tweet and store it into x
    x = extract_features(article, freqs, 5)
    
    # Make the prediction using x and theta
    y_pred = sigmoid(np.dot(x, theta.T))
    
    return np.argmax(y_pred)+1

In [14]:
my_article = "Tory leader Michael Howard has gone on the offensive in response to people questioning how a son of immigrants can propose asylum quotas. Mr Howard, whose parents fled the Nazi threat to come to the UK, says the claim would mean no-one from an immigrant family could become premier. His comments come in a BBC documentary called 'No More Mr Nasty'. TV presenter Anne Robinson said as home secretary he gave the impression he would \"like to kick your cat\". Ms Robinson, a friend of the Tory leader, also revealed that as a Cambridge student Mr Howard was \"much loved by women and he was a courteous and kind and rather dashing lover\" - although she denied having personal experience. \"I wasn't at Cambridge - and it's not personal experience - but I know people who were.\" Documentary maker Michael Cockerell was given behind-the-scenes access to Mr Howard for his film portrait. The Tory leader was asked about to respond to people who said that if there had there been a quota on immigration and asylum in the 1930s, his parents might not have been allowed into the country. He replies: \"What is the inference of that? \"That if you reach the view that you need to control immigration in the interests of the country you're not allowed to put a view forward if you happen to be descended from immigrants? \"That seems to me an absolutely extraordinary proposition? It would certainly mean no one from immigrant parents could be prime minister.\" Ms Robinson, who presents The Weakest Link tells Cockerell that she despaired at his hardline image when he was home secretary in John Major's government. \"I used to have to sit on my hands because he'd get on television and give a passable impression of someone who'd like to kick your cat or would put your baby in prison if he cried. I mean it was very, very Draconian.\" The film shows Mr Howard laughing at Rory Bremner's impression of him as Dracula, which he calls \"good fun\", apart from the serious falsehood of a comment suggesting he wants fewer black people in the UK. The film shows the private side of the Tory leader watching television at home or playing table tennis with his wife, ex-model Sandra. Asked if she enjoys a game of ping pong she confesses: \"Yeah, it would be more enjoyable if I could win occasionally too, but otherwise it's quite fun.\" Former Downing Street communications chief Alastair Campbell, now working on Labour's election campaign, says a \"touchy-feely\" image does not fit Mr Howard. He says Tony Blair was not worried by his opponents' early performance in their Commons clashes because Mr Howard lacked a \"big strategy\", including on issues like Iraq. The Tory leader brands such criticisms as \"absolutely rubbish\", arguing that he has been consistent on his support for the war but critical of Mr Blair's failure to tell the truth on intelligence. Former Tory chancellor Ken Clarke says Mr Howard has a bigger problem changing perceptions of the Tory party than his personal image. Mr Clarke says the party is improving and it is \"conceivable\" it could win the next election. But he adds: \"It has got to change itself a bit and broaden its appeal.\" - Michael Howard: No More Mr Nasty is being shown on BBC2 on Saturday 12 February at 2005 GMT."
predict_article(my_article, freqs, theta)

3

In [15]:
def test_logistic_regression(test_x, test_y, freqs, theta):
    """
    Input: 
        test_x: a list of articles
        test_y: (m, 1) vector with the corresponding labels for the list of articles
        freqs: a dictionary with the frequency of each pair (or tuple)
        theta: weight vector of dimension (3, 1)
    Output: 
        accuracy: (# of tweets classified correctly) / (total # of articles)
    """
    
    # The list for storing predictions
    y_hat = []
    
    for article in test_x:
        # Get the label prediction for the article
        y_pred = predict_article(article, freqs, theta)
        
        y_hat.append(y_pred)
        
    # With the above implementation, y_hat is a list, but test_y is (m,1) array
    # Convert both to one-dimensional arrays in order to compare them using the '==' operator
    accuracy = np.mean(np.asarray(y_hat) == np.squeeze(test_y))
    
    return accuracy

In [16]:
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.9742


In [17]:
# Some error analysis done here
print('Label Predicted Article')
for x,y in zip(test_x,test_y):
    y_hat = predict_article(x, freqs, theta)

    if np.abs(y - y_hat) > 0:
        print('THE ARTICLE IS:\n', x)
        print('Label: %d\tGuess: %d\n\n' % (y, y_hat))

Label Predicted Article
THE ARTICLE IS:
 The Brazilian government has played down claims that it could step in to save the country's biggest airline.

Brazil's airport authority chief Carlos Wilson had claimed the government was on the brink of stepping in to save Varig, Brazil's flagship airline. However, the country's vice president Jose Alencar has said the government still is looking for a solution. Varig is struggling under a huge debt burden of an estimated debt of 6.5 billion reais ($2.3bn or Â£1.2bn). Asked whether a rescue was on the cards following a meeting of the country's Congress to discuss the airline's crisis, Mr Alencar replied: "No, I don't think so. We will see."

Earlier, Mr Wilson had said that president Luiz Inacio Lula da Silva has decided to step in and a decree of some kind of intervention could be signed this week. "In practice, it will be an intervention, although this is not the technical name used", he said. An intervention means that the government would t

THE ARTICLE IS:
 The net's self-declared spam king is seeking bankruptcy protection.

Scott Richter, the man behind OptInRealBig.com and billions of junk mail messages, said lawsuits had forced the company into Chapter 11. OptInRealBig was fighting several legal battles, most notably against Microsoft, which is pushing for millions of dollars in damages. The company said filing for Chapter 11 would help it try to resolve its legal problems but still keep trading.

Listed as the third biggest spammer in the world by junk mail watchdog Spamhaus, OptInRealBig was sued in December 2003 for sending mail messages that violated anti-spam laws. The lawsuit was brought by Microsoft and New York attorney general Eliot Spitzer who alleged that Mr Richter and his accomplices sent billions of spam messages through 514 compromised net addresses in 35 countries. According to Microsoft the messages were sent via net addresses owned by the Kuwait Ministries of Communication and Finance, several Korean 

In [20]:
my_article = '''
(CNN)The Oscar nominations reflect the breadth of quality work produced and released in what was by any measure a difficult, terrible year for the movie industry. Viewers can now flock to the streaming services of their choice to see whether they think the voters got it right.

Therein lies the challenge for the producers of this year's awards, after a stretch that has already seen the Emmys and Golden Globes sink to record-low ratings. It has left producers of the ceremony in what looks like a difficult bind -- with no theatrical blockbusters to help drive interest in the ceremony -- but also with an opportunity, if they embrace the freedom that should come with diminished viewership expectations and having little to lose.
In the past, the Academy of Motion Picture Arts and Sciences, which presents the awards, has collectively exhibited mixed feelings about Netflix's role in the movie industry's marquee event, unable to clearly decide if streaming was friend or foe. The effects of the pandemic -- and the related shift toward home viewing -- essentially pushed the Academy into Netflix's waiting arms.
All told, the service amassed a best-ever 35 nominations, more than any other single entity. That included two of the eight best picture nominees, a pair of historical dramas in "Mank" and "The Trial of the Chicago 7," and six of the 20 acting bids.
Netflix, of course, wasn't alone, with streamers like Amazon (with a dozen total), Hulu and Disney+ all leaving their mark on the roster of contenders. In some instances, that recognition came for movies that were redirected from planned theatrical release to streaming, such as Hulu's "Nomadland" and Disney+'s "Soul" and "Mulan."
What that means for the Oscars in practical terms is unclear. Ratings for the awards have been trending downward due to a host of factors, but the off-a-cliff decline for the Golden Globes has only fed the sense that the appetite for such events has been greatly diminished by a year that shut movie theaters and blunted the celebratory aspects of live events.
The Emmys, notably, charted a path for handing out awards despite coronavirus restrictions, a formula that the Globes largely botched. The Grammys recaptured some of the live experience with their performance-oriented ceremony on Sunday night.
The Oscars have also outlined plans for a more in-person format, but even with the awards delayed until late April, the kind of massive gathering staged in the past won't be possible. And while viewers can access most nominees online, the fragmentation of the audience -- and diminished rooting interest that comes with it -- has been exacerbated by everything that has transpired since "Parasite" made history at the 2020 Oscars, in what feels like much longer than 13 months ago.
Award shows, of course, exist for multiple reasons, which in their highest calling involves celebrating and encouraging admirable work. But they are also commercial endeavors, with practical implications built around getting people to watch, and broadcast fees that support the organizations behind them.
The Grammys seemingly found a sweet spot, creatively speaking, in a more intimate ceremony that still showcased the nominees, which could point the way in terms of a workable model. But if the Academy and host network ABC haven't already resigned themselves to disappointing numbers even relative to even last year's record-low Oscars, perhaps they should. (Ratings for the Grammys declined about 53% between 2020 and 2021.)
In a year where the Oscars have become mostly about streaming, there's not much left to do but go with the flow.
'''
pred = predict_article(my_article, freqs, theta)
cats = ['Placeholder', 'Business', 'Entertainment', 'Politics', 'Sport', 'Technology']
print(cats[pred])

Entertainment
