# *Supervised learning*

In [1]:
%%capture
# Include the line above to hide a cell's text output.

## Download sample 'sports' and 'world' articles sets, then unzip.

import os

os.chdir('/sharedfolder/')

!wget -N https://github.com/pcda17/pcda17.github.io/raw/master/week/11/nyt_world_11-19-2017.zip
!unzip -o nyt_world_11-19-2017.zip

!wget -N https://github.com/pcda17/pcda17.github.io/raw/master/week/11/nyt_sports_11-19-2017.zip
!unzip -o nyt_sports_11-19-2017.zip

In [2]:
## Loading 'world' articles as a list of strings

os.chdir('/sharedfolder/nyt_world_11-19-2017/')

nyt_world_texts = []

for filename in os.listdir('./'):
    text_data = open(filename).read().replace('\n', ' ')
    nyt_world_texts.append(text_data)

print(len(nyt_world_texts))

45


In [3]:
## Loading 'sports' articles as a list of strings

os.chdir('/sharedfolder/nyt_sports_11-19-2017/')

nyt_sports_texts = []

for filename in os.listdir('./'):
    text_data = open(filename).read().replace('\n', ' ')
    nyt_sports_texts.append(text_data)

print(len(nyt_sports_texts))

40


In [4]:
## Divide articles into training and test sets (reserving 8 articles for test set)

nyt_world_train = nyt_world_texts[:-8]
nyt_sports_train = nyt_sports_texts[:-8]

nyt_world_test = nyt_world_texts[-8:]
nyt_sports_test = nyt_sports_texts[-8:]

print('Training set lengths:')
print(len(nyt_world_train))
print(len(nyt_sports_train))

print() ## empty line

print('Test set lengths:')
print(len(nyt_world_test))
print(len(nyt_sports_test))

Training set lengths:
37
32

Test set lengths:
8
8


In [5]:
## Note that the '*' operator can be used to loop a list:

repeated_list = [0]*5

repeated_list

[0, 0, 0, 0, 0]

In [6]:
## And we use '+' to concatenate lists:

repeated_list_2 = [1]*10 + [0]*9 + ['j']*3

repeated_list_2

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 'j', 'j', 'j']

In [7]:
## Combing training data

combined_texts = nyt_world_train + nyt_sports_train

## Creating list of associated class values: 
## 0 for 'world', 1 for 'sports'

y = [0]*len(nyt_world_train) + [1]*len(nyt_sports_train)

In [8]:
## Creating vectorized training set using our combined sentence list

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

X = vectorizer.fit_transform(combined_texts)

X.shape

(69, 6891)

In [9]:
## Training a multinomial naive Bayes classifier
## X is a combined list of 'world' and 'sports' vectors
## y is a list of classes (0 or 1)

from sklearn.naive_bayes import MultinomialNB

naive_bayes_classifier = MultinomialNB().fit(X, y)

In [10]:
## Classifying 'world' test set

input_vector = vectorizer.transform(nyt_world_test) ## Converting a list of string to vector format established above

naive_bayes_classifier.predict(input_vector) ## Classifying each article in the list. '0' is correct.

array([1, 0, 0, 0, 0, 0, 0, 0])

In [11]:
## Classifying 'sports' test set

input_vector = vectorizer.transform(nyt_sports_test) ## Converting a list of string to vector format established above

naive_bayes_classifier.predict(input_vector) ## Classifying each article in the list. '1' is correct.

## We'll continue using this set of vectors in the cells below.

array([0, 1, 0, 1, 0, 1, 1, 1])

In [12]:
## k-nearest neighbor classifier

from sklearn.neighbors import KNeighborsClassifier

knn_classifier = KNeighborsClassifier(n_neighbors=5)

knn_classifier.fit(X, y)

knn_classifier.predict(input_vector)

array([0, 0, 0, 1, 0, 0, 0, 0])

In [13]:
## Logistic Regression classifier

from sklearn.linear_model import LogisticRegression

lr_classifier = LogisticRegression()

lr_classifier.fit(X, y)

lr_classifier.predict(input_vector)

array([0, 1, 0, 1, 0, 1, 1, 0])

In [14]:
## Support Vector Machine (SVM) classifer

from sklearn.svm import SVC

svm_classifier = SVC(kernel='linear')

svm_classifier.fit(X, y)

svm_classifier.predict(input_vector)

array([0, 1, 0, 1, 0, 0, 0, 0])

In [15]:
## Multi-layer perceptron classifier (a shallow neural network)

from sklearn.neural_network import MLPClassifier

mlp_classifier = MLPClassifier()

mlp_classifier.fit(X, y)

mlp_classifier.predict(input_vector)

array([0, 0, 1, 1, 0, 1, 1, 1])

### *Assignment *
    
    Write some code that downloads a live New York Times page and classifies it as 'world' or 'sports'.

In [16]:
%%capture

!apt-get -y install libxml2-dev libxslt-dev 
!pip3 install newspaper3k

In [17]:
## Using the 'newspaper' package to extract article text

from newspaper import Article

def url_to_article_text(url):
    article = Article(url)
    article.download()
    article.parse()
    article_text = article.text.replace('\n', ' ')
    return article_text

In [18]:
url = 'https://www.nytimes.com/2017/11/19/sports/patriots-beat-raiders-mexico.html'

article_text = url_to_article_text(url)

article_text[:2000]

'It gave a jolt of momentum to the Patriots (8-2), as if they needed it in a blowout that laid bare Oakland’s anemic defense. Oakland, trailing by 30-0, finally scored in the fourth quarter, when Amari Cooper scored a touchdown on a pass from Derek Carr.  Photo  The win improved the Patriots chances for the top seed in the A.F.C. playoffs. As the game wound down, thick storm clouds menaced, mirroring the increasingly gloomy postseason outlook for Oakland (4-6).  No matter.  The announced crowd of 77,000 treated this event like a Super Bowl nearly the entire time: The fans’ excitement was palpable; their face paint, jerseys and carousing inside and outside the stadium a testament to their anticipation for this game.  “If I were in Oakland, I would care more if they won or lost, but just that they are playing here, I cannot describe how that feels,” said Emilio Carreño, 28, a government accountant and Raiders fan. “To see the Raiders here in Azteca is something I will remember for a long

### *Solution*

In [19]:
## Classifying a list of articles 5 ways

urls = ['https://www.nytimes.com/2017/11/19/sports/patriots-beat-raiders-mexico.html', 
        'https://www.nytimes.com/2017/11/21/world/middleeast/syria-damascus-war.html',
        'https://www.washingtonpost.com/world/asia_pacific/with-technology-these-researchers-are-figuring-out-north-koreas-nuclear-secrets/2017/11/20/274d9786-c9e2-11e7-b244-2d22ac912500_story.html',
        'https://www.washingtonpost.com/news/football-insider/wp/2017/11/20/we-have-so-many-question-marks-redskins-starters-ailing-at-safety-on-both-lines/',
        'https://www.nytimes.com/2017/11/21/movies/mr-roosevelt-review-noel-wells.html',
        'https://www.washingtonpost.com/entertainment/music/silly-humor-of-dated-plot-underlines-joy-of-opera/2017/11/20/c5521c3c-ce1e-11e7-9d3a-bcbe2af58c3a_story.html']

for url in urls:
    article_text = url_to_article_text(url)     ## Get the article at a given URL as a string.
    input_vector = vectorizer.transform([article_text])[0] ## Vectorize article.
    print(url)                        # ^ Enclosing the string in a list, because that's what vectorizer.transform() expects. 
    print('naive Bayes: ' + str(naive_bayes_classifier.predict(input_vector)))
    print('k-nearest neighbors: ' + str(knn_classifier.predict(input_vector)))
    print('logistic regression: ' + str(lr_classifier.predict(input_vector)))
    print('support vector machine: ' + str(svm_classifier.predict(input_vector)))
    print('multi-layer perceptron classifier: ' + str(mlp_classifier.predict(input_vector)))
    print()

## Recall that '0' = 'world' and '1' = 'sports'
    

https://www.nytimes.com/2017/11/19/sports/patriots-beat-raiders-mexico.html
naive Bayes: [1]
k-nearest neighbors: [0]
logistic regression: [1]
support vector machine: [1]
multi-layer perceptron classifier: [1]

https://www.nytimes.com/2017/11/21/world/middleeast/syria-damascus-war.html
naive Bayes: [0]
k-nearest neighbors: [0]
logistic regression: [0]
support vector machine: [0]
multi-layer perceptron classifier: [0]

https://www.washingtonpost.com/world/asia_pacific/with-technology-these-researchers-are-figuring-out-north-koreas-nuclear-secrets/2017/11/20/274d9786-c9e2-11e7-b244-2d22ac912500_story.html
naive Bayes: [0]
k-nearest neighbors: [0]
logistic regression: [0]
support vector machine: [1]
multi-layer perceptron classifier: [0]

https://www.washingtonpost.com/news/football-insider/wp/2017/11/20/we-have-so-many-question-marks-redskins-starters-ailing-at-safety-on-both-lines/
naive Bayes: [1]
k-nearest neighbors: [1]
logistic regression: [1]
support vector machine: [1]
multi-layer

In [20]:
## Classifying a list of articles 5 ways

## If we define a list of classes (i.e., ['world', 'sports']), we can display class names instead of 0s and 1s.

urls = ['https://www.nytimes.com/2017/11/19/sports/patriots-beat-raiders-mexico.html', 
        'https://www.nytimes.com/2017/11/21/world/middleeast/syria-damascus-war.html',
        'https://www.washingtonpost.com/world/asia_pacific/with-technology-these-researchers-are-figuring-out-north-koreas-nuclear-secrets/2017/11/20/274d9786-c9e2-11e7-b244-2d22ac912500_story.html',
        'https://www.washingtonpost.com/news/football-insider/wp/2017/11/20/we-have-so-many-question-marks-redskins-starters-ailing-at-safety-on-both-lines/',
        'https://www.nytimes.com/2017/11/21/movies/mr-roosevelt-review-noel-wells.html',
        'https://www.washingtonpost.com/entertainment/music/silly-humor-of-dated-plot-underlines-joy-of-opera/2017/11/20/c5521c3c-ce1e-11e7-9d3a-bcbe2af58c3a_story.html']

for url in urls:
    article_text = url_to_article_text(url)     ## Get the article at a given URL as a string.
    input_vector = vectorizer.transform([article_text])[0] ## Vectorize article.
    classes = ['world', 'sports']
    print(url)                        # ^ Enclosing the string in a list, because that's what vectorizer.transform() expects. 
    print('naive Bayes: ' + str(classes[naive_bayes_classifier.predict(input_vector)[0]]))
    print('k-nearest neighbors: ' + str(classes[knn_classifier.predict(input_vector)[0]]))
    print('logistic regression: ' + str(classes[lr_classifier.predict(input_vector)[0]]))
    print('support vector machine: ' + str(classes[svm_classifier.predict(input_vector)[0]]))
    print('multi-layer perceptron classifier: ' + str(classes[mlp_classifier.predict(input_vector)[0]]))
    print()

https://www.nytimes.com/2017/11/19/sports/patriots-beat-raiders-mexico.html
naive Bayes: sports
k-nearest neighbors: world
logistic regression: sports
support vector machine: sports
multi-layer perceptron classifier: sports

https://www.nytimes.com/2017/11/21/world/middleeast/syria-damascus-war.html
naive Bayes: world
k-nearest neighbors: world
logistic regression: world
support vector machine: world
multi-layer perceptron classifier: world

https://www.washingtonpost.com/world/asia_pacific/with-technology-these-researchers-are-figuring-out-north-koreas-nuclear-secrets/2017/11/20/274d9786-c9e2-11e7-b244-2d22ac912500_story.html
naive Bayes: world
k-nearest neighbors: world
logistic regression: world
support vector machine: sports
multi-layer perceptron classifier: world

https://www.washingtonpost.com/news/football-insider/wp/2017/11/20/we-have-so-many-question-marks-redskins-starters-ailing-at-safety-on-both-lines/
naive Bayes: sports
k-nearest neighbors: sports
logistic regression: sp