# 0.Introduction to Natural Language Processing & Set Up

### Step 1

In [1]:
import numpy as np

# 3x5 integer matrix holding 0..14 in row-major order.
x = np.arange(15).reshape(3, 5)

### Step 2

In [2]:
# Mean over axis 0: one value per column.
np.mean(x, axis=0)

array([5., 6., 7., 8., 9.])

In [3]:
# Mean over axis 1: one value per row.
np.mean(x, axis=1)

array([ 2.,  7., 12.])

# 1.Get familiar with basic Python

### Step 0

In [4]:
# `//` and `*` share precedence and bind left to right, ahead of `+`:
# 32 // 4 -> 8, 8 * 4 -> 32, 1 + 32 -> 33.
1 + 32 // 4 * 4

33

### Step 2

In [5]:
def fib(n):
    """Return the n-th Fibonacci number, with fib(1) == fib(2) == 1.

    Args:
        n: Non-negative integer index into the sequence.

    Returns:
        The n-th Fibonacci number as an int; fib(0) is 0.

    Raises:
        ValueError: If n is negative.
    """
    if n < 0:
        raise ValueError('n must be a non-negative integer, got {}'.format(n))
    # Start from the pair (F(0), F(1)) so that no index needs a special case.
    a, b = 0, 1
    for _ in range(n):
        a, b = b, a + b
    return a

In [6]:
# fib works on plain Python ints, which are arbitrary precision, so this large
# value is printed exactly.
print(fib(101))

573147844013817084101


### Step 3

In [7]:
import json

In [8]:
# Parse the news dataset, which stores one JSON object per line.
categories = set()
docs = []

# Stream the file line by line instead of materialising it with readlines(),
# and pin the encoding so parsing does not depend on the platform default.
with open('News_Category_Dataset_v2.json', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at EOF
        doc = json.loads(line)
        docs.append(doc)
        categories.add(doc['category'])

print(len(categories))

# One [category, headline] row per document.
data = [[doc['category'], doc['headline']] for doc in docs]

# Enumerate the categories in sorted order. Iterating the set directly yields a
# different order in each interpreter session (string hash randomisation), which
# would make the integer label ids irreproducible across kernel restarts.
cat2int = {category: index for index, category in enumerate(sorted(categories))}
int2cat = {index: category for category, index in cat2int.items()}

41


# 3.Vector representation of text

### Step 1

In [9]:
import gensim.downloader as api
# Load the pretrained "word2vec-google-news-300" vectors through gensim's downloader.
# NOTE(review): presumably downloads and caches the model on first use — confirm.
word_vecs = api.load("word2vec-google-news-300")

In [10]:
import re
import numpy as np

def doc2vec(doc):
    """Embed a text as the mean of the word vectors of its known words.

    Every character that is not an ASCII letter is treated as a separator, so
    only runs of letters count as words. Words that are not present in
    ``word_vecs`` are ignored.

    Args:
        doc: The text to embed.

    Returns:
        A 1-D array holding the element-wise mean of the word vectors, or an
        empty array (``size == 0``) when no word of ``doc`` is known.
    """
    words = re.sub('[^a-zA-Z]', ' ', doc).split()
    # Membership tests and indexing on the vectors object itself work on both
    # gensim 3.x and 4.x; `.vocab` was removed and `.word_vec()` deprecated in 4.0.
    vecs = [word_vecs[word] for word in words if word in word_vecs]
    if not vecs:
        return np.array([])
    return np.mean(np.array(vecs), axis=0)

In [11]:
# Distance between the embeddings of "man" and "woman".
# NOTE(review): gensim documents this as cosine distance — confirm for the installed version.
word_vecs.distance("man", "woman")

0.2335987687110901

### Step 2

In [12]:
# Two headlines whose document embeddings are compared in the next cell.
sentence1 = "'National tragedy': Trump begins border wall construction in Unesco reserve."
sentence2 = "Trump administration enters new phase for border wall, sets ambitious timetable after securing land"

In [13]:
from scipy.spatial import distance

# Embed both headlines, then compare the two embeddings by cosine distance.
vector1, vector2 = map(doc2vec, (sentence1, sentence2))

distance.cosine(vector1, vector2)

0.3698856234550476

# 4.Train DNN as classifier
### Step 1

In [14]:
# Embed every headline; rows whose headline yields an empty embedding
# (no known word) are left out of both lists.
sentences = []
labels = []

for category, headline in data:
    embedding = doc2vec(headline)
    if embedding.size == 0:
        continue
    labels.append(cat2int[category])
    sentences.append(embedding)

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples as the test set; random_state=1 pins the split.
X_train, X_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.1, random_state=1)

### Step 2

In [16]:
from sklearn.neural_network import MLPClassifier

# One hidden layer of 1024 ReLU units, fitted on the training split.
clf = MLPClassifier(
    # architecture
    hidden_layer_sizes=(1024,),
    activation='relu',
    # optimisation
    learning_rate_init=0.001,
    alpha=0.001,
    batch_size=64,
    max_iter=100,
    # early stopping on a 10% validation fraction
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=5,
    # reproducibility / logging
    random_state=1,
    verbose=False,
).fit(X_train, y_train)

In [17]:
# Classifier score on the held-out split first, then on the training split.
for features, targets in ((X_test, y_test), (X_train, y_train)):
    print('{:.3f}'.format(clf.score(features, targets)))

0.573
0.660


In [18]:
def predict(sentence):
    """Predict the category of a piece of text with the trained classifier.

    Args:
        sentence: Text to classify.

    Returns:
        The predicted category, looked up in ``int2cat``.

    Raises:
        ValueError: If ``doc2vec`` returns an empty embedding for ``sentence``,
            so there is nothing to feed to the classifier.
    """
    vec = doc2vec(sentence)
    if vec.size == 0:
        # An empty embedding would otherwise only fail inside the classifier
        # with a far less direct error about the input shape.
        raise ValueError('no known words in sentence: {!r}'.format(sentence))
    # The classifier returns one label per sample; take the single element
    # instead of calling int() on the array, which NumPy >= 1.25 deprecates.
    label = clf.predict([vec])[0]
    return int2cat[int(label)]

# 5.Measure your model

### Step 0

In [19]:
# Column 0 is read as the predictions and column 1 as the reference labels.
# NOTE: this rebinds `labels`, which an earlier cell used for the training targets.
test_truth = np.genfromtxt('test_truth.txt')
preds, labels = test_truth[:, 0], test_truth[:, 1]

In [20]:
def accuracy(preds, labels):
    """Return the fraction of predictions that equal their reference label.

    Args:
        preds: Predicted labels (array or plain sequence).
        labels: Reference labels, same shape as ``preds``.

    Returns:
        The share of positions where ``preds`` and ``labels`` agree.

    Raises:
        ValueError: If ``preds`` and ``labels`` differ in shape.
    """
    # Convert first: comparing two plain lists with == yields a single bool
    # rather than an element-wise result, which would silently give a wrong score.
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    if preds.shape != labels.shape:
        raise ValueError(
            'preds and labels must have the same shape, got {} and {}'.format(
                preds.shape, labels.shape))
    return np.mean(preds == labels)

In [21]:
# Accuracy of the predictions loaded from test_truth.txt against their labels.
accuracy(preds, labels)

0.8

### Step 1

In [22]:
# Predicted label ids for the held-out test split.
preds_test = clf.predict(X_test)

### Step 2

In [23]:
# Accuracy of the classifier's test-split predictions, computed with the
# accuracy() helper and printed to 3 decimals.
print('{:.3f}'.format(accuracy(preds_test, y_test)))

0.573
