In [None]:
!gcloud services enable ml.googleapis.com
!gcloud services enable compute.googleapis.com

In [123]:
import tensorflow as tf 
import pandas as pd
import numpy as np 

from google.cloud import bigquery
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing import text

client = bigquery.Client()

In [156]:
# Query Stack Overflow post data from BigQuery
sql = """
SELECT
    posts.body AS body,
    posts.tags AS tags
FROM
    `bigquery-public-data.stackoverflow.stackoverflow_posts` AS posts
LIMIT 10000
"""
df = client.query(sql).to_dataframe()

# Data Pre-processing

In [157]:
# Clean up dataframe
df.dropna(inplace=True)
df['body'].replace(regex=True, inplace=True, to_replace=r'\<[^\>]+\>', value=r'')
df['tags'] = df['tags'].str.split('|', expand=False)

In [158]:
df.head()

Unnamed: 0,body,tags
0,When you are somewhere between version 1 and v...,"[maintenance, patch, servicepacks, hotfix]"
1,I'm creating an ASP.NET MVC site and I need to...,"[asp.net-mvc, login, asp.net-membership, membe..."
2,We get The communication object System.Servi...,"[c#, wcf]"
3,i have the basic webserver hello world app for...,"[porting, node.js, forward-compatibility]"
4,I have an AdjacencyGraph&lt;string Edge&lt;str...,"[c#, algorithm, shortest-path, quickgraph]"


In [159]:
# Find the k most common tags
k = 5
all_tags = sum(df.tags, [])

from collections import Counter
c = Counter(all_tags)
top_tags = [key for key, val in c.most_common(k) if key != '']
num_tags = len(top_tags)
top_tags

['javascript', 'c#', 'java', 'php']

In [160]:
# Remove all except top <num_tags> tags
df['tags'] = df['tags'].apply(lambda row: [val for val in row if val in top_tags])
df = df[df['tags'].map(lambda row: len(row) > 0)]

In [161]:
df.head()

Unnamed: 0,body,tags
2,We get The communication object System.Servi...,[c#]
4,I have an AdjacencyGraph&lt;string Edge&lt;str...,[c#]
6,I have a DataGrid in my Silverlight applicatio...,[c#]
7,I'm implementing a secure WCF service. Authent...,[c#]
15,If I have a list like this: &lt;ul id= mylist ...,[javascript]


In [162]:
# Create one-hot encoding from tags column
mlb = MultiLabelBinarizer()
df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('tags')), 
                          columns=mlb.classes_, 
                          index=df.index))

In [163]:
df.head()

Unnamed: 0,body,c#,java,javascript,php
2,We get The communication object System.Servi...,1,0,0,0
4,I have an AdjacencyGraph&lt;string Edge&lt;str...,1,0,0,0
6,I have a DataGrid in my Silverlight applicatio...,1,0,0,0
7,I'm implementing a secure WCF service. Authent...,1,0,0,0
15,If I have a list like this: &lt;ul id= mylist ...,0,0,1,0


In [164]:
# Split data into train/test
df_train, df_test = train_test_split(df, test_size=0.2)

In [165]:
# Tokenize body text
VOCAB_SIZE = 800

tokenizer = text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(df_train['body'])

body_train = tokenizer.texts_to_matrix(df_train['body'])
body_test = tokenizer.texts_to_matrix(df_test['body'])

# The Model

In [166]:
def create_model(vocab_size, num_tags):
  model = tf.keras.models.Sequential()
  model.add(tf.keras.layers.Dense(50, input_shape=(vocab_size,), activation='relu'))
  model.add(tf.keras.layers.Dense(25, activation='relu'))
  model.add(tf.keras.layers.Dense(num_tags, activation='sigmoid'))

  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [167]:
model = create_model(VOCAB_SIZE, num_tags)
model.summary()

# Train and evaluate the model
model.fit(body_train, df_train[top_tags].as_matrix(), epochs=10, batch_size=128, validation_split=0.1)

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_19 (Dense)             (None, 50)                40050     
_________________________________________________________________
dense_20 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_21 (Dense)             (None, 4)                 104       
Total params: 41,429
Trainable params: 41,429
Non-trainable params: 0
_________________________________________________________________
Train on 1605 samples, validate on 179 samples
Epoch 1/10


  """


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f4478364320>

In [168]:
results = model.evaluate(body_test, df_test[top_tags].as_matrix(), verbose=0)

print("Test Score: ", results[0])
print("Test Accuracy: ", results[1])

Test Score:  0.2742780904064264
Test Accuracy:  0.8985426


  """Entry point for launching an IPython kernel.
